CIS490/training/dashboard/feeder.py

"""Real producers that wire the receiver's on-disk state to the
dashboard message bus.

Four feeders, all started by ``app.lifespan``:

  - ``watch_index_jsonl`` — tails ``/var/lib/cis490/index.jsonl`` and
    publishes one ``episode`` event per new line. Survives file
    rotation by tracking inode.
  - ``snapshot_loop`` — periodically derives ground-truth from disk
    (per-host episode counts, total counts, alert count, recent
    episodes) and updates the broadcaster's persistent ``state`` so
    reconnecting clients see warm numbers, not zero.
  - ``watch_alerts_jsonl`` — same as the index tailer but for the
    receiver's alerts log.
  - ``phase_mix_loop`` — on a slow timer, samples episode tarballs
    under ``episodes/`` and aggregates the phase durations found in
    their ``labels.jsonl`` into ``state["phase_mix"]``.

If a path doesn't exist (e.g. ``health/`` on a fresh deploy) the
feeder logs once and keeps polling — it'll start producing events the
moment the path appears.
"""
from __future__ import annotations

import asyncio
import json
import logging
import os
import random
import subprocess
import tarfile
from collections import deque
from pathlib import Path
from typing import Any, Awaitable, Callable


# Module-wide logger shared by every feeder in this file.
log = logging.getLogger("cis490.dashboard.feeder")

# Default value of ``start_feeders(data_root=...)``; ``index.jsonl``,
# ``alerts.jsonl`` and ``episodes/`` are all resolved relative to it.
DEFAULT_DATA_ROOT = Path("/var/lib/cis490")

# Signature of the async publish callback the tailers push events into:
# takes one message dict, resolves to an int.
# NOTE(review): the meaning of the returned int is not visible in this
# file (presumably a delivery count) — confirm against the broadcaster.
PublishFn = Callable[[dict[str, Any]], Awaitable[int]]


# ─────────────────────────────────────────────────────────────────────
# Tail helpers
# ─────────────────────────────────────────────────────────────────────

async def _tail_jsonl(
    path: Path,
    publish: PublishFn,
    parse: Callable[[dict], dict | None],
    *,
    poll_interval: float = 1.0,
    label: str = "tail",
) -> None:
    """Tail an append-only JSONL file forever, publishing each new record.

    The file is polled every ``poll_interval`` seconds (five times that
    while it does not exist). Every complete, newline-terminated line is
    decoded as JSON, shaped by ``parse`` and — unless ``parse`` returns
    ``None`` — awaited through ``publish``.

    Guarantees:
      - A record caught mid-write is never split: the unterminated tail
        of a read is buffered until its newline arrives.
      - The first open (and any reopen after an error) starts at the
        end of the file, so history is not replayed.
      - Rotation (the path now points at a different inode) drains what
        is left of the old file, then reads the new one from offset 0 so
        lines written before we noticed are not lost.
      - In-place truncation (size shrank below our offset) restarts
        reading from offset 0.
      - Malformed JSON and non-object JSON lines are logged and skipped.

    Args:
        path: JSONL file to follow; it may not exist yet.
        publish: async callback receiving each shaped record.
        parse: maps a decoded JSON object to the dict to publish, or
            ``None`` to drop the record.
        poll_interval: seconds to sleep between reads.
        label: prefix used in log messages.

    Raises:
        asyncio.CancelledError: propagated so the task can be cancelled.
        Any other exception is logged and the file is reopened after 5 s.
    """
    fd = None
    inode: int | None = None
    pending = b""            # unterminated tail of the previous read
    start_at_end = True      # False only right after a detected rotation
    missing_logged = False

    async def emit(chunk: bytes) -> None:
        """Publish every complete line in ``pending + chunk``."""
        nonlocal pending
        # Split on "\n" only: str.splitlines() would also break on
        # U+2028 & friends, which may legally appear inside JSON strings.
        *complete, pending = (pending + chunk).split(b"\n")
        for raw in complete:
            text = raw.decode("utf-8", errors="replace").strip()
            if not text:
                continue
            try:
                rec = json.loads(text)
            except json.JSONDecodeError:
                log.warning("[%s] skipping malformed line: %r", label, text[:120])
                continue
            if not isinstance(rec, dict):
                # ``parse`` expects an object; a bare scalar/array would
                # make it raise and cost us the rest of the chunk.
                log.warning("[%s] skipping non-object line: %r", label, text[:120])
                continue
            out = parse(rec)
            if out is not None:
                await publish(out)

    while True:
        try:
            try:
                st = path.stat()
            except (FileNotFoundError, NotADirectoryError):
                if not missing_logged:
                    log.info("[%s] %s does not exist; will retry", label, path)
                    missing_logged = True
                await asyncio.sleep(poll_interval * 5)
                continue
            missing_logged = False

            if fd is not None and inode != st.st_ino:
                # Rotated. The old file is final, so flush whatever is
                # left in it (the extra newline also flushes an
                # unterminated last line), then switch files.
                await emit(await asyncio.to_thread(fd.read) + b"\n")
                try:
                    fd.close()
                except Exception:
                    pass
                fd = None
                pending = b""
                start_at_end = False

            if fd is None:
                fd = path.open("rb")
                # Take the inode from the handle we actually hold, not
                # from the earlier stat(), to avoid a rename race.
                inode = os.fstat(fd.fileno()).st_ino
                if start_at_end:
                    fd.seek(0, os.SEEK_END)
                start_at_end = True
                log.info("[%s] watching %s (inode=%d, starting at %d bytes)",
                         label, path, inode, fd.tell())
            elif st.st_size < fd.tell():
                # Same inode but shorter than our offset: truncated in
                # place. Everything now in the file is new to us.
                log.info("[%s] %s was truncated; rereading from start", label, path)
                fd.seek(0)
                pending = b""

            await emit(await asyncio.to_thread(fd.read))
            await asyncio.sleep(poll_interval)
        except asyncio.CancelledError:
            raise
        except Exception:
            log.exception("[%s] error; reopening in 5s", label)
            if fd is not None:
                try:
                    fd.close()
                except Exception:
                    pass
                fd = None
            inode = None
            pending = b""
            start_at_end = True
            await asyncio.sleep(5)


# ─────────────────────────────────────────────────────────────────────
# Specific feeders
# ─────────────────────────────────────────────────────────────────────

async def watch_index_jsonl(broadcaster, path: Path) -> None:
    """Tail the episode ingest log and mirror it into live state.

    Every new index line that carries both ``episode_id`` and
    ``host_id`` becomes one ``episode`` event. As a side effect of
    shaping the event, ``broadcaster.state`` is updated in step:
    the record is pushed onto the front of ``recent_episodes`` (capped
    at ``RECENT_EPISODES_LIMIT``), ``total_bytes`` grows by the record's
    numeric ``size_bytes`` and ``total_episodes`` by one — so clients
    that reconnect get warm history without index.jsonl being re-read.
    """
    def to_event(rec: dict) -> dict | None:
        # Lines missing either identifier are not episodes; drop them.
        if not ("episode_id" in rec and "host_id" in rec):
            return None

        episode = {
            key: rec.get(key)
            for key in ("episode_id", "host_id", "sha256", "size_bytes")
        }
        episode["received_at"] = rec.get("received_at_wall")

        state = broadcaster.state

        # Newest-first ring buffer, trimmed in place.
        window = state.setdefault("recent_episodes", [])
        window.insert(0, episode)
        if len(window) > RECENT_EPISODES_LIMIT:
            del window[RECENT_EPISODES_LIMIT:]

        # Running totals.
        size = episode["size_bytes"]
        if isinstance(size, (int, float)):
            state["total_bytes"] = int(state.get("total_bytes", 0)) + int(size)
        state["total_episodes"] = int(state.get("total_episodes", 0)) + 1

        event = {"type": "episode"}
        event.update(episode)
        return event

    await _tail_jsonl(path, broadcaster.publish, to_event, label="index.jsonl")


async def watch_alerts_jsonl(publish: PublishFn, path: Path) -> None:
    """Tail the alerts log, publishing one ``alert`` event per record.

    Each record is reshaped into the event layout below; fields absent
    from the record are published as ``None``. No record is filtered.
    """
    # (key in the published event, key in the on-disk alert record)
    field_map = (
        ("host_id", "host"),
        ("symptom", "symptom"),
        ("detail", "detail"),
        ("suggested_fix", "suggested_fix"),
        ("detected_at", "detected_at_wall"),
        ("dedup_key", "dedup_key"),
    )

    def to_event(rec: dict) -> dict | None:
        event = {"type": "alert"}
        for out_key, src_key in field_map:
            event[out_key] = rec.get(src_key)
        return event

    await _tail_jsonl(path, publish, to_event, label="alerts.jsonl")


def _count_lines(p: Path) -> int:
    """Return the number of non-blank lines in ``p`` (0 if unreadable).

    Cheap bootstrap total. Blank lines are ignored so the figure agrees
    with the tailers, which skip them too. A record caught mid-write
    may still be counted — accepted noise.
    """
    try:
        with p.open("rb") as f:
            return sum(1 for line in f if line.strip())
    except OSError:
        return 0


# Size of the newest-first ``recent_episodes`` window kept in state.
RECENT_EPISODES_LIMIT = 200


def _snapshot_state(
    data_root: Path,
    index_path: Path,
    alerts_path: Path,
    *,
    recent_limit: int = RECENT_EPISODES_LIMIT,
) -> dict:
    """Derive the canonical dashboard view from disk in one pass.

    Blocking; meant to be run via ``asyncio.to_thread``. The original
    author measured ~1 s for ~76k lines / 23 MiB on the Pi.

    Args:
        data_root: directory holding ``episodes/<host>/<files>``.
        index_path: episode index (JSONL), read front to back.
        alerts_path: alerts log (JSONL); only its lines are counted.
        recent_limit: how many of the newest index records to return;
            values <= 0 yield an empty window.

    Returns:
        A dict with ``total_episodes`` (parsed index objects),
        ``total_alerts``, ``host_counts`` (files per host directory —
        taken from the filesystem listing, so immune to JSON parse
        hiccups), ``total_bytes`` (sum of numeric ``size_bytes``) and
        ``recent_episodes`` (newest first). Missing or unreadable
        inputs contribute zeros / empty containers instead of raising.
    """
    host_counts: dict[str, int] = {}
    episodes_root = data_root / "episodes"
    if episodes_root.is_dir():
        try:
            for host_dir in episodes_root.iterdir():
                if not host_dir.is_dir():
                    continue
                try:
                    host_counts[host_dir.name] = sum(
                        1 for entry in host_dir.iterdir() if entry.is_file()
                    )
                except OSError:
                    continue
        except OSError:
            # Listing vanished or became unreadable mid-scan: keep
            # whatever was counted so far.
            pass

    total_episodes = 0
    total_bytes = 0
    # A bounded deque keeps only the trailing window while we stream
    # the whole index, without manual periodic truncation.
    recent: deque[dict] = deque(maxlen=max(0, recent_limit))
    try:
        with index_path.open("r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(rec, dict):
                    # A bare scalar/array is valid JSON but not a record;
                    # without this guard ``rec.get`` would raise and
                    # every snapshot cycle would fail.
                    continue
                total_episodes += 1
                sb = rec.get("size_bytes")
                if isinstance(sb, (int, float)):
                    try:
                        total_bytes += int(sb)
                    except (ValueError, OverflowError):
                        # json accepts NaN / Infinity literals; they
                        # cannot contribute to a byte total.
                        pass
                recent.append({
                    "episode_id": rec.get("episode_id"),
                    "host_id": rec.get("host_id"),
                    "received_at": rec.get("received_at_wall"),
                    "size_bytes": sb,
                    "sha256": rec.get("sha256"),
                })
    except OSError:
        # Missing or unreadable index: report what we have (zeros).
        pass

    newest_first = list(recent)
    newest_first.reverse()  # newest-first for the UI

    return {
        "total_episodes": total_episodes,
        "total_alerts": _count_lines(alerts_path),
        "host_counts": host_counts,
        "total_bytes": total_bytes,
        "recent_episodes": newest_first,
    }


async def snapshot_loop(
    broadcaster,
    *,
    data_root: Path,
    index_path: Path,
    alerts_path: Path,
    poll_interval: float = 30.0,
) -> None:
    """Periodically re-derive state from disk and broadcast it.

    Every ``poll_interval`` seconds ``_snapshot_state`` is run in a
    worker thread; its result is merged into ``broadcaster.state`` (so
    reconnecting clients start from disk truth rather than the
    in-session delta) and published as a ``snapshot`` event for
    already-connected widgets that want a hard reset. The first
    successful snapshot is also summarised in the log. Errors are
    logged and the loop carries on; cancellation propagates.
    """
    announced = False
    while True:
        try:
            disk_view = await asyncio.to_thread(
                _snapshot_state, data_root, index_path, alerts_path
            )
            # Merge rather than replace: keys owned by other loops
            # (phase_mix, anything added later) must outlive a cycle.
            broadcaster.state.update(disk_view)
            if not announced:
                log.info(
                    "snapshot: total_episodes=%d total_alerts=%d hosts=%d",
                    disk_view["total_episodes"],
                    disk_view["total_alerts"],
                    len(disk_view["host_counts"]),
                )
                announced = True
            event = {"type": "snapshot"}
            event.update(disk_view)
            await broadcaster.publish(event)
        except asyncio.CancelledError:
            raise
        except Exception:
            log.exception("snapshot_loop error")
        await asyncio.sleep(poll_interval)


# ─────────────────────────────────────────────────────────────────────
# Phase mix (dataset)
# ─────────────────────────────────────────────────────────────────────
#
# The baseline scene shows the proportion of time the workload spent
# in each labelled phase. Originally a rolling 5-min window of live
# `phase` events — but live events only flow when the orchestrator is
# running. To keep the slide reflecting *actual data* we sample N
# random episode tarballs on disk, extract the labels.jsonl from
# each (which is a list of phase-transition events stamped with
# t_mono_ns), and aggregate phase durations across the sample.

# Default number of episode tarballs sampled per phase-mix run (the
# default ``sample`` of ``_compute_phase_mix`` and ``phase_mix_loop``).
PHASE_MIX_SAMPLE = 500
# Default pause between phase-mix runs (``phase_mix_loop``'s default
# ``poll_interval``).
PHASE_MIX_INTERVAL = 600.0  # seconds; ~10 min


def _read_episode_labels(path: Path, *, timeout: float = 10.0) -> list[dict] | None:
    """Stream-extract ``labels.jsonl`` from a zstd-compressed episode tarball.

    Pipes ``zstd -dc`` into ``tarfile`` in streaming mode so the scan
    can stop at the first member whose basename is ``labels.jsonl``
    and kill zstd straight away, instead of decompressing the whole
    archive.

    Args:
        path: the ``.tar.zst`` file to inspect.
        timeout: seconds to wait for the (already killed) zstd process
            to be reaped.

    Returns:
        The JSON objects found in ``labels.jsonl`` (blank, undecodable
        and non-object lines are dropped); an empty list if the member
        holds no usable line; ``None`` if ``path`` is not a regular
        file, zstd cannot be started, the archive is unreadable, or it
        has no ``labels.jsonl``.
    """
    if not path.is_file():
        return None
    try:
        proc = subprocess.Popen(
            # "--" stops option parsing, so a path can never be read as a flag.
            ["zstd", "-dc", "--", str(path)],
            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
        )
    except OSError:
        return None
    out: list[dict] | None = None
    try:
        with tarfile.open(fileobj=proc.stdout, mode="r|") as tar:
            for member in tar:
                if not member.isfile():
                    continue
                if member.name.rsplit("/", 1)[-1] != "labels.jsonl":
                    continue
                f = tar.extractfile(member)
                if f is None:
                    continue
                data = f.read()
                out = []
                for line in data.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        rec = json.loads(line)
                    except ValueError:
                        # Covers JSONDecodeError *and* the
                        # UnicodeDecodeError json raises for bytes that
                        # are not valid UTF-8; one bad line must not
                        # abort the whole sampling run.
                        continue
                    if isinstance(rec, dict):
                        out.append(rec)
                break
    except (tarfile.TarError, OSError):
        out = None
    finally:
        # Best-effort teardown: none of these steps may mask the result.
        if proc.stdout:
            try:
                proc.stdout.close()
            except Exception:
                pass
        try:
            proc.kill()  # short-circuit zstd; we already have what we need
        except Exception:
            pass
        try:
            proc.wait(timeout=timeout)
        except Exception:
            pass
    return out


def _aggregate_episode_labels(
    records: list[dict],
    counts: dict[str, int],
    weighted: dict[str, float],
) -> int:
    """Fold one episode's labels into the running counters.

    Records are phase-transition events ordered by ``t_mono_ns``
    (nanoseconds). The phase of record ``i`` lasts until record
    ``i + 1``; that span, in seconds and clamped at zero, is added to
    ``weighted[phase]`` and ``counts[phase]`` is bumped. The last
    record only gets a count bump: its end time is unknown here, which
    the original author judged an acceptable bias for a proportional
    display.

    Entries that are not dicts are ignored, and a missing or
    non-numeric ``t_mono_ns`` sorts as 0 — so a malformed label file
    cannot raise and abort a whole sampling run.

    Args:
        records: decoded lines of one ``labels.jsonl``.
        counts: per-phase label counter, mutated in place.
        weighted: per-phase duration in seconds, mutated in place.

    Returns:
        How many labels were counted from this episode.
    """
    def sort_key(rec: dict) -> float:
        stamp = rec.get("t_mono_ns")
        return stamp if isinstance(stamp, (int, float)) else 0

    ordered = sorted((r for r in records if isinstance(r, dict)), key=sort_key)
    if not ordered:
        return 0

    n_labels = 0
    # Consecutive pairs; with a single record this loop body never runs
    # and only the tail bump below applies.
    for current, following in zip(ordered, ordered[1:]):
        phase = current.get("phase")
        if not phase:
            continue
        t0 = current.get("t_mono_ns")
        t1 = following.get("t_mono_ns")
        if not (isinstance(t0, (int, float)) and isinstance(t1, (int, float))):
            continue
        dur_s = max(0.0, (float(t1) - float(t0)) / 1e9)
        weighted[phase] = weighted.get(phase, 0.0) + dur_s
        counts[phase] = counts.get(phase, 0) + 1
        n_labels += 1

    tail = ordered[-1].get("phase")
    if tail:
        counts[tail] = counts.get(tail, 0) + 1
        n_labels += 1
    return n_labels


def _compute_phase_mix(data_root: Path, sample: int = PHASE_MIX_SAMPLE) -> dict:
    """Aggregate phase durations over a random sample of episodes.

    Collects every ``*.tar.zst`` file under ``episodes/<host>/``, picks
    ``sample`` of them at random (all of them when there are no more
    than ``sample``), and folds each one's labels into shared counters.

    Returns ``{}`` when the episodes directory is missing, unreadable
    or holds no tarballs. Otherwise a dict with ``counts``,
    ``weighted_seconds``, ``sampled_episodes`` (episodes that
    contributed at least one label), ``population_episodes`` (tarballs
    found) and ``total_labels``.
    """
    root = data_root / "episodes"
    if not root.is_dir():
        return {}

    tarballs: list[Path] = []
    try:
        for host_dir in root.iterdir():
            if not host_dir.is_dir():
                continue
            try:
                for candidate in host_dir.iterdir():
                    if candidate.is_file() and candidate.name.endswith(".tar.zst"):
                        tarballs.append(candidate)
            except OSError:
                # Unreadable host directory: skip it, keep the others.
                continue
    except OSError:
        return {}

    if not tarballs:
        return {}
    population = len(tarballs)

    if sample >= population:
        picked = tarballs
    else:
        picked = random.sample(tarballs, sample)

    counts: dict[str, int] = {}
    weighted: dict[str, float] = {}
    contributing = 0
    label_total = 0
    for tarball in picked:
        labels = _read_episode_labels(tarball)
        if labels:
            added = _aggregate_episode_labels(labels, counts, weighted)
            if added:
                contributing += 1
                label_total += added

    return {
        "counts": counts,
        "weighted_seconds": weighted,
        "sampled_episodes": contributing,
        "population_episodes": population,
        "total_labels": label_total,
    }


async def phase_mix_loop(
    broadcaster,
    *,
    data_root: Path,
    poll_interval: float = PHASE_MIX_INTERVAL,
    sample: int = PHASE_MIX_SAMPLE,
) -> None:
    """Refresh the dataset phase mix on a slow timer.

    Each cycle runs ``_compute_phase_mix`` in a worker thread (the
    zstd decompress + tar parse of a few hundred episodes is slow
    enough to be noticeable on a Pi, per the original author). A
    non-empty result is stored under ``broadcaster.state["phase_mix"]``,
    published as a ``phase_mix`` event and summarised in the log; an
    empty one is only logged. Errors are logged and the loop
    continues; cancellation propagates.
    """
    while True:
        try:
            mix = await asyncio.to_thread(_compute_phase_mix, data_root, sample)
            if not mix:
                log.info("phase_mix: no episodes on disk yet, retrying")
            else:
                broadcaster.state["phase_mix"] = mix
                event = {"type": "phase_mix"}
                event.update(mix)
                await broadcaster.publish(event)
                rounded = {
                    phase: round(seconds, 1)
                    for phase, seconds in mix.get("weighted_seconds", {}).items()
                }
                log.info(
                    "phase_mix: %d/%d episodes sampled, %d labels, weighted=%s",
                    mix.get("sampled_episodes", 0),
                    mix.get("population_episodes", 0),
                    mix.get("total_labels", 0),
                    rounded,
                )
        except asyncio.CancelledError:
            raise
        except Exception:
            log.exception("phase_mix_loop error")
        await asyncio.sleep(poll_interval)


# ─────────────────────────────────────────────────────────────────────
# Lifecycle
# ─────────────────────────────────────────────────────────────────────

def start_feeders(broadcaster, *, data_root: Path = DEFAULT_DATA_ROOT) -> list[asyncio.Task]:
    """Launch every feeder as an asyncio task and return the tasks.

    The tasks are created in the order snapshot, index tailer, alerts
    tailer, phase mix, reading ``index.jsonl`` / ``alerts.jsonl`` /
    ``episodes/`` from under ``data_root``. The caller owns the
    returned tasks and must cancel them on shutdown.
    """
    index_path = data_root / "index.jsonl"
    alerts_path = data_root / "alerts.jsonl"
    publish = broadcaster.publish

    tasks: list[asyncio.Task] = []
    tasks.append(asyncio.create_task(
        snapshot_loop(
            broadcaster,
            data_root=data_root,
            index_path=index_path,
            alerts_path=alerts_path,
        ),
        name="cis490.feeder.snapshot",
    ))
    tasks.append(asyncio.create_task(
        watch_index_jsonl(broadcaster, index_path),
        name="cis490.feeder.index",
    ))
    tasks.append(asyncio.create_task(
        watch_alerts_jsonl(publish, alerts_path),
        name="cis490.feeder.alerts",
    ))
    tasks.append(asyncio.create_task(
        phase_mix_loop(broadcaster, data_root=data_root),
        name="cis490.feeder.phase_mix",
    ))
    return tasks