# CIS490/tools/prune_episodes.py

"""``cis490-prune`` — retroactively filter low-quality episodes from
the receiver's dataset.

The signals that mark an episode as low-quality:

  no-sample          meta.sample is null. Pre-Sample-propagation code
                     (commit a193d17 or earlier) ran the v1 yes-loop
                     fallback regardless of what the fleet picked, so
                     post-infection variety isn't recorded in meta.

  no-workload-events events.jsonl has zero workload_* rows. Pre-audit-
                     trail code (commit d86502d or earlier) ran with
                     no event emission from VMLoadController, so we
                     can't tell whether the workload actually fired.

  workload-failed    events.jsonl contains a workload_failed row. The
                     SerialClient.run() raised mid-phase; the labels
                     and telemetry don't match what the orchestrator
                     was supposed to be doing.

  workload-silent    workload_killed event during the dormant phase
                     has pre_kill_probe.yes == "0", meaning no
                     ``yes``-loop process was running when we tried
                     to kill it. This is the elliott-lab fingerprint:
                     the schedule walked but nothing fired in-guest.
                     Only reported when flat-cpu also fires, because
                     the in-guest probe on its own is unreliable
                     (see CIS490#15).

  flat-cpu           /proc CPU% delta between phases is under 5
                     percentage points across all phase boundaries.
                     A model trained on these episodes can't
                     distinguish phases.

Usage:
    cis490-prune                     # dry-run summary, no changes
    cis490-prune --reason no-sample  # filter to one signal
    cis490-prune --archive           # mv flagged episodes to
                                     #   /var/lib/cis490/episodes-archive/
    cis490-prune --delete            # rm flagged episodes + index rows

Run from the receiver's host where /var/lib/cis490/ lives. Operator
runs as root because the episode store is owned by the cis490 user
mode 0640.
"""

from __future__ import annotations

import argparse
import io
import json
import os
import shutil
import statistics
import subprocess
import sys
import tarfile
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterator


# Canonical names of the low-quality signals described in the module
# docstring. This tuple supplies the allowed values for --reason, the
# default selection when --reason is omitted, and the keys of the
# per-reason tally in --json output.
_REASONS = (
    "no-sample",
    "no-workload-events",
    "workload-failed",
    "workload-silent",
    "flat-cpu",
)


@dataclass
class EpisodeQuality:
    """Quality verdict for one episode tarball.

    ``reasons`` accumulates every low-quality signal that fired for the
    episode; an empty list means nothing was flagged.
    """

    host_id: str
    episode_id: str
    tar_path: Path
    size_bytes: int
    # One entry per signal that fired for this episode.
    reasons: list[str] = field(default_factory=list)
    # Descriptive fields copied out of the episode's meta, when present.
    sample_name: str | None = None
    module_name: str | None = None

    @property
    def fake(self) -> bool:
        """True when at least one reason has been recorded."""
        return True if self.reasons else False


# ---------------------------------------------------------------------------
# tarball introspection
# ---------------------------------------------------------------------------


def _read_jsonl_from_tar(tar: tarfile.TarFile, name_suffix: str) -> list[dict]:
    """Parse the first regular-file member whose name ends with
    ``name_suffix`` (e.g. 'events.jsonl') as JSONL.

    Returns one parsed object per non-blank line, or ``[]`` when no
    member matches or the member cannot be opened. Undecodable bytes
    are replaced rather than raising.
    """
    member = next(
        (m for m in tar.getmembers() if m.name.endswith(name_suffix) and m.isfile()),
        None,
    )
    if member is None:
        return []
    handle = tar.extractfile(member)
    if handle is None:
        return []
    rows = []
    for line in handle.read().decode("utf-8", errors="replace").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows


def _read_meta_from_tar(tar: tarfile.TarFile) -> dict:
    """Return the parsed JSON of the first regular-file member whose
    name ends with 'meta.json'; ``{}`` when there is no such member or
    it cannot be opened."""
    matches = [
        m for m in tar.getmembers()
        if m.name.endswith("meta.json") and m.isfile()
    ]
    if not matches:
        return {}
    stream = tar.extractfile(matches[0])
    if stream is None:
        return {}
    payload = stream.read()
    return json.loads(payload.decode("utf-8"))


def _decompress_zstd(zst_path: Path) -> bytes:
    """Return the decompressed contents of ``zst_path``.

    Pure stdlib doesn't have zstd, so this shells out to the ``zstd``
    binary (already a project dep — install scripts require it).
    ``check=True`` makes a non-zero exit raise CalledProcessError.
    """
    argv = ["zstd", "-q", "-d", "--stdout", str(zst_path)]
    completed = subprocess.run(argv, capture_output=True, check=True)
    return completed.stdout


def _probe_reports_silent(events: list[dict]) -> bool:
    """True when a dormant-phase ``workload_killed`` event carries
    ``pre_kill_probe.yes == "0"`` (the in-guest probe saw no `yes` proc)."""
    for e in events:
        if e.get("event") != "workload_killed" or e.get("phase") != "dormant":
            continue
        probe = e.get("pre_kill_probe")
        if isinstance(probe, dict) and probe.get("yes") == "0":
            return True
    return False


def _cpu_is_flat(proc: list[dict], labels: list[dict]) -> bool:
    """True when the per-phase median /proc CPU% values span < 5 points.

    CPU% is derived from the jiffy delta between consecutive telemetry
    rows and bucketed by the phase active at the later row's timestamp.
    """
    clk_tck = os.sysconf("SC_CLK_TCK")

    def phase_at(t_ns: int) -> str:
        # Walk the labels in file order; the last one at or before t_ns
        # wins. Samples before the first label land in "(pre)".
        cur = "(pre)"
        for label in labels:
            if label["t_mono_ns"] <= t_ns:
                cur = label["phase"]
            else:
                break
        return cur

    per_phase: dict[str, list[float]] = {}
    prev = None
    for r in proc:
        if prev is not None:
            dt = (r["t_mono_ns"] - prev["t_mono_ns"]) / 1e9
            # Skip duplicate / out-of-order timestamps rather than
            # dividing by zero or producing negative rates.
            if dt > 0:
                djiff = (r["cpu_user_jiffies"] + r["cpu_sys_jiffies"]) - \
                        (prev["cpu_user_jiffies"] + prev["cpu_sys_jiffies"])
                pct = 100.0 * (djiff / clk_tck) / dt
                per_phase.setdefault(phase_at(r["t_mono_ns"]), []).append(pct)
        prev = r

    medians = [statistics.median(v) for v in per_phase.values() if v]
    return bool(medians) and (max(medians) - min(medians)) < 5.0


def classify_episode(tar_zst: Path, host_id: str, episode_id: str) -> EpisodeQuality:
    """Open the tarball, scan meta + events + telemetry, return a
    quality verdict. Each signal is independent — an episode can hit
    multiple reasons (e.g. no-sample + workload-silent).

    An episode that cannot be decompressed, is not a valid tar archive,
    or contains malformed JSON gets a single ``unreadable: ...`` reason
    (truncated to 80 chars) instead of aborting the caller's scan.
    """
    q = EpisodeQuality(
        host_id=host_id,
        episode_id=episode_id,
        tar_path=tar_zst,
        size_bytes=tar_zst.stat().st_size,
    )

    try:
        raw = _decompress_zstd(tar_zst)
        with tarfile.open(fileobj=io.BytesIO(raw)) as tar:
            meta = _read_meta_from_tar(tar)
            events = _read_jsonl_from_tar(tar, "events.jsonl")
            proc = _read_jsonl_from_tar(tar, "telemetry-proc.jsonl")
            labels = _read_jsonl_from_tar(tar, "labels.jsonl")
    except (
        subprocess.CalledProcessError,
        OSError,
        tarfile.TarError,   # corrupt / truncated archive
        EOFError,
        ValueError,         # json.JSONDecodeError, UnicodeDecodeError
    ) as e:
        q.reasons.append(f"unreadable: {e}"[:80])
        return q

    if not isinstance(meta, dict):
        q.reasons.append("unreadable: meta.json is not a JSON object")
        return q

    sample = meta.get("sample")
    if sample is None:
        q.reasons.append("no-sample")
    elif isinstance(sample, dict):
        q.sample_name = sample.get("name")

    exploit = meta.get("exploit")
    if isinstance(exploit, dict):
        q.module_name = exploit.get("module_name")

    workload_events = [e for e in events if str(e.get("event", "")).startswith("workload_")]
    if not workload_events:
        q.reasons.append("no-workload-events")
    if any(e.get("event") == "workload_failed" for e in events):
        q.reasons.append("workload-failed")

    # workload-silent (provisional): dormant transition's probe shows
    # no `yes` proc. This is a weak signal on its own — see CIS490#15:
    # busybox pgrep -c is unsupported, so pre-fix episodes always
    # report yes=0 even when the workload is saturating the vCPU. We
    # only confirm workload-silent when host-side /proc telemetry
    # AGREES that no signal is present (flat-cpu).
    probe_says_silent = _probe_reports_silent(events)

    # flat-cpu: bucket /proc CPU% by phase, check inter-phase spread.
    flat = bool(proc and labels) and _cpu_is_flat(proc, labels)
    if flat:
        q.reasons.append("flat-cpu")

    # If the probe said silent but /proc CPU% shows a real inter-phase
    # delta (i.e. NOT flat-cpu), trust the host-side ground truth and
    # discard the probe result — the probe is busybox-pgrep-broken.
    if probe_says_silent and flat:
        q.reasons.append("workload-silent")

    return q


# ---------------------------------------------------------------------------
# Index walking + actions
# ---------------------------------------------------------------------------


def walk_index(index_path: Path, episodes_root: Path) -> Iterator[tuple[dict, Path]]:
    """Yield ``(row, tarball_path)`` for every usable row of the index.

    Blank lines, lines that are not valid JSON, rows lacking a
    ``host_id`` or ``episode_id``, and rows whose
    ``<episodes_root>/<host_id>/<episode_id>.tar.zst`` does not exist
    are skipped. A missing index file yields nothing.
    """
    if not index_path.exists():
        return
    candidates = (ln for ln in index_path.read_text().splitlines() if ln.strip())
    for raw in candidates:
        try:
            record = json.loads(raw)
        except json.JSONDecodeError:
            continue
        host_id = record.get("host_id", "")
        episode_id = record.get("episode_id", "")
        if host_id and episode_id:
            archive = episodes_root / host_id / f"{episode_id}.tar.zst"
            if archive.exists():
                yield record, archive


def _rewrite_index_without(index_path: Path, dropped: set[tuple[str, str]]) -> None:
    """Rewrite the index in place, omitting rows whose
    ``(host_id, episode_id)`` pair is in ``dropped``.

    Lines that are not valid JSON (including blank lines) and JSON
    values that are not objects are preserved verbatim.
    """
    kept = []
    for line in index_path.read_text().splitlines():
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            kept.append(line)
            continue
        if isinstance(row, dict) and \
                (row.get("host_id"), row.get("episode_id")) in dropped:
            continue
        kept.append(line)

    # Rewrite via tempfile + replace so a crash mid-write doesn't
    # corrupt the live index. The replacement file would otherwise take
    # the invoking user's ownership and umask-derived mode — when prune
    # runs as root that leaves the index root:root and locks out the
    # cis490 receiver service (every PUT then 500s on _append_index).
    # Ownership and mode are applied to the temp file BEFORE the swap so
    # the live path is never observable with the wrong permissions.
    st = index_path.stat()
    tmp = index_path.with_suffix(".jsonl.partial")
    tmp.write_text("\n".join(kept) + ("\n" if kept else ""))
    try:
        os.chown(tmp, st.st_uid, st.st_gid)
    except OSError:
        # Best-effort: chown requires root, but if we got here as a
        # non-root user the original ownership matched ours anyway.
        pass
    # chmod after chown: chown may clear setuid/setgid bits.
    os.chmod(tmp, st.st_mode & 0o7777)
    os.replace(tmp, index_path)


def apply_action(
    quals: list[EpisodeQuality],
    *,
    action: str,
    archive_root: Path,
    index_path: Path,
) -> None:
    """Carry out --delete or --archive on flagged episodes + drop
    matching rows from index.jsonl.

    Args:
        quals: verdicts to act on; only entries with ``fake`` set are touched.
        action: ``"delete"`` or ``"archive"``; anything else is a no-op.
        archive_root: destination root for ``"archive"``; tarballs land
            in ``<archive_root>/<host_id>/``.
        index_path: index file to prune; skipped when it does not exist.

    Index rows are matched on ``(host_id, episode_id)`` so an episode id
    reused on another host is left alone. The index rewrite is
    single-shot after the tarballs are handled; if a move/delete fails
    part-way, the rows for the tarballs already handled are still
    dropped before the error propagates, keeping index and disk in step.

    NOTE(review): rows appended to the index by another process between
    the read and the replace would be lost — confirm the receiver is
    quiesced or add locking if that matters.
    """
    if action not in ("delete", "archive"):
        return
    flagged = [q for q in quals if q.fake]
    if not flagged:
        return

    if action == "archive":
        archive_root.mkdir(parents=True, exist_ok=True)

    handled: set[tuple[str, str]] = set()
    try:
        for q in flagged:
            if action == "archive":
                target = archive_root / q.host_id
                target.mkdir(parents=True, exist_ok=True)
                shutil.move(str(q.tar_path), target / q.tar_path.name)
            else:
                q.tar_path.unlink(missing_ok=True)
            handled.add((q.host_id, q.episode_id))
    finally:
        if handled and index_path.exists():
            _rewrite_index_without(index_path, handled)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main(argv: list[str] | None = None) -> int:
    """CLI entry point for ``cis490-prune``.

    Scans every indexed episode, prints a summary (or JSON with
    ``--json``), and optionally deletes/archives the flagged episodes.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        2 when the episodes directory is missing, 1 when at least one
        episode was flagged, 0 otherwise.
    """
    p = argparse.ArgumentParser(prog="cis490-prune")
    p.add_argument("--episodes-root", type=Path,
                   default=Path("/var/lib/cis490/episodes"))
    p.add_argument("--index", type=Path,
                   default=Path("/var/lib/cis490/index.jsonl"))
    p.add_argument("--archive-root", type=Path,
                   default=Path("/var/lib/cis490/episodes-archive"))
    p.add_argument("--reason", action="append", choices=_REASONS,
                   help="Only flag episodes matching this reason. Repeat "
                        "to OR multiple. Default: all reasons.")
    p.add_argument("--host", help="Only consider episodes from this host_id")
    mode_group = p.add_mutually_exclusive_group()
    mode_group.add_argument("--delete", action="store_true",
                            help="Remove flagged tarballs + drop their index rows")
    mode_group.add_argument("--archive", action="store_true",
                            help="Move flagged tarballs to --archive-root + drop index rows")
    p.add_argument("--json", action="store_true",
                   help="Machine-readable output instead of summary")
    args = p.parse_args(argv)

    if not args.episodes_root.exists():
        print(f"no episodes dir at {args.episodes_root}", file=sys.stderr)
        return 2

    selected_reasons = set(args.reason or _REASONS)

    quals: list[EpisodeQuality] = []
    unreadable = 0
    for row, tar in walk_index(args.index, args.episodes_root):
        if args.host and row["host_id"] != args.host:
            continue
        q = classify_episode(tar, row["host_id"], row["episode_id"])
        if any(r.startswith("unreadable") for r in q.reasons):
            unreadable += 1
        # Only mark "fake" if at least one of the selected reasons hits.
        # "unreadable: ..." is never a selectable reason, so episodes we
        # could not open are deliberately never deleted or archived.
        q.reasons = [r for r in q.reasons if r in selected_reasons]
        quals.append(q)

    if unreadable:
        # stderr so neither the summary nor --json output changes shape.
        print(f"warning: {unreadable} episode(s) could not be read and "
              f"were not classified", file=sys.stderr)

    flagged = [q for q in quals if q.fake]
    kept = [q for q in quals if not q.fake]

    if args.json:
        print(json.dumps({
            "scanned": len(quals),
            "flagged": len(flagged),
            "kept": len(kept),
            "by_reason": {
                r: sum(1 for q in flagged if r in q.reasons) for r in _REASONS
            },
            "flagged_episodes": [
                {
                    "host": q.host_id,
                    "episode": q.episode_id,
                    "size_bytes": q.size_bytes,
                    "reasons": q.reasons,
                    "sample": q.sample_name,
                    "module": q.module_name,
                } for q in flagged
            ],
        }, indent=2))
    else:
        print(f"scanned: {len(quals)}  flagged: {len(flagged)}  kept: {len(kept)}")
        if flagged:
            print()
            print(f"{'host':<14} {'episode':<28} {'size':>9} reasons")
            for q in flagged:
                print(f"{q.host_id:<14} {q.episode_id:<28} {q.size_bytes:>9}  "
                      f"{','.join(q.reasons)}")
        if not (args.delete or args.archive):
            print()
            print("dry-run only. Re-run with --archive (safer) or --delete.")

    if args.delete or args.archive:
        action = "delete" if args.delete else "archive"
        apply_action(
            quals,
            action=action,
            archive_root=args.archive_root,
            index_path=args.index,
        )
        # In --json mode stdout must contain nothing but the JSON
        # document, so the human-readable confirmation goes to stderr.
        confirm_stream = sys.stderr if args.json else sys.stdout
        print(f"\n{action}d {len(flagged)} episodes", file=confirm_stream)

    return 0 if not flagged else 1


if __name__ == "__main__":
    sys.exit(main())