# CIS490/tests/test_episode.py

from __future__ import annotations

import json
import os
from pathlib import Path

import pytest

from orchestrator.episode import EpisodeConfig, EpisodeRunner


def _read_jsonl(p: Path) -> list[dict]:
    return [json.loads(l) for l in p.read_text().splitlines()]


def test_episode_against_self_pid_produces_full_directory(tmp_path: Path) -> None:
    """Run a half-second episode against this process and inspect every
    artifact written to the episode directory."""
    own_pid = os.getpid()
    outcome = EpisodeRunner(
        EpisodeConfig(
            target_pid=own_pid,
            duration_s=0.5,
            interval_ms=50,
            data_root=tmp_path,
        )
    ).run()

    episode_dir = outcome.episode_dir
    assert episode_dir.exists()
    for artifact in (
        "meta.json",
        "events.jsonl",
        "labels.jsonl",
        "telemetry-proc.jsonl",
        "done.marker",
    ):
        assert (episode_dir / artifact).exists()

    # --- meta.json -------------------------------------------------------
    meta_text = (episode_dir / "meta.json").read_text()
    meta = json.loads(meta_text)
    assert meta["episode_id"] == outcome.episode_id
    assert meta["schema_version"] == 1

    # code_version records which commit produced the episode, letting
    # trainers drop pre-fix data without opening every tarball.
    assert "code_version" in meta
    code_version = meta["code_version"]
    assert "commit" in code_version
    assert "source" in code_version
    # "git" when the tests run from a git checkout, "VERSION-file" when
    # run against /opt/cis490/, "unknown" on CI without git. Any of the
    # three is fine; what matters is that the field is present.
    accepted_sources = {"git", "VERSION-file", "unknown"}
    assert code_version["source"] in accepted_sources

    assert meta["started_at_wall"] is not None
    assert meta["ended_at_wall"] is not None
    assert meta["vm"]["target_pid"] == own_pid

    schedule = meta["schedule"]
    assert schedule["baseline_seconds"] == 0.5
    assert schedule["interval_ms"] == 50

    run_summary = meta["result"]
    assert run_summary["rows_proc"] == outcome.rows_proc
    assert "clean" in run_summary["phases_observed"]

    # --- labels.jsonl: a clean label must exist at t=0 --------------------
    label_rows = _read_jsonl(episode_dir / "labels.jsonl")
    assert any(
        label["phase"] == "clean" and label["t_mono_ns"] == 0
        for label in label_rows
    )

    # --- events.jsonl: both snapshot_load and episode_end are logged ------
    event_rows = _read_jsonl(episode_dir / "events.jsonl")
    seen_events = [entry["event"] for entry in event_rows]
    for expected_event in ("snapshot_load", "episode_end"):
        assert expected_event in seen_events

    # --- telemetry-proc.jsonl: ~10 ticks expected (50ms over 500ms) -------
    telemetry_rows = _read_jsonl(episode_dir / "telemetry-proc.jsonl")
    assert len(telemetry_rows) >= 5
    for sample_row in telemetry_rows:
        assert sample_row["source"] == "host_proc"
        assert sample_row["available_in_deployment"] is False
        assert sample_row["rss_bytes"] > 0


def test_episode_id_can_be_overridden(tmp_path: Path) -> None:
    """An explicit episode_id is used for both the result's id and the
    name of its directory under ``<data_root>/episodes``."""
    forced_id = "01TEST"
    runner = EpisodeRunner(
        EpisodeConfig(
            target_pid=os.getpid(),
            duration_s=0.1,
            interval_ms=50,
            data_root=tmp_path,
            episode_id=forced_id,
        )
    )
    outcome = runner.run()

    expected_dir = tmp_path / "episodes" / forced_id
    assert outcome.episode_id == forced_id
    assert outcome.episode_dir == expected_dir


def test_meta_sample_records_full_sample_when_passed(tmp_path: Path) -> None:
    """EpisodeConfig.sample → meta.sample carries identity + kind so
    trainers can join episodes by family/sha256 without re-deriving
    from events."""
    from samples.manifest import Sample

    # Built once so the value handed to Sample and the value asserted on
    # cannot drift apart.
    digest = "abc" * 21 + "d"  # 64 hex
    s = Sample(
        name="xmrig-cryptominer",
        family="XMRig",
        category="cryptominer",
        profile="cpu-saturate",
        sha256=digest,
        source="MalwareBazaar",
    )
    cfg = EpisodeConfig(
        # Uses the module-level ``os`` import, like the other tests here.
        target_pid=os.getpid(),
        duration_s=0.1,
        interval_ms=50,
        data_root=tmp_path,
        sample=s,
    )
    result = EpisodeRunner(cfg).run()

    meta = json.loads((result.episode_dir / "meta.json").read_text())
    recorded = meta["sample"]
    assert recorded is not None
    assert recorded["name"] == "xmrig-cryptominer"
    assert recorded["family"] == "XMRig"
    assert recorded["category"] == "cryptominer"
    assert recorded["profile"] == "cpu-saturate"
    # ``kind`` is not passed to Sample above; "real" is presumably its
    # default — confirm against samples.manifest.
    assert recorded["kind"] == "real"
    assert recorded["sha256"] == digest


def test_meta_sample_is_null_for_v1_path(tmp_path: Path) -> None:
    """No sample passed → the v1 fallback path. meta.sample stays
    null so trainers can detect (and filter out) info-less runs."""
    cfg = EpisodeConfig(
        # Uses the module-level ``os`` import, like the other tests here.
        target_pid=os.getpid(),
        duration_s=0.1,
        interval_ms=50,
        data_root=tmp_path,
    )
    result = EpisodeRunner(cfg).run()
    meta = json.loads((result.episode_dir / "meta.json").read_text())
    assert meta["sample"] is None


def test_episode_writes_done_marker_last(tmp_path: Path) -> None:
    """After a run, done.marker exists and meta.json has ended_at_wall set.

    Only the finished episode directory is inspected; the order in which
    the two files were written is not observed by this test.
    """
    outcome = EpisodeRunner(
        EpisodeConfig(
            target_pid=os.getpid(),
            duration_s=0.1,
            interval_ms=50,
            data_root=tmp_path,
        )
    ).run()

    marker_path = outcome.episode_dir / "done.marker"
    assert marker_path.exists()

    meta_text = (outcome.episode_dir / "meta.json").read_text()
    meta = json.loads(meta_text)
    assert meta["ended_at_wall"] is not None