workload audit trail: meta.sample + per-phase events + pre-kill probe

The elliott-lab episode showed a median of 20% CPU in every phase because the in-guest workload silently never fired — and there was no signal in events.jsonl to detect that from outside, so a trainer would treat the labels as ground truth and learn "all phases look identical". This commit closes the audit gap so the failure is visible in meta:

orchestrator/episode.py
  EpisodeConfig.sample: Sample | None — the manifest entry that drove this episode's workload selection. Stamped into meta.sample as {name, family, category, profile, kind, sha256} so trainers can join cleanly without re-deriving from events. None means the v1 yes-loop fallback path ran (and the trainer should treat the episode with appropriate skepticism).

tools/vm_load_controller.py
  VMLoadController gains an emit_event callable. Every phase now emits a workload_* event into the runner's events.jsonl:
    workload_setup      login + initial cleanup OK
    workload_killed     clean / dormant. Dormant carries a `pre_kill_probe` dict from inside the guest (`pgrep -c yes`, `pgrep -c sh`, /proc/loadavg) so the trainer can detect the elliott-lab failure mode where the workload never actually ran.
    workload_armed      armed handshake fired
    workload_infecting  dd urandom / payload write fired
    workload_started    infected_running command sent
    workload_failed     any of the above raised inside SerialClient (timeout, EOF, partial login). The runner would have silently swallowed the exception via its on_phase try/except; the audit row makes the failure detectable.
  Exceptions in shell calls surface as workload_failed events but do NOT propagate, matching the runner's existing on_phase contract.

tools/run_real_vm_demo.py
  Wires the controller's emit_event to the runner's emit_event via a small forward-reference closure (controller is built before runner; runner.emit_event needs to be the sink). Sample also flows into EpisodeConfig.sample so meta.sample matches what the controller actually ran.

Tests: 119 (was 106).
New cases:

tests/test_vm_load_controller.py (11 tests against a FakeSerial)
  - setup emits workload_setup
  - infected_running runs the v1 yes-loop AND emits workload_started
  - dormant probes BEFORE killing and stamps pre_kill_probe
  - dormant probe records "yes=0" (the elliott-lab fingerprint)
  - clean / armed / infecting all emit their respective events
  - serial.run() exception → workload_failed event, no propagation
  - sample-with-profile dispatches to exploits.workloads command (NOT the v1 yes-loop)
  - missing emit_event callback is a no-op (back-compat)

tests/test_episode.py (2 new)
  - meta.sample carries name/family/category/profile/kind/sha256 when EpisodeConfig.sample is set
  - meta.sample stays null in the v1 fallback path

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 02:12:34 -05:00 · 2026-04-30 02:12:34 -05:00 · d86502d950
commit d86502d950
parent 8753340ea3
5 changed files with 397 additions and 24 deletions
--- a/orchestrator/episode.py
+++ b/orchestrator/episode.py
@ -37,6 +37,7 @@ from pathlib import Path
 from typing import Callable

 from collectors import guest_agent, pcap, perf_qemu, proc_qemu, qmp
+from samples.manifest import Sample

 from .ulid import new_ulid

@ -77,6 +78,10 @@ class EpisodeConfig:
    # explicitly per-episode when the host supports it.
    enable_perf: bool = False
    perf_interval_ms: int = 100
+    # The Sample that drove this episode's workload selection. Stamped
+    # into meta.json so trainers can join episodes by family / kind
+    # without re-deriving from events. None = v1 yes-loop fallback.
+    sample: Sample | None = None
    # Snapshot/revert (Tier 0+):
    #   revert_at_start — before any phase walks, loadvm <snapshot_name>.
    #     Use this to drop the guest back to a known-good baseline at
@ -341,6 +346,17 @@ class EpisodeRunner:
        return observed

    def _initial_meta(self, started_at_wall: str) -> dict:
+        sample_meta: dict | None = None
+        if self.cfg.sample is not None:
+            s = self.cfg.sample
+            sample_meta = {
+                "name": s.name,
+                "family": s.family,
+                "category": s.category,
+                "profile": s.profile,
+                "kind": s.kind,
+                "sha256": s.sha256,
+            }
        return {
            "episode_id": self.episode_id,
            "schema_version": SCHEMA_VERSION,
@ -359,7 +375,7 @@ class EpisodeRunner:
                "target_pid": self.cfg.target_pid,
            },
            "exploit": None,
-            "sample": None,
+            "sample": sample_meta,
            "schedule": {
                "baseline_seconds": self.cfg.duration_s,
                "interval_ms": self.cfg.interval_ms,
--- a/tests/test_episode.py
+++ b/tests/test_episode.py
@ -74,6 +74,57 @@ def test_episode_id_can_be_overridden(tmp_path: Path) -> None:
    assert result.episode_dir == tmp_path / "episodes" / "01TEST"


+def test_meta_sample_records_full_sample_when_passed(tmp_path: Path) -> None:
+    """EpisodeConfig.sample → meta.sample carries identity + kind so
+    trainers can join episodes by family/sha256 without re-deriving
+    from events. With no Sample, meta.sample stays null."""
+    import os as _os
+
+    from samples.manifest import Sample
+
+    s = Sample(
+        name="xmrig-cryptominer",
+        family="XMRig",
+        category="cryptominer",
+        profile="cpu-saturate",
+        sha256="abc" * 21 + "d",  # 64 hex
+        source="MalwareBazaar",
+    )
+    cfg = EpisodeConfig(
+        target_pid=_os.getpid(),
+        duration_s=0.1,
+        interval_ms=50,
+        data_root=tmp_path,
+        sample=s,
+    )
+    result = EpisodeRunner(cfg).run()
+
+    meta = json.loads((result.episode_dir / "meta.json").read_text())
+    assert meta["sample"] is not None
+    assert meta["sample"]["name"] == "xmrig-cryptominer"
+    assert meta["sample"]["family"] == "XMRig"
+    assert meta["sample"]["category"] == "cryptominer"
+    assert meta["sample"]["profile"] == "cpu-saturate"
+    assert meta["sample"]["kind"] == "real"
+    assert meta["sample"]["sha256"] == "abc" * 21 + "d"
+
+
+def test_meta_sample_is_null_for_v1_path(tmp_path: Path) -> None:
+    """No sample passed → the v1 fallback path. meta.sample stays
+    null so trainers can detect (and filter out) info-less runs."""
+    import os as _os
+
+    cfg = EpisodeConfig(
+        target_pid=_os.getpid(),
+        duration_s=0.1,
+        interval_ms=50,
+        data_root=tmp_path,
+    )
+    result = EpisodeRunner(cfg).run()
+    meta = json.loads((result.episode_dir / "meta.json").read_text())
+    assert meta["sample"] is None
+
+
 def test_episode_writes_done_marker_last(tmp_path: Path) -> None:
    """done.marker should not appear until meta.json has ended_at_wall set."""
    cfg = EpisodeConfig(
--- a/tests/test_vm_load_controller.py
+++ b/tests/test_vm_load_controller.py
@ -0,0 +1,213 @@
+"""Tests for VMLoadController against a fake SerialClient.
+
+The controller's only job is to translate phases into shell commands
+on a serial console + emit audit events. The key invariants we
+encode here come from the elliott-lab incident where every phase
+median'd 20% CPU because the workload silently never fired:
+
+  - every set_phase emits some event (so absence in events.jsonl is
+    a hard signal)
+  - infected_running emits workload_started AFTER sending the load
+    command
+  - dormant emits workload_killed WITH a pre_kill_probe so trainers
+    can detect "the workload was never running"
+  - exceptions in the shell call surface as workload_failed; they
+    do NOT propagate (the runner's on_phase callback would swallow
+    them anyway, but we want the audit row regardless)
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+
+# Mirror the same path hack run_real_vm_demo.py uses so the tools/
+# module imports work.
+ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(ROOT))
+sys.path.insert(0, str(ROOT / "tools"))
+
+from samples.manifest import Sample
+from vm_load_controller import VMLoadController  # noqa: E402
+
+
+class FakeSerial:
+    """Records every shell command. Returns canned probe output."""
+
+    def __init__(self, probe_response: str = "yes=1\nsh=1\nloadavg=0.45") -> None:
+        self.calls: list[str] = []
+        self.probe_response = probe_response
+        self.fail_on: list[str] = []
+
+    def run(self, cmd: str, timeout_s: float = 10.0) -> str:
+        self.calls.append(cmd)
+        for substr in self.fail_on:
+            if substr in cmd:
+                raise RuntimeError(f"fake-serial: failing on {substr!r}")
+        if "pgrep -c yes" in cmd or "pgrep -c sh" in cmd or "loadavg" in cmd:
+            return self.probe_response
+        return ""
+
+
+# ---------------------------------------------------------------------------
+# Event emission — the audit trail
+# ---------------------------------------------------------------------------
+
+
+def test_setup_emits_workload_setup_event() -> None:
+    serial = FakeSerial()
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.setup()
+    names = [e for e, _ in events]
+    assert "workload_setup" in names
+    setup = next(kw for e, kw in events if e == "workload_setup")
+    assert setup["profile"] == "v1-yes"  # no Sample → fallback path
+    assert setup["sample"] is None
+
+
+def test_setup_records_profile_when_sample_present() -> None:
+    serial = FakeSerial()
+    s = Sample(name="x", family="X", category="rat", profile="cpu-saturate")
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, sample=s, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.setup()
+    setup = next(kw for e, kw in events if e == "workload_setup")
+    assert setup["profile"] == "cpu-saturate"
+    assert setup["sample"] == "x"
+
+
+def test_infected_running_emits_workload_started_after_command() -> None:
+    serial = FakeSerial()
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.set_phase("infected_running")
+
+    # The command was sent.
+    assert any("yes > /dev/null" in cmd for cmd in serial.calls), \
+        f"expected v1 yes-loop in serial calls; got {serial.calls}"
+    # And the audit event followed it.
+    started = [kw for e, kw in events if e == "workload_started"]
+    assert started, "workload_started event must fire"
+    assert started[0]["phase"] == "infected_running"
+    assert started[0]["profile"] == "v1-yes"
+
+
+def test_dormant_probes_before_killing() -> None:
+    """The pre_kill_probe is the load-bearing diagnostic: it tells the
+    trainer whether the workload was actually running before we
+    killed it. If pgrep returns 0 yes processes, the previous
+    infected_running was a no-op and the episode is filterable."""
+    serial = FakeSerial(probe_response="yes=2\nsh=1\nloadavg=1.32")
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.set_phase("dormant")
+
+    killed = [kw for e, kw in events if e == "workload_killed" and kw["phase"] == "dormant"]
+    assert killed, "dormant must emit workload_killed"
+    probe = killed[0].get("pre_kill_probe")
+    assert probe is not None
+    assert probe["yes"] == "2"
+    assert probe["loadavg"] == "1.32"
+
+
+def test_dormant_probe_records_zero_when_workload_never_ran() -> None:
+    """The exact symptom from elliott-lab: dormant probe shows 0
+    yes processes → trainer can flag this episode as workload-not-firing."""
+    serial = FakeSerial(probe_response="yes=0\nsh=1\nloadavg=0.18")
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.set_phase("dormant")
+    killed = next(kw for e, kw in events if e == "workload_killed" and kw["phase"] == "dormant")
+    assert killed["pre_kill_probe"]["yes"] == "0"
+
+
+def test_clean_phase_emits_workload_killed() -> None:
+    serial = FakeSerial()
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.set_phase("clean")
+    assert any(
+        e == "workload_killed" and kw["phase"] == "clean" for e, kw in events
+    ), "clean must emit workload_killed"
+
+
+def test_armed_emits_workload_armed_with_handshake_command() -> None:
+    serial = FakeSerial()
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.set_phase("armed")
+    assert any("armed-handshake" in cmd for cmd in serial.calls)
+    assert any(e == "workload_armed" for e, _ in events)
+
+
+def test_infecting_emits_workload_infecting_with_dd() -> None:
+    serial = FakeSerial()
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.set_phase("infecting")
+    assert any("dd if=/dev/urandom" in cmd for cmd in serial.calls)
+    assert any(e == "workload_infecting" for e, _ in events)
+
+
+# ---------------------------------------------------------------------------
+# Exception handling — failures must surface as events, not propagate
+# ---------------------------------------------------------------------------
+
+
+def test_command_failure_emits_workload_failed_and_does_not_raise() -> None:
+    """If the serial.run() raises (timeout, EOF, login bad), the
+    runner would silently swallow the exception. We want a hard
+    audit row in events.jsonl regardless."""
+    serial = FakeSerial()
+    serial.fail_on = ["yes > /dev/null"]
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
+    # Must NOT raise.
+    c.set_phase("infected_running")
+    failed = [kw for e, kw in events if e == "workload_failed"]
+    assert failed, "expected workload_failed event"
+    assert failed[0]["phase"] == "infected_running"
+    assert "fake-serial" in failed[0]["error"]
+
+
+# ---------------------------------------------------------------------------
+# Profile dispatch — Sample-driven workload picks the right command
+# ---------------------------------------------------------------------------
+
+
+def test_sample_with_profile_uses_workloads_module_command() -> None:
+    """When constructed with a Sample, infected_running runs the
+    profile's start_cmd (from exploits.workloads) — NOT the v1 yes-loop."""
+    s = Sample(name="x", family="X", category="cryptominer", profile="cpu-saturate")
+    serial = FakeSerial()
+    events: list[tuple[str, dict]] = []
+    c = VMLoadController(serial, sample=s, emit_event=lambda e, **kw: events.append((e, kw)))
+    c.set_phase("infected_running")
+
+    # The sample's workload script + the post-kill yes sweep both ran.
+    # The new workload is profile-shaped, not the simple yes-loop.
+    profile_command_seen = any(".cis490-workload-cpu-saturate" in cmd for cmd in serial.calls)
+    assert profile_command_seen, f"expected workload script in serial calls; got {serial.calls}"
+    started = next(kw for e, kw in events if e == "workload_started")
+    assert started["profile"] == "cpu-saturate"
+    assert started["sample"] == "x"
+
+
+# ---------------------------------------------------------------------------
+# Default emit (no callback supplied) is a no-op
+# ---------------------------------------------------------------------------
+
+
+def test_no_emit_callback_is_safe() -> None:
+    """Tests + code paths that don't pass an emitter shouldn't
+    crash. The default is a no-op lambda."""
+    serial = FakeSerial()
+    c = VMLoadController(serial)
+    # Should not raise.
+    c.setup()
+    c.set_phase("infected_running")
+    c.set_phase("dormant")
+    c.set_phase("clean")
--- a/tools/run_real_vm_demo.py
+++ b/tools/run_real_vm_demo.py
@ -169,7 +169,19 @@ def main() -> int:
        serial.connect()
        serial.login(boot_timeout_s=args.boot_timeout)

-        controller = VMLoadController(serial, sample=sample)
+        # Bind the controller to the runner's event log so workload
+        # success/failure shows up alongside phase_transition events.
+        # Sample also goes into EpisodeConfig below so meta.sample
+        # records what was supposed to run.
+        runner_for_emit = {"runner": None}
+        controller = VMLoadController(
+            serial,
+            sample=sample,
+            emit_event=lambda ev, **kw: (
+                runner_for_emit["runner"].emit_event(ev, **kw)
+                if runner_for_emit["runner"] else None
+            ),
+        )
        controller.setup()

        qmp_sock = run_dir / "qmp.sock"
@ -185,9 +197,15 @@ def main() -> int:
            qmp_socket=qmp_sock if qmp_sock.exists() else None,
            guest_agent_socket=agent_sock if agent_sock.exists() else None,
            bridge_iface=os.environ.get("BRIDGE") or None,
+            sample=sample,
        )

-        result = EpisodeRunner(cfg, on_phase=controller.set_phase).run()
+        runner = EpisodeRunner(cfg, on_phase=controller.set_phase)
+        # Connect the controller's event sink to the runner now that
+        # both exist. (Forward-reference closure pattern keeps the
+        # constructor argument order natural.)
+        runner_for_emit["runner"] = runner
+        result = runner.run()

        controller.teardown()
        serial.close()
--- a/tools/vm_load_controller.py
+++ b/tools/vm_load_controller.py
@ -24,6 +24,7 @@ from __future__ import annotations
 import logging
 import sys
 from pathlib import Path
+from typing import Callable

 from vm_serial import SerialClient

@ -37,6 +38,9 @@ from samples.manifest import Sample  # noqa: E402
 log = logging.getLogger("cis490.vm_load_controller")


+EmitEvent = Callable[..., None]
+
+
 class VMLoadController:
    """Drives a real Alpine guest through the phase schedule for
    Tier 2 (no exploit). Workload is chosen by ``sample.profile`` —
@ -44,17 +48,37 @@ class VMLoadController:
    produces matched envelopes whether or not an exploit fires.

    Without a sample, falls back to the original cpu-saturate yes-loop
-    (the original Tier-2 demo behaviour)."""
+    (the original Tier-2 demo behaviour).

-    def __init__(self, serial: SerialClient, sample: Sample | None = None) -> None:
+    Every set_phase call emits an event into the runner's events.jsonl
+    so we can audit (a) whether the workload command actually got
+    sent, (b) whether the guest acknowledged it, and (c) whether the
+    expected process is running afterwards. Without those events,
+    silent failures (login partial, command swallowed by tty) produce
+    well-labeled but information-less episodes — see CIS490 history
+    where every phase median'd 20% CPU on elliott-lab."""
+
+    def __init__(
+        self,
+        serial: SerialClient,
+        sample: Sample | None = None,
+        emit_event: EmitEvent | None = None,
+    ) -> None:
        self.s = serial
        self.sample = sample
        self.workload: Workload | None = workload_for(sample)
+        # No-op default so callers don't have to thread an emitter.
+        self.emit: EmitEvent = emit_event or (lambda *a, **kw: None)

    def setup(self) -> None:
        # Kill any pre-existing load and clear scratch space.
        self._kill_load()
        self.s.run("rm -f /tmp/payload /tmp/armed.log; echo setup-ok")
+        self.emit(
+            "workload_setup",
+            profile=self.workload.profile if self.workload else "v1-yes",
+            sample=self.sample.name if self.sample else None,
+        )

    def teardown(self) -> None:
        self._kill_load()
@ -64,27 +88,48 @@ class VMLoadController:
    def set_phase(self, phase: str) -> None:
        log.info("vm phase -> %s (profile=%s)",
                 phase, self.workload.profile if self.workload else "v1")
-        if phase == "clean":
-            self._kill_load()
-        elif phase == "armed":
-            self.s.run("echo armed-handshake-$(date +%s) > /tmp/armed.log")
-        elif phase == "infecting":
-            self.s.run(
-                "dd if=/dev/urandom of=/tmp/payload bs=4k count=128 2>/dev/null && "
-                "chmod +x /tmp/payload"
-            )
-        elif phase == "infected_running":
-            self._kill_load()
-            if self.workload is not None:
-                self.s.run(self.workload.start_cmd)
-            else:
+        try:
+            if phase == "clean":
+                self._kill_load()
+                self._emit_phase("workload_killed", phase)
+            elif phase == "armed":
+                self.s.run("echo armed-handshake-$(date +%s) > /tmp/armed.log")
+                self._emit_phase("workload_armed", phase)
+            elif phase == "infecting":
                self.s.run(
-                    "nohup sh -c 'yes > /dev/null' </dev/null >/dev/null 2>&1 & disown"
+                    "dd if=/dev/urandom of=/tmp/payload bs=4k count=128 2>/dev/null && "
+                    "chmod +x /tmp/payload"
                )
-        elif phase == "dormant":
-            self._kill_load()
-        else:
-            log.warning("unknown phase: %s", phase)
+                self._emit_phase("workload_infecting", phase)
+            elif phase == "infected_running":
+                self._kill_load()
+                if self.workload is not None:
+                    self.s.run(self.workload.start_cmd)
+                else:
+                    self.s.run(
+                        "nohup sh -c 'yes > /dev/null' </dev/null >/dev/null 2>&1 & disown"
+                    )
+                self._emit_phase("workload_started", phase)
+            elif phase == "dormant":
+                # Probe BEFORE we kill so we see whether the workload
+                # was actually running. If the probe says nothing was
+                # running, the previous infected_running was a no-op
+                # and the trainer should filter this episode.
+                probe = self._probe()
+                self._kill_load()
+                self._emit_phase("workload_killed", phase, pre_kill_probe=probe)
+            else:
+                log.warning("unknown phase: %s", phase)
+        except Exception as e:
+            # Don't propagate — the runner already swallows on_phase
+            # exceptions. But DO record so the episode is filterable.
+            log.exception("set_phase(%s) failed", phase)
+            self.emit(
+                "workload_failed",
+                phase=phase,
+                error=str(e)[:200],
+                profile=self.workload.profile if self.workload else "v1-yes",
+            )

    # ---- internals ------------------------------------------------------

@ -94,3 +139,33 @@ class VMLoadController:
        # Always sweep the v1 leftover commands too, in case we just
        # switched profiles mid-fleet-run.
        self.s.run("pkill yes 2>/dev/null; pkill stress-ng 2>/dev/null; true")
+
+    def _probe(self) -> dict:
+        """Ask the guest what is running: {yes, sh, loadavg} as strings,
+        or {"probe_error": ...} if the serial call raised. Counts use
+        `pgrep | wc -l`: `pgrep -c` prints 0 AND exits 1 on no match, so
+        `|| echo 0` yielded "0 0" (and busybox pgrep may lack -c; verify)."""
+        try:
+            out = self.s.run(
+                "echo yes=$(pgrep yes 2>/dev/null | wc -l); "
+                "echo sh=$(pgrep sh 2>/dev/null | wc -l); "
+                "echo loadavg=$(awk '{print $1}' /proc/loadavg)"
+            )
+            stats: dict = {}
+            for line in out.splitlines():
+                # Skip console noise; only key=value lines are probe data.
+                key, sep, value = line.strip().partition("=")
+                if sep:
+                    stats[key.strip()] = value.strip()
+            return stats
+        except Exception as e:  # best-effort: a probe must not abort the phase
+            return {"probe_error": str(e)[:120]}
+
+    def _emit_phase(self, event: str, phase: str, **extra) -> None:
+        self.emit(
+            event,
+            phase=phase,
+            profile=self.workload.profile if self.workload else "v1-yes",
+            sample=self.sample.name if self.sample else None,
+            **extra,
+        )