workload audit trail: meta.sample + per-phase events + pre-kill probe

The elliott-lab episode showed a median of 20% CPU in every phase
because the in-guest workload silently never fired — and there was no
signal in events.jsonl to detect that from outside, so a trainer would
treat the labels as ground truth and learn "all phases look identical".
This commit closes the audit gap so the failure is visible in
meta.json and events.jsonl:

orchestrator/episode.py
  EpisodeConfig.sample: Sample | None — the manifest entry that
  drove this episode's workload selection. Stamped into meta.sample
  as {name, family, category, profile, kind, sha256} so trainers
  can join cleanly without re-deriving from events. None means the
  v1 yes-loop fallback path ran (and the trainer should treat the
  episode with appropriate skepticism).

tools/vm_load_controller.py
  VMLoadController gains an emit_event callable. Every phase now
  emits a workload_* event into the runner's events.jsonl:
    workload_setup        login + initial cleanup OK
    workload_killed       clean / dormant. Dormant carries a
                          `pre_kill_probe` dict from inside the
                          guest (`pgrep -c yes`, `pgrep -c sh`,
                          /proc/loadavg) so the trainer can detect
                          the elliott-lab failure mode where the
                          workload never actually ran.
    workload_armed        armed handshake fired
    workload_infecting    dd urandom / payload write fired
    workload_started      infected_running command sent
    workload_failed       any of the above raised inside SerialClient
                          (timeout, EOF, partial login). The runner
                          would have silently swallowed the
                          exception via its on_phase try/except;
                          the audit row makes the failure detectable.
  Exceptions raised by shell calls inside set_phase surface as
  workload_failed events but do NOT propagate, matching the runner's
  existing on_phase contract. setup() is not wrapped, so a failure
  there still raises to the caller.

tools/run_real_vm_demo.py
  Wires the controller's emit_event to the runner's emit_event via
  a small forward-reference closure (controller is built before
  runner; runner.emit_event needs to be the sink). Sample also
  flows into EpisodeConfig.sample so meta.sample matches what the
  controller actually ran.

Tests: 119 (was 106). New cases:
  tests/test_vm_load_controller.py  (11 tests against a FakeSerial)
    - setup emits workload_setup
    - infected_running runs the v1 yes-loop AND emits workload_started
    - dormant probes BEFORE killing and stamps pre_kill_probe
    - dormant probe records "yes=0" (the elliott-lab fingerprint)
    - clean / armed / infecting all emit their respective events
    - serial.run() exception → workload_failed event, no propagation
    - sample-with-profile dispatches to exploits.workloads command
      (NOT the v1 yes-loop)
    - missing emit_event callback is a no-op (back-compat)
  tests/test_episode.py  (2 new)
    - meta.sample carries name/family/category/profile/kind/sha256
      when EpisodeConfig.sample is set
    - meta.sample stays null in the v1 fallback path

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
max 2026-04-30 02:12:34 -05:00
parent 8753340ea3
commit d86502d950
5 changed files with 397 additions and 24 deletions

View file

@ -37,6 +37,7 @@ from pathlib import Path
from typing import Callable
from collectors import guest_agent, pcap, perf_qemu, proc_qemu, qmp
from samples.manifest import Sample
from .ulid import new_ulid
@ -77,6 +78,10 @@ class EpisodeConfig:
# explicitly per-episode when the host supports it.
enable_perf: bool = False
perf_interval_ms: int = 100
# The Sample that drove this episode's workload selection. Stamped
# into meta.json so trainers can join episodes by family / kind
# without re-deriving from events. None = v1 yes-loop fallback.
sample: Sample | None = None
# Snapshot/revert (Tier 0+):
# revert_at_start — before any phase walks, loadvm <snapshot_name>.
# Use this to drop the guest back to a known-good baseline at
@ -341,6 +346,17 @@ class EpisodeRunner:
return observed
def _initial_meta(self, started_at_wall: str) -> dict:
sample_meta: dict | None = None
if self.cfg.sample is not None:
s = self.cfg.sample
sample_meta = {
"name": s.name,
"family": s.family,
"category": s.category,
"profile": s.profile,
"kind": s.kind,
"sha256": s.sha256,
}
return {
"episode_id": self.episode_id,
"schema_version": SCHEMA_VERSION,
@ -359,7 +375,7 @@ class EpisodeRunner:
"target_pid": self.cfg.target_pid,
},
"exploit": None,
"sample": None,
"sample": sample_meta,
"schedule": {
"baseline_seconds": self.cfg.duration_s,
"interval_ms": self.cfg.interval_ms,

View file

@ -74,6 +74,57 @@ def test_episode_id_can_be_overridden(tmp_path: Path) -> None:
assert result.episode_dir == tmp_path / "episodes" / "01TEST"
def test_meta_sample_records_full_sample_when_passed(tmp_path: Path) -> None:
    """A Sample given to EpisodeConfig is mirrored into meta.sample.

    Runs a very short episode against this process's own pid, reads
    meta.json back from the episode directory and checks that every
    identity field, plus ``kind``, was recorded.
    """
    import os as _os

    from samples.manifest import Sample

    digest = "abc" * 21 + "d"  # 64 hex
    sample = Sample(
        name="xmrig-cryptominer",
        family="XMRig",
        category="cryptominer",
        profile="cpu-saturate",
        sha256=digest,
        source="MalwareBazaar",
    )
    config = EpisodeConfig(
        target_pid=_os.getpid(),
        duration_s=0.1,
        interval_ms=50,
        data_root=tmp_path,
        sample=sample,
    )

    outcome = EpisodeRunner(config).run()
    meta_path = outcome.episode_dir / "meta.json"
    meta = json.loads(meta_path.read_text())

    assert meta["sample"] is not None
    expected_fields = (
        ("name", "xmrig-cryptominer"),
        ("family", "XMRig"),
        ("category", "cryptominer"),
        ("profile", "cpu-saturate"),
        ("kind", "real"),
        ("sha256", "abc" * 21 + "d"),
    )
    for field, expected in expected_fields:
        assert meta["sample"][field] == expected
def test_meta_sample_is_null_for_v1_path(tmp_path: Path) -> None:
    """Without a Sample, meta.sample is written as null.

    A null entry is how trainers recognise (and can filter out)
    episodes that ran the v1 fallback path.
    """
    import os as _os

    config = EpisodeConfig(
        target_pid=_os.getpid(),
        duration_s=0.1,
        interval_ms=50,
        data_root=tmp_path,
    )

    outcome = EpisodeRunner(config).run()
    meta_text = (outcome.episode_dir / "meta.json").read_text()

    assert json.loads(meta_text)["sample"] is None
def test_episode_writes_done_marker_last(tmp_path: Path) -> None:
"""done.marker should not appear until meta.json has ended_at_wall set."""
cfg = EpisodeConfig(

View file

@ -0,0 +1,213 @@
"""Tests for VMLoadController against a fake SerialClient.
The controller's only job is to translate phases into shell commands
on a serial console + emit audit events. The key invariants we
encode here come from the elliott-lab incident where every phase
median'd 20% CPU because the workload silently never fired:
- every set_phase emits some event (so absence in events.jsonl is
a hard signal)
- infected_running emits workload_started AFTER sending the load
command
- dormant emits workload_killed WITH a pre_kill_probe so trainers
can detect "the workload was never running"
- exceptions in the shell call surface as workload_failed; they
do NOT propagate (the runner's on_phase callback would swallow
them anyway, but we want the audit row regardless)
"""
from __future__ import annotations
import sys
from pathlib import Path
import pytest
# Mirror the same path hack run_real_vm_demo.py uses so the tools/
# module imports work.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
sys.path.insert(0, str(ROOT / "tools"))
from samples.manifest import Sample
from vm_load_controller import VMLoadController # noqa: E402
class FakeSerial:
    """In-memory stand-in for the serial console used by the controller.

    Every command passed to ``run`` is appended to ``calls``. Commands
    that look like the guest probe get ``probe_response`` back; any
    other command returns an empty string. Adding a substring to
    ``fail_on`` makes every command containing it raise RuntimeError.
    """

    def __init__(self, probe_response: str = "yes=1\nsh=1\nloadavg=0.45") -> None:
        self.calls: list[str] = []
        self.probe_response = probe_response
        self.fail_on: list[str] = []

    def run(self, cmd: str, timeout_s: float = 10.0) -> str:
        """Record ``cmd`` and return the canned reply for it.

        ``timeout_s`` is accepted for signature compatibility and is
        otherwise ignored.
        """
        self.calls.append(cmd)

        # The first configured failure trigger found in the command wins.
        trigger = next((needle for needle in self.fail_on if needle in cmd), None)
        if trigger is not None:
            raise RuntimeError(f"fake-serial: failing on {trigger!r}")

        probe_markers = ("pgrep -c yes", "pgrep -c sh", "loadavg")
        looks_like_probe = any(marker in cmd for marker in probe_markers)
        return self.probe_response if looks_like_probe else ""
# ---------------------------------------------------------------------------
# Event emission — the audit trail
# ---------------------------------------------------------------------------
def test_setup_emits_workload_setup_event() -> None:
    """setup() without a Sample reports the v1 fallback in workload_setup."""
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    controller = VMLoadController(FakeSerial(), emit_event=record)
    controller.setup()

    emitted_names = [event for event, _ in captured]
    assert "workload_setup" in emitted_names
    payload = next(fields for event, fields in captured if event == "workload_setup")
    assert payload["profile"] == "v1-yes"  # no Sample → fallback path
    assert payload["sample"] is None
def test_setup_records_profile_when_sample_present() -> None:
    """With a Sample attached, workload_setup names its profile and sample."""
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    console = FakeSerial()
    sample = Sample(name="x", family="X", category="rat", profile="cpu-saturate")
    controller = VMLoadController(console, sample=sample, emit_event=record)
    controller.setup()

    payload = next(fields for event, fields in captured if event == "workload_setup")
    assert payload["profile"] == "cpu-saturate"
    assert payload["sample"] == "x"
def test_infected_running_emits_workload_started_after_command() -> None:
    """infected_running sends the v1 yes-loop and reports workload_started."""
    console = FakeSerial()
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    controller = VMLoadController(console, emit_event=record)
    controller.set_phase("infected_running")

    # The command was sent.
    yes_loop_sent = any("yes > /dev/null" in cmd for cmd in console.calls)
    assert yes_loop_sent, \
        f"expected v1 yes-loop in serial calls; got {console.calls}"

    # And the audit event followed it.
    started_events = [fields for event, fields in captured if event == "workload_started"]
    assert started_events, "workload_started event must fire"
    first = started_events[0]
    assert first["phase"] == "infected_running"
    assert first["profile"] == "v1-yes"
def test_dormant_probes_before_killing() -> None:
    """The pre_kill_probe is the load-bearing diagnostic: it tells the
    trainer whether the workload was actually running before we
    killed it. If pgrep returns 0 yes processes, the previous
    infected_running was a no-op and the episode is filterable.

    Besides checking the probe payload, this asserts the order of the
    serial commands: a probe issued after the kill would always report
    zero processes and make the diagnostic worthless."""
    serial = FakeSerial(probe_response="yes=2\nsh=1\nloadavg=1.32")
    events: list[tuple[str, dict]] = []
    c = VMLoadController(serial, emit_event=lambda e, **kw: events.append((e, kw)))
    c.set_phase("dormant")

    killed = [kw for e, kw in events if e == "workload_killed" and kw["phase"] == "dormant"]
    assert killed, "dormant must emit workload_killed"
    probe = killed[0].get("pre_kill_probe")
    assert probe is not None
    assert probe["yes"] == "2"
    assert probe["loadavg"] == "1.32"

    # Ordering: the probe command (it reads /proc/loadavg) must reach the
    # guest before the first pkill sweep does.
    probe_at = next(
        (i for i, cmd in enumerate(serial.calls) if "loadavg" in cmd), None
    )
    kill_at = next(
        (i for i, cmd in enumerate(serial.calls) if "pkill" in cmd), None
    )
    assert probe_at is not None, f"no probe command sent; got {serial.calls}"
    assert kill_at is not None, f"no kill command sent; got {serial.calls}"
    assert probe_at < kill_at, f"probe must precede kill; got {serial.calls}"
def test_dormant_probe_records_zero_when_workload_never_ran() -> None:
    """The exact symptom from elliott-lab: the dormant probe shows 0
    yes processes, so a trainer can flag the episode as
    workload-not-firing."""
    console = FakeSerial(probe_response="yes=0\nsh=1\nloadavg=0.18")
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    controller = VMLoadController(console, emit_event=record)
    controller.set_phase("dormant")

    dormant_kill = next(
        fields
        for event, fields in captured
        if event == "workload_killed" and fields["phase"] == "dormant"
    )
    assert dormant_kill["pre_kill_probe"]["yes"] == "0"
def test_clean_phase_emits_workload_killed() -> None:
    """The clean phase reports a workload_killed event tagged phase=clean."""
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    def is_clean_kill(event: str, fields: dict) -> bool:
        return event == "workload_killed" and fields["phase"] == "clean"

    controller = VMLoadController(FakeSerial(), emit_event=record)
    controller.set_phase("clean")

    clean_kill_seen = any(is_clean_kill(event, fields) for event, fields in captured)
    assert clean_kill_seen, "clean must emit workload_killed"
def test_armed_emits_workload_armed_with_handshake_command() -> None:
    """The armed phase sends the handshake command and emits workload_armed."""
    console = FakeSerial()
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    controller = VMLoadController(console, emit_event=record)
    controller.set_phase("armed")

    handshake_sent = any("armed-handshake" in cmd for cmd in console.calls)
    assert handshake_sent
    armed_emitted = any(event == "workload_armed" for event, _ in captured)
    assert armed_emitted
def test_infecting_emits_workload_infecting_with_dd() -> None:
    """The infecting phase sends the dd payload write and emits workload_infecting."""
    console = FakeSerial()
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    controller = VMLoadController(console, emit_event=record)
    controller.set_phase("infecting")

    dd_sent = any("dd if=/dev/urandom" in cmd for cmd in console.calls)
    assert dd_sent
    infecting_emitted = any(event == "workload_infecting" for event, _ in captured)
    assert infecting_emitted
# ---------------------------------------------------------------------------
# Exception handling — failures must surface as events, not propagate
# ---------------------------------------------------------------------------
def test_command_failure_emits_workload_failed_and_does_not_raise() -> None:
    """A raising serial.run() becomes a workload_failed event, not an exception.

    The fake console is told to fail on the yes-loop command, so the
    infected_running phase hits the error path. set_phase must return
    normally and leave an audit row describing the failure.
    """
    console = FakeSerial()
    console.fail_on = ["yes > /dev/null"]
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    controller = VMLoadController(console, emit_event=record)

    # Must NOT raise.
    controller.set_phase("infected_running")

    failures = [fields for event, fields in captured if event == "workload_failed"]
    assert failures, "expected workload_failed event"
    first = failures[0]
    assert first["phase"] == "infected_running"
    assert "fake-serial" in first["error"]
# ---------------------------------------------------------------------------
# Profile dispatch — Sample-driven workload picks the right command
# ---------------------------------------------------------------------------
def test_sample_with_profile_uses_workloads_module_command() -> None:
    """A Sample with a profile makes infected_running send the profile's
    workload script, and workload_started names that profile and sample."""
    sample = Sample(name="x", family="X", category="cryptominer", profile="cpu-saturate")
    console = FakeSerial()
    captured: list[tuple[str, dict]] = []

    def record(event: str, **fields) -> None:
        captured.append((event, fields))

    controller = VMLoadController(console, sample=sample, emit_event=record)
    controller.set_phase("infected_running")

    # The profile-shaped workload script must appear among the commands
    # sent to the guest.
    script_sent = any(
        ".cis490-workload-cpu-saturate" in cmd for cmd in console.calls
    )
    assert script_sent, f"expected workload script in serial calls; got {console.calls}"

    started = next(fields for event, fields in captured if event == "workload_started")
    assert started["profile"] == "cpu-saturate"
    assert started["sample"] == "x"
# ---------------------------------------------------------------------------
# Default emit (no callback supplied) is a no-op
# ---------------------------------------------------------------------------
def test_no_emit_callback_is_safe() -> None:
    """A controller built without an emitter runs setup and phases without raising."""
    controller = VMLoadController(FakeSerial())

    # Should not raise.
    controller.setup()
    for phase in ("infected_running", "dormant", "clean"):
        controller.set_phase(phase)

View file

@ -169,7 +169,19 @@ def main() -> int:
serial.connect()
serial.login(boot_timeout_s=args.boot_timeout)
controller = VMLoadController(serial, sample=sample)
# Bind the controller to the runner's event log so workload
# success/failure shows up alongside phase_transition events.
# Sample also goes into EpisodeConfig below so meta.sample
# records what was supposed to run.
runner_for_emit = {"runner": None}
controller = VMLoadController(
serial,
sample=sample,
emit_event=lambda ev, **kw: (
runner_for_emit["runner"].emit_event(ev, **kw)
if runner_for_emit["runner"] else None
),
)
controller.setup()
qmp_sock = run_dir / "qmp.sock"
@ -185,9 +197,15 @@ def main() -> int:
qmp_socket=qmp_sock if qmp_sock.exists() else None,
guest_agent_socket=agent_sock if agent_sock.exists() else None,
bridge_iface=os.environ.get("BRIDGE") or None,
sample=sample,
)
result = EpisodeRunner(cfg, on_phase=controller.set_phase).run()
runner = EpisodeRunner(cfg, on_phase=controller.set_phase)
# Connect the controller's event sink to the runner now that
# both exist. (Forward-reference closure pattern keeps the
# constructor argument order natural.)
runner_for_emit["runner"] = runner
result = runner.run()
controller.teardown()
serial.close()

View file

@ -24,6 +24,7 @@ from __future__ import annotations
import logging
import sys
from pathlib import Path
from typing import Callable
from vm_serial import SerialClient
@ -37,6 +38,9 @@ from samples.manifest import Sample # noqa: E402
log = logging.getLogger("cis490.vm_load_controller")
EmitEvent = Callable[..., None]
class VMLoadController:
"""Drives a real Alpine guest through the phase schedule for
Tier 2 (no exploit). Workload is chosen by ``sample.profile``
@ -44,17 +48,37 @@ class VMLoadController:
produces matched envelopes whether or not an exploit fires.
Without a sample, falls back to the original cpu-saturate yes-loop
(the original Tier-2 demo behaviour)."""
(the original Tier-2 demo behaviour).
def __init__(self, serial: SerialClient, sample: Sample | None = None) -> None:
Every set_phase call emits an event into the runner's events.jsonl
so we can audit (a) whether the workload command actually got
sent, (b) whether the guest acknowledged it, and (c) whether the
expected process is running afterwards. Without those events,
silent failures (login partial, command swallowed by tty) produce
well-labeled but information-less episodes see CIS490 history
where every phase median'd 20% CPU on elliott-lab."""
def __init__(
    self,
    serial: SerialClient,
    sample: Sample | None = None,
    emit_event: EmitEvent | None = None,
) -> None:
    """Store the serial console, the optional sample and the event sink.

    The workload is resolved once from ``sample`` via ``workload_for``.
    When no ``emit_event`` is supplied, events are silently discarded.
    """

    def _discard(*a, **kw) -> None:
        return None

    self.s = serial
    self.sample = sample
    self.workload: Workload | None = workload_for(sample)
    # No-op default so callers don't have to thread an emitter.
    self.emit: EmitEvent = emit_event if emit_event else _discard
def setup(self) -> None:
# Kill any pre-existing load and clear scratch space.
self._kill_load()
self.s.run("rm -f /tmp/payload /tmp/armed.log; echo setup-ok")
self.emit(
"workload_setup",
profile=self.workload.profile if self.workload else "v1-yes",
sample=self.sample.name if self.sample else None,
)
def teardown(self) -> None:
self._kill_load()
@ -64,27 +88,48 @@ class VMLoadController:
def set_phase(self, phase: str) -> None:
log.info("vm phase -> %s (profile=%s)",
phase, self.workload.profile if self.workload else "v1")
if phase == "clean":
self._kill_load()
elif phase == "armed":
self.s.run("echo armed-handshake-$(date +%s) > /tmp/armed.log")
elif phase == "infecting":
self.s.run(
"dd if=/dev/urandom of=/tmp/payload bs=4k count=128 2>/dev/null && "
"chmod +x /tmp/payload"
)
elif phase == "infected_running":
self._kill_load()
if self.workload is not None:
self.s.run(self.workload.start_cmd)
else:
try:
if phase == "clean":
self._kill_load()
self._emit_phase("workload_killed", phase)
elif phase == "armed":
self.s.run("echo armed-handshake-$(date +%s) > /tmp/armed.log")
self._emit_phase("workload_armed", phase)
elif phase == "infecting":
self.s.run(
"nohup sh -c 'yes > /dev/null' </dev/null >/dev/null 2>&1 & disown"
"dd if=/dev/urandom of=/tmp/payload bs=4k count=128 2>/dev/null && "
"chmod +x /tmp/payload"
)
elif phase == "dormant":
self._kill_load()
else:
log.warning("unknown phase: %s", phase)
self._emit_phase("workload_infecting", phase)
elif phase == "infected_running":
self._kill_load()
if self.workload is not None:
self.s.run(self.workload.start_cmd)
else:
self.s.run(
"nohup sh -c 'yes > /dev/null' </dev/null >/dev/null 2>&1 & disown"
)
self._emit_phase("workload_started", phase)
elif phase == "dormant":
# Probe BEFORE we kill so we see whether the workload
# was actually running. If the probe says nothing was
# running, the previous infected_running was a no-op
# and the trainer should filter this episode.
probe = self._probe()
self._kill_load()
self._emit_phase("workload_killed", phase, pre_kill_probe=probe)
else:
log.warning("unknown phase: %s", phase)
except Exception as e:
# Don't propagate — the runner already swallows on_phase
# exceptions. But DO record so the episode is filterable.
log.exception("set_phase(%s) failed", phase)
self.emit(
"workload_failed",
phase=phase,
error=str(e)[:200],
profile=self.workload.profile if self.workload else "v1-yes",
)
# ---- internals ------------------------------------------------------
@ -94,3 +139,33 @@ class VMLoadController:
# Always sweep the v1 leftover commands too, in case we just
# switched profiles mid-fleet-run.
self.s.run("pkill yes 2>/dev/null; pkill stress-ng 2>/dev/null; true")
def _probe(self) -> dict:
"""Ask the guest what's actually running. Returns a small dict
the caller stamps into the event so trainers can detect the
"workload didn't fire" case from meta alone."""
try:
out = self.s.run(
"echo yes=$(pgrep -c yes 2>/dev/null || echo 0); "
"echo sh=$(pgrep -c sh 2>/dev/null || echo 0); "
"echo loadavg=$(awk '{print $1}' /proc/loadavg)"
)
stats: dict = {}
for line in out.splitlines():
line = line.strip()
if "=" not in line:
continue
k, _, v = line.partition("=")
stats[k.strip()] = v.strip()
return stats
except Exception as e:
return {"probe_error": str(e)[:120]}
def _emit_phase(self, event: str, phase: str, **extra) -> None:
self.emit(
event,
phase=phase,
profile=self.workload.profile if self.workload else "v1-yes",
sample=self.sample.name if self.sample else None,
**extra,
)