In the elliott-lab episode, every phase showed a median of 20% CPU because
the in-guest workload silently never fired. Nothing in events.jsonl let
that be detected from outside, so a trainer would treat the labels as
ground truth and learn "all phases look identical".
This commit closes the audit gap so the failure is visible in meta:
orchestrator/episode.py
EpisodeConfig.sample: Sample | None — the manifest entry that
drove this episode's workload selection. Stamped into meta.sample
as {name, family, category, profile, kind, sha256} so trainers
can join cleanly without re-deriving from events. None means the
v1 yes-loop fallback path ran (and the trainer should treat the
episode with appropriate skepticism).
tools/vm_load_controller.py
VMLoadController gains an emit_event callable. Every phase now
emits a workload_* event into the runner's events.jsonl:
workload_setup     — login + initial cleanup OK
workload_killed    — clean / dormant. Dormant carries a `pre_kill_probe`
                     dict from inside the guest (`pgrep -c yes`,
                     `pgrep -c sh`, /proc/loadavg) so the trainer can
                     detect the elliott-lab failure mode where the
                     workload never actually ran.
workload_armed     — armed handshake fired
workload_infecting — dd urandom / payload write fired
workload_started   — infected_running command sent
workload_failed    — any of the above raised inside SerialClient
                     (timeout, EOF, partial login). The runner would
                     have silently swallowed the exception via its
                     on_phase try/except; the audit row makes the
                     failure detectable.
Exceptions in shell calls surface as workload_failed events but
do NOT propagate, matching the runner's existing on_phase
contract.
tools/run_real_vm_demo.py
Wires the controller's emit_event to the runner's emit_event via
a small forward-reference closure (controller is built before
runner; runner.emit_event needs to be the sink). Sample also
flows into EpisodeConfig.sample so meta.sample matches what the
controller actually ran.
Tests: 119 (was 106). New cases:
tests/test_vm_load_controller.py (11 tests against a FakeSerial)
- setup emits workload_setup
- infected_running runs the v1 yes-loop AND emits workload_started
- dormant probes BEFORE killing and stamps pre_kill_probe
- dormant probe records "yes=0" (the elliott-lab fingerprint)
- clean / armed / infecting all emit their respective events
- serial.run() exception → workload_failed event, no propagation
- sample-with-profile dispatches to exploits.workloads command
(NOT the v1 yes-loop)
- missing emit_event callback is a no-op (back-compat)
tests/test_episode.py (2 new)
- meta.sample carries name/family/category/profile/kind/sha256
when EpisodeConfig.sample is set
- meta.sample stays null in the v1 fallback path
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
139 lines
4.5 KiB
Python
139 lines
4.5 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from orchestrator.episode import EpisodeConfig, EpisodeRunner
|
|
|
|
|
|
def _read_jsonl(p: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of decoded records.

    Args:
        p: Path of the ``.jsonl`` file to read.

    Returns:
        One decoded object per non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the locale.
    # Iterating the file handle splits on real newlines only, whereas
    # str.splitlines() also breaks on characters such as U+2028 that may
    # legally appear unescaped inside a JSON string.
    with p.open(encoding="utf-8") as fh:
        # Skip blank / whitespace-only lines (e.g. a doubled trailing
        # newline); json.loads("") would otherwise raise.
        return [json.loads(line) for line in fh if line.strip()]
|
|
|
|
|
|
def test_episode_against_self_pid_produces_full_directory(tmp_path: Path) -> None:
    """Run a short episode against this process and inspect every artifact.

    Checks that the episode directory holds all expected files, that
    meta.json mirrors the config and the run result, and that the label,
    event and host-proc telemetry streams contain the expected rows.
    """
    own_pid = os.getpid()
    config = EpisodeConfig(
        target_pid=own_pid,
        duration_s=0.5,
        interval_ms=50,
        data_root=tmp_path,
    )
    result = EpisodeRunner(config).run()

    episode_dir = result.episode_dir
    assert episode_dir.exists()
    for artifact in (
        "meta.json",
        "events.jsonl",
        "labels.jsonl",
        "telemetry-proc.jsonl",
        "done.marker",
    ):
        assert (episode_dir / artifact).exists()

    # meta.json must echo the identity, schedule and outcome of the run.
    meta_text = (episode_dir / "meta.json").read_text()
    meta = json.loads(meta_text)
    assert meta["episode_id"] == result.episode_id
    assert meta["schema_version"] == 1
    assert meta["started_at_wall"] is not None
    assert meta["ended_at_wall"] is not None
    assert meta["vm"]["target_pid"] == own_pid
    schedule = meta["schedule"]
    assert schedule["baseline_seconds"] == 0.5
    assert schedule["interval_ms"] == 50
    outcome = meta["result"]
    assert outcome["rows_proc"] == result.rows_proc
    assert "clean" in outcome["phases_observed"]

    # labels.jsonl needs a "clean" label stamped at t=0.
    label_rows = _read_jsonl(episode_dir / "labels.jsonl")
    assert any(
        row["phase"] == "clean" and row["t_mono_ns"] == 0 for row in label_rows
    )

    # events.jsonl must contain both the opening and the closing event.
    seen_events = [
        record["event"] for record in _read_jsonl(episode_dir / "events.jsonl")
    ]
    for expected_event in ("snapshot_load", "episode_end"):
        assert expected_event in seen_events

    # 500ms sampled every 50ms is about 10 ticks; require at least half.
    telemetry = _read_jsonl(episode_dir / "telemetry-proc.jsonl")
    assert len(telemetry) >= 5
    for sample_row in telemetry:
        assert sample_row["source"] == "host_proc"
        assert sample_row["available_in_deployment"] is False
        assert sample_row["rss_bytes"] > 0
|
|
|
|
|
|
def test_episode_id_can_be_overridden(tmp_path: Path) -> None:
    """A caller-supplied episode_id names both the result and its directory."""
    forced_id = "01TEST"
    config = EpisodeConfig(
        target_pid=os.getpid(),
        duration_s=0.1,
        interval_ms=50,
        data_root=tmp_path,
        episode_id=forced_id,
    )
    outcome = EpisodeRunner(config).run()

    assert outcome.episode_id == forced_id
    # The episode directory is expected under <data_root>/episodes/<id>.
    assert outcome.episode_dir == tmp_path / "episodes" / forced_id
|
|
|
|
|
|
def test_meta_sample_records_full_sample_when_passed(tmp_path: Path) -> None:
    """EpisodeConfig.sample → meta.sample carries identity + kind so
    trainers can join episodes by family/sha256 without re-deriving
    from events.

    The "no Sample → null" case is covered separately by
    test_meta_sample_is_null_for_v1_path.
    """
    # Function-local import, as before: only this test needs Sample.
    from samples.manifest import Sample

    digest = "abc" * 21 + "d"  # 64 hex characters
    s = Sample(
        name="xmrig-cryptominer",
        family="XMRig",
        category="cryptominer",
        profile="cpu-saturate",
        sha256=digest,
        source="MalwareBazaar",
    )
    cfg = EpisodeConfig(
        # Use the module-level `os`, consistent with the other tests here.
        target_pid=os.getpid(),
        duration_s=0.1,
        interval_ms=50,
        data_root=tmp_path,
        sample=s,
    )
    result = EpisodeRunner(cfg).run()

    meta = json.loads((result.episode_dir / "meta.json").read_text())
    recorded = meta["sample"]
    assert recorded is not None
    assert recorded["name"] == "xmrig-cryptominer"
    assert recorded["family"] == "XMRig"
    assert recorded["category"] == "cryptominer"
    assert recorded["profile"] == "cpu-saturate"
    # `kind` is not passed to Sample above; presumably "real" is what
    # Sample reports for this input — confirm against samples.manifest.
    assert recorded["kind"] == "real"
    assert recorded["sha256"] == digest
|
|
|
|
|
|
def test_meta_sample_is_null_for_v1_path(tmp_path: Path) -> None:
    """No sample passed → the v1 fallback path. meta.sample stays
    null so trainers can detect (and filter out) info-less runs."""
    cfg = EpisodeConfig(
        # Use the module-level `os`, consistent with the other tests here;
        # the former function-local `import os as _os` was redundant.
        target_pid=os.getpid(),
        duration_s=0.1,
        interval_ms=50,
        data_root=tmp_path,
    )
    result = EpisodeRunner(cfg).run()

    meta = json.loads((result.episode_dir / "meta.json").read_text())
    assert meta["sample"] is None
|
|
|
|
|
|
def test_episode_writes_done_marker_last(tmp_path: Path) -> None:
    """After a run, done.marker exists and meta.json has ended_at_wall set.

    NOTE(review): the test name promises an ordering guarantee (marker
    written last), but only the final on-disk state is checked here.
    """
    config = EpisodeConfig(
        target_pid=os.getpid(),
        duration_s=0.1,
        interval_ms=50,
        data_root=tmp_path,
    )
    episode_dir = EpisodeRunner(config).run().episode_dir

    marker = episode_dir / "done.marker"
    assert marker.exists()

    meta_text = (episode_dir / "meta.json").read_text()
    meta = json.loads(meta_text)
    assert meta["ended_at_wall"] is not None
|