End-to-end: ``python -m orchestrator --target-pid <pid> --duration N`` now
writes a complete episode directory matching docs/data-model.md, with phase
labels, events, and a 10 Hz host /proc telemetry stream. No VM yet — pid is
arbitrary so we can validate the loop against e.g. ``sleep 5`` while the lab
side comes up.
collectors/proc_qemu.py — parses /proc/<pid>/{stat,io,status} (handles parens
in comm), single-shot collect_once(), and a stop-event-driven run_loop()
that ticks at a fixed cadence and exits when the pid disappears. Tagged
``available_in_deployment: false`` per the threat-model doc.
orchestrator/episode.py — EpisodeRunner: creates data/episodes/<ulid>/,
atomic meta.json, events.jsonl + labels.jsonl writers, drives the collector
in a thread for duration_s, writes done.marker last so the shipper never
sees a half-finished episode.
orchestrator/ulid.py — tiny 26-char Crockford-base32 ULID generator.
Time-sortable, no third-party dep.
orchestrator/__main__.py — CLI entry point.
Tests (15 new, 28 total green):
- proc_qemu: real-ish stat with parens-in-comm, missing /proc/<pid>/io,
missing pid, run_loop cadence, run_loop terminates when pid disappears.
- episode: full directory shape against os.getpid(), id override,
done.marker written after meta.json finalize.
- ulid: length+alphabet, 2000-burst uniqueness, time-sortability.
Smoke-tested against ``sleep 10``: 16 rows over 1.5s at 100ms cadence,
monotonic clock, RSS stable at ~3.5 MiB as expected for an idle sleep.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
207 lines
6.4 KiB
Python
207 lines
6.4 KiB
Python
"""EpisodeRunner — minimum-viable single-episode driver.
|
|
|
|
This v0 runner does NOT boot a VM yet. It samples a target pid that you
|
|
provide on the command line, labels the entire window as ``clean``, and
|
|
writes the full per-episode directory shape from ``docs/data-model.md``:
|
|
|
|
data/episodes/<ulid>/
|
|
meta.json
|
|
events.jsonl
|
|
labels.jsonl
|
|
telemetry-proc.jsonl
|
|
done.marker
|
|
|
|
The point of v0 is to validate the directory shape, the JSONL schemas, the
|
|
collector loop, and (next step) the shipper, *before* the VM lab is wired
|
|
up. Once the VM bring-up exists, the runner will be extended with phase
|
|
transitions (armed → infecting → infected_running → dormant).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import threading
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from collectors import proc_qemu
|
|
|
|
from .ulid import new_ulid
|
|
|
|
|
|
# Logger for this module, registered under a fixed name rather than __name__.
log: logging.Logger = logging.getLogger("cis490.orchestrator")

# Written into every episode's meta.json under the "schema_version" key.
SCHEMA_VERSION: int = 1
|
|
|
|
|
|
@dataclass
class EpisodeConfig:
    """Inputs for one episode run.

    ``target_pid``, ``duration_s`` and ``data_root`` are required; every
    other field has a default.
    """

    # Pid that is sampled by the proc collector and recorded in meta.json.
    target_pid: int
    # Maximum time, in seconds, the runner waits before stopping the collector.
    duration_s: float
    # Root directory; the episode is written to <data_root>/episodes/<id>/.
    data_root: Path
    # Sampling interval in milliseconds, forwarded to the collector loop.
    interval_ms: int = 100
    # Value stored in meta.json under vm.image_name.
    image_name: str = "(stub)"
    # Value stored under vm.snapshot_name and in the snapshot_load event.
    snapshot_name: str = "(none-stub)"
    # Explicit episode id; when falsy the runner generates one with new_ulid().
    episode_id: str | None = None
|
|
|
|
|
|
@dataclass
class EpisodeResult:
    """Summary returned by ``EpisodeRunner.run`` once an episode is finalized."""

    # Id of the episode (also the name of its directory).
    episode_id: str
    # Directory that holds the episode's files.
    episode_dir: Path
    # Row count reported by the proc collector loop.
    rows_proc: int
    # True when the target pid was no longer alive at the end of the run.
    pid_disappeared: bool
    # Monotonic time, in seconds, between the start of run() and its end event.
    duration_observed_s: float
|
|
|
|
|
|
class EpisodeRunner:
    """Run a single episode and write its on-disk directory.

    The runner samples ``cfg.target_pid`` through ``proc_qemu.run_loop`` on a
    background thread for at most ``cfg.duration_s`` seconds, labels the whole
    window ``clean``, and writes ``meta.json``, ``events.jsonl``,
    ``labels.jsonl`` and finally ``done.marker`` into
    ``<data_root>/episodes/<episode_id>/``.
    """

    # Seconds run() waits for the collector thread to exit after stop is set.
    _JOIN_TIMEOUT_S = 2.0

    def __init__(self, cfg: EpisodeConfig) -> None:
        """Bind the configuration and resolve the episode id and directory.

        Args:
            cfg: Episode configuration. When ``cfg.episode_id`` is falsy a
                fresh id is generated with ``new_ulid()``.
        """
        self.cfg = cfg
        self.episode_id = cfg.episode_id or new_ulid()
        self.episode_dir: Path = cfg.data_root / "episodes" / self.episode_id
        self._t_mono_origin_ns: int = 0
        self._stop = threading.Event()

    # ---- public ---------------------------------------------------------

    def run(self) -> EpisodeResult:
        """Execute the episode and return a summary of what was recorded.

        Blocks until ``cfg.duration_s`` elapses, ``stop()`` is called, or the
        collector loop returns on its own (for example because the target pid
        went away), whichever happens first. ``done.marker`` is created only
        after the final ``meta.json`` has been written.

        Returns:
            An ``EpisodeResult`` with the row count, pid liveness and the
            observed duration.
        """
        self.episode_dir.mkdir(parents=True, exist_ok=True)
        self._t_mono_origin_ns = time.monotonic_ns()
        started_at_wall = datetime.now(timezone.utc).isoformat()

        meta = self._initial_meta(started_at_wall)
        self._write_meta(meta)

        self._emit_event(0, "snapshot_load", snapshot=self.cfg.snapshot_name)
        self._emit_label(0, "clean", prev=None, reason="snapshot_loaded")

        # Mutable holder so the worker thread can hand its row count back.
        rows_holder: dict[str, int] = {"rows": 0}

        def _collector() -> None:
            try:
                rows_holder["rows"] = proc_qemu.run_loop(
                    pid=self.cfg.target_pid,
                    output_path=self.episode_dir / "telemetry-proc.jsonl",
                    t_mono_origin_ns=self._t_mono_origin_ns,
                    interval_ms=self.cfg.interval_ms,
                    stop_event=self._stop,
                )
            except Exception:
                # Report through the module logger so a failed collector is
                # visible next to the rest of the episode's log output.
                log.exception(
                    "episode %s: proc collector failed", self.episode_id
                )
            finally:
                # Wake the waiter below: once the collector is gone there is
                # nothing left to sample, so sitting out the remainder of
                # duration_s would only delay finalization.
                self._stop.set()

        t = threading.Thread(target=_collector, daemon=True, name="proc_qemu")
        t.start()
        try:
            # Wait either for duration to elapse or for stop to be set.
            self._stop.wait(timeout=self.cfg.duration_s)
        finally:
            self._stop.set()
            t.join(timeout=self._JOIN_TIMEOUT_S)

        if t.is_alive():
            # The row count below is whatever the holder had (0) and the
            # collector may still be appending to its output file.
            log.warning(
                "episode %s: proc collector still running %.1fs after stop; "
                "row count may be incomplete",
                self.episode_id, self._JOIN_TIMEOUT_S,
            )

        end_mono_ns = time.monotonic_ns() - self._t_mono_origin_ns
        duration_observed_s = end_mono_ns / 1_000_000_000
        rows_proc = rows_holder["rows"]
        pid_alive = _pid_alive(self.cfg.target_pid)
        self._emit_event(
            end_mono_ns,
            "episode_end",
            target_pid_alive=pid_alive,
        )

        meta["ended_at_wall"] = datetime.now(timezone.utc).isoformat()
        meta["result"] = {
            "phases_observed": ["clean"],
            "rows_proc": rows_proc,
            "pid_alive_at_end": pid_alive,
            "duration_observed_s": duration_observed_s,
        }
        self._write_meta(meta)
        # Created last so its presence implies every other file is final.
        (self.episode_dir / "done.marker").touch()

        log.info(
            "episode %s complete: rows=%d duration=%.2fs",
            self.episode_id, rows_proc, duration_observed_s,
        )
        return EpisodeResult(
            episode_id=self.episode_id,
            episode_dir=self.episode_dir,
            rows_proc=rows_proc,
            pid_disappeared=not pid_alive,
            duration_observed_s=duration_observed_s,
        )

    def stop(self) -> None:
        """Request that a running episode end before its duration elapses."""
        self._stop.set()

    # ---- internals ------------------------------------------------------

    def _initial_meta(self, started_at_wall: str) -> dict:
        """Build the meta.json payload written at episode start.

        Args:
            started_at_wall: ISO-8601 wall-clock start timestamp.

        Returns:
            The metadata dict; ``ended_at_wall`` and ``result`` are ``None``
            until ``run`` fills them in.
        """
        return {
            "episode_id": self.episode_id,
            "schema_version": SCHEMA_VERSION,
            "started_at_wall": started_at_wall,
            "ended_at_wall": None,
            "host_fingerprint": {
                "kernel": os.uname().release,
                "qemu_version": None,
            },
            "vm": {
                "image_name": self.cfg.image_name,
                "image_sha256": None,
                "snapshot_name": self.cfg.snapshot_name,
                "vcpus": None,
                "ram_mib": None,
                "target_pid": self.cfg.target_pid,
            },
            "exploit": None,
            "sample": None,
            "schedule": {
                "baseline_seconds": self.cfg.duration_s,
                "infected_seconds": 0,
                "dormant_seconds": 0,
                "interval_ms": self.cfg.interval_ms,
            },
            "result": None,
        }

    def _write_meta(self, meta: dict) -> None:
        """Write ``meta`` to meta.json via a temp file and ``os.replace``.

        The payload goes to ``meta.json.partial`` first and is then renamed
        over ``meta.json``, so a reader never observes a partially written
        file under the final name.
        """
        path = self.episode_dir / "meta.json"
        tmp = path.with_suffix(".json.partial")
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2, sort_keys=True)
            f.write("\n")
        os.replace(tmp, path)

    def _append_jsonl(self, filename: str, row: dict) -> None:
        """Append ``row`` as one key-sorted JSON line to ``filename``."""
        with (self.episode_dir / filename).open("a", encoding="utf-8") as f:
            f.write(json.dumps(row, sort_keys=True) + "\n")

    def _emit_event(self, t_mono_ns: int, event: str, **extra) -> None:
        """Append one row to events.jsonl.

        Args:
            t_mono_ns: Monotonic offset from the episode origin, in ns.
            event: Event name.
            **extra: Additional fields merged into the row.
        """
        row = {
            "t_mono_ns": t_mono_ns,
            "t_wall_ns": time.time_ns(),
            "event": event,
            **extra,
        }
        self._append_jsonl("events.jsonl", row)

    def _emit_label(self, t_mono_ns: int, phase: str, prev: str | None, reason: str) -> None:
        """Append one phase-transition row to labels.jsonl.

        Args:
            t_mono_ns: Monotonic offset from the episode origin, in ns.
            phase: Phase being entered.
            prev: Phase being left, or ``None`` for the first label.
            reason: Short tag explaining the transition.
        """
        row = {
            "t_mono_ns": t_mono_ns,
            "t_wall_ns": time.time_ns(),
            "phase": phase,
            "prev": prev,
            "reason": reason,
        }
        self._append_jsonl("labels.jsonl", row)
|
|
|
|
|
|
def _pid_alive(pid: int) -> bool:
|
|
try:
|
|
os.kill(pid, 0)
|
|
return True
|
|
except (ProcessLookupError, PermissionError):
|
|
# PermissionError means the pid exists but we can't signal it.
|
|
return isinstance(_last_exception(), PermissionError)
|
|
|
|
|
|
def _last_exception() -> BaseException | None:
|
|
import sys
|
|
return sys.exc_info()[1]
|