CIS490/orchestrator/episode.py
Maximus Gorog 064387b7a0 Add v0 orchestrator + first oracle collector (host /proc)
End-to-end: ``python -m orchestrator --target-pid <pid> --duration N`` now
writes a complete episode directory matching docs/data-model.md, with phase
labels, events, and a 10 Hz host /proc telemetry stream. No VM yet — pid is
arbitrary so we can validate the loop against e.g. ``sleep 5`` while the lab
side comes up.

collectors/proc_qemu.py — parses /proc/<pid>/{stat,io,status} (handles parens
in comm), single-shot collect_once(), and a stop-event-driven run_loop()
that ticks at a fixed cadence and exits when the pid disappears. Tagged
``available_in_deployment: false`` per the threat-model doc.

orchestrator/episode.py — EpisodeRunner: creates data/episodes/<ulid>/,
atomic meta.json, events.jsonl + labels.jsonl writers, drives the collector
in a thread for duration_s, writes done.marker last so the shipper never
sees a half-finished episode.

orchestrator/ulid.py — tiny 26-char Crockford-base32 ULID generator.
Time-sortable, no third-party dep.

orchestrator/__main__.py — CLI entry point.

Tests (15 new, 28 total green):
- proc_qemu: real-ish stat with parens-in-comm, missing /proc/<pid>/io,
  missing pid, run_loop cadence, run_loop terminates when pid disappears.
- episode: full directory shape against os.getpid(), id override,
  done.marker written after meta.json finalize.
- ulid: length+alphabet, 2000-burst uniqueness, time-sortability.

Smoke-tested against ``sleep 10``: 16 rows over 1.5s at 100ms cadence,
monotonic clock, RSS stable at ~3.5 MiB as expected for an idle sleep.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 23:40:25 -06:00

207 lines
6.4 KiB
Python

"""EpisodeRunner — minimum-viable single-episode driver.
This v0 runner does NOT boot a VM yet. It samples a target pid that you
provide on the command line, labels the entire window as ``clean``, and
writes the full per-episode directory shape from ``docs/data-model.md``:

    data/episodes/<ulid>/
        meta.json
        events.jsonl
        labels.jsonl
        telemetry-proc.jsonl
        done.marker

The point of v0 is to validate the directory shape, the JSONL schemas, the
collector loop, and (next step) the shipper, *before* the VM lab is wired
up. Once the VM bring-up exists, the runner will be extended with phase
transitions (armed → infecting → infected_running → dormant).
"""
from __future__ import annotations
import json
import logging
import os
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from collectors import proc_qemu
from .ulid import new_ulid
# Module-level logger used by the orchestrator for episode progress messages.
log = logging.getLogger("cis490.orchestrator")
# Written into every meta.json under the "schema_version" key.
SCHEMA_VERSION = 1
@dataclass
class EpisodeConfig:
    """Inputs for a single ``EpisodeRunner`` run.

    Attributes:
        target_pid: Process id handed to the /proc collector for sampling.
        duration_s: Upper bound, in seconds, on how long the runner waits
            before stopping the collector.
        data_root: Base directory; the episode is written to
            ``data_root / "episodes" / <episode_id>``.
        interval_ms: Sampling interval passed to the collector and recorded
            in ``meta.json``.
        image_name: VM image name recorded in ``meta.json``.
        snapshot_name: Snapshot name recorded in ``meta.json`` and in the
            ``snapshot_load`` event.
        episode_id: Optional explicit episode id; when falsy the runner
            generates a new ULID.
    """

    # Required settings.
    target_pid: int
    duration_s: float
    data_root: Path

    # Optional settings with v0 placeholder defaults.
    interval_ms: int = 100
    image_name: str = "(stub)"
    snapshot_name: str = "(none-stub)"
    episode_id: str | None = None
@dataclass
class EpisodeResult:
    """Summary returned by ``EpisodeRunner.run()``.

    Attributes:
        episode_id: Id of the episode that was written.
        episode_dir: Directory holding the episode's files.
        rows_proc: Row count reported by the /proc collector loop.
        pid_disappeared: True when the target pid was no longer alive at the
            end of the episode.
        duration_observed_s: Monotonic-clock seconds between the start of
            the run and the moment the collector was stopped and joined.
    """

    # Identity and location of the episode on disk.
    episode_id: str
    episode_dir: Path

    # What was observed while the episode ran.
    rows_proc: int
    pid_disappeared: bool
    duration_observed_s: float
class EpisodeRunner:
    """Drive one telemetry-collection episode and write its directory.

    The runner creates ``<data_root>/episodes/<episode_id>/`` and fills it
    with ``meta.json``, ``events.jsonl``, ``labels.jsonl``,
    ``telemetry-proc.jsonl`` (written by ``proc_qemu.run_loop``) and, last of
    all, ``done.marker``.
    """

    def __init__(self, cfg: EpisodeConfig) -> None:
        """Bind the runner to *cfg*; nothing touches the disk until ``run()``."""
        self.cfg = cfg
        # An explicit id from the config wins; otherwise mint a fresh ULID.
        self.episode_id = cfg.episode_id or new_ulid()
        self.episode_dir: Path = cfg.data_root / "episodes" / self.episode_id
        # Monotonic-clock origin for all t_mono_ns values; set in run().
        self._t_mono_origin_ns: int = 0
        # Set by stop() (or by run() itself) to end the collector loop.
        self._stop = threading.Event()

    # ---- public ---------------------------------------------------------

    def run(self) -> EpisodeResult:
        """Run the episode to completion and return a summary.

        Steps, in order: create the episode directory, write the initial
        ``meta.json``, emit the ``snapshot_load`` event and the ``clean``
        label at t=0, sample the target pid on a background thread until
        ``duration_s`` elapses or ``stop()`` is called, emit ``episode_end``,
        rewrite ``meta.json`` with the result, and finally touch
        ``done.marker``.

        Returns:
            An ``EpisodeResult`` describing the finished episode.
        """
        self.episode_dir.mkdir(parents=True, exist_ok=True)
        self._t_mono_origin_ns = time.monotonic_ns()
        started_at_wall = datetime.now(timezone.utc).isoformat()

        meta = self._initial_meta(started_at_wall)
        self._write_meta(meta)
        self._emit_event(0, "snapshot_load", snapshot=self.cfg.snapshot_name)
        self._emit_label(0, "clean", prev=None, reason="snapshot_loaded")

        # The collector thread reports its row count through this holder.
        rows_holder: dict[str, int] = {"rows": 0}

        def _collector() -> None:
            rows_holder["rows"] = proc_qemu.run_loop(
                pid=self.cfg.target_pid,
                output_path=self.episode_dir / "telemetry-proc.jsonl",
                t_mono_origin_ns=self._t_mono_origin_ns,
                interval_ms=self.cfg.interval_ms,
                stop_event=self._stop,
            )

        t = threading.Thread(target=_collector, daemon=True, name="proc_qemu")
        t.start()
        try:
            # Wait either for duration to elapse or for stop to be set.
            # NOTE: this wait is not cut short when the collector thread
            # returns on its own; only stop() or the timeout ends it.
            self._stop.wait(timeout=self.cfg.duration_s)
        finally:
            self._stop.set()
            t.join(timeout=2.0)
            if t.is_alive():
                # The join timed out: the row count below is still the
                # initial 0 and the telemetry file may still be growing.
                log.warning(
                    "episode %s: collector thread still running after join "
                    "timeout; row count and telemetry may be incomplete",
                    self.episode_id,
                )

        end_mono_ns = time.monotonic_ns() - self._t_mono_origin_ns
        duration_observed_s = end_mono_ns / 1_000_000_000
        rows = rows_holder["rows"]
        pid_alive = _pid_alive(self.cfg.target_pid)
        self._emit_event(
            end_mono_ns,
            "episode_end",
            target_pid_alive=pid_alive,
        )

        meta["ended_at_wall"] = datetime.now(timezone.utc).isoformat()
        meta["result"] = {
            "phases_observed": ["clean"],
            "rows_proc": rows,
            "pid_alive_at_end": pid_alive,
            "duration_observed_s": duration_observed_s,
        }
        self._write_meta(meta)
        # Written last so a consumer that waits for the marker only ever sees
        # a finalized meta.json.
        (self.episode_dir / "done.marker").touch()

        log.info(
            "episode %s complete: rows=%d duration=%.2fs",
            self.episode_id, rows, duration_observed_s,
        )
        return EpisodeResult(
            episode_id=self.episode_id,
            episode_dir=self.episode_dir,
            rows_proc=rows,
            pid_disappeared=not pid_alive,
            duration_observed_s=duration_observed_s,
        )

    def stop(self) -> None:
        """Ask a running episode to finish early by setting the stop event."""
        self._stop.set()

    # ---- internals ------------------------------------------------------

    def _initial_meta(self, started_at_wall: str) -> dict:
        """Build the ``meta.json`` payload written at the start of an episode.

        Fields that are unknown in v0 are present with ``None`` values;
        ``ended_at_wall`` and ``result`` are filled in by ``run()``.
        """
        return {
            "episode_id": self.episode_id,
            "schema_version": SCHEMA_VERSION,
            "started_at_wall": started_at_wall,
            "ended_at_wall": None,
            "host_fingerprint": {
                "kernel": os.uname().release,
                "qemu_version": None,
            },
            "vm": {
                "image_name": self.cfg.image_name,
                "image_sha256": None,
                "snapshot_name": self.cfg.snapshot_name,
                "vcpus": None,
                "ram_mib": None,
                "target_pid": self.cfg.target_pid,
            },
            "exploit": None,
            "sample": None,
            "schedule": {
                "baseline_seconds": self.cfg.duration_s,
                "infected_seconds": 0,
                "dormant_seconds": 0,
                "interval_ms": self.cfg.interval_ms,
            },
            "result": None,
        }

    def _write_meta(self, meta: dict) -> None:
        """Atomically (re)write ``meta.json`` with *meta*.

        The payload goes to ``meta.json.partial`` first and is flushed and
        fsynced before being renamed over ``meta.json``, so a reader never
        observes a partially written file.
        """
        path = self.episode_dir / "meta.json"
        tmp = path.with_suffix(".json.partial")
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2, sort_keys=True)
            f.write("\n")
            # Push the bytes to disk before the rename publishes the file.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)

    def _append_jsonl(self, filename: str, row: dict) -> None:
        """Append *row* as one sorted-key JSON line to *filename* in the episode dir."""
        with (self.episode_dir / filename).open("a", encoding="utf-8") as f:
            f.write(json.dumps(row, sort_keys=True) + "\n")

    def _emit_event(self, t_mono_ns: int, event: str, **extra) -> None:
        """Append an event row to ``events.jsonl``.

        Args:
            t_mono_ns: Monotonic offset of the event from the episode origin.
            event: Event name.
            **extra: Additional key/value pairs merged into the row.
        """
        row = {
            "t_mono_ns": t_mono_ns,
            "t_wall_ns": time.time_ns(),
            "event": event,
            **extra,
        }
        self._append_jsonl("events.jsonl", row)

    def _emit_label(self, t_mono_ns: int, phase: str, prev: str | None, reason: str) -> None:
        """Append a phase-label row to ``labels.jsonl``.

        Args:
            t_mono_ns: Monotonic offset at which the phase begins.
            phase: Name of the phase being entered.
            prev: Name of the previous phase, or ``None`` for the first label.
            reason: Short explanation for the transition.
        """
        row = {
            "t_mono_ns": t_mono_ns,
            "t_wall_ns": time.time_ns(),
            "phase": phase,
            "prev": prev,
            "reason": reason,
        }
        self._append_jsonl("labels.jsonl", row)
def _pid_alive(pid: int) -> bool:
    """Return True if a process with id *pid* currently exists.

    The probe uses signal 0, which runs the existence and permission checks
    without delivering a signal.

    Args:
        pid: Process id to probe. Values <= 0 are reported as not alive.

    Returns:
        True when the process exists (even if we may not signal it),
        False otherwise.
    """
    if pid <= 0:
        # kill() treats 0 and negative ids as process-group / broadcast
        # targets, so a successful probe would say nothing about one process.
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # PermissionError means the pid exists but we can't signal it.
        return True
    return True
def _last_exception() -> BaseException | None:
    """Return the exception currently being handled, or ``None`` if there is none."""
    import sys

    _exc_type, exc_value, _traceback = sys.exc_info()
    return exc_value