This is the change that makes "real data" actually flow on multiple
hosts in parallel. The end-to-end pipeline was already up as of commits
613c6fa / 2579683; this commit gives the lab-host side the sample
diversity and concurrency it needs.
Collectors landed:
collectors/qmp.py — source 2 (oracle). Tiny synchronous QMP
client + row builder + run loop. Tolerates
older qemu without query-stats.
collectors/guest_agent.py — source 5 (deployable). Reads the
virtio-serial host-side socket, parses
agent JSON-lines, re-stamps to the host
monotonic clock, persists.
collectors/pcap.py — source 4 (deployable). tcpdump capture
+ pure-Python pcap reader + 100 ms
netflow.jsonl bucketizer. Decodes
Ethernet/IPv4/TCP/UDP enough for the
schema in docs/data-model.md.
In-guest agent:
vm/guest-agent/cis490_agent.py — stdlib-only Python agent. Reads
/proc/{stat,meminfo,loadavg,net/dev,net/tcp*}, top-N RSS procs,
thermal. Writes JSON-lines to /dev/virtio-ports/cis490.guest.agent.
tools/build_cidata.py — embeds the agent + an OpenRC service into
user-data so first boot of the Alpine cidata image auto-starts it.
Launchers:
vm/launch_demo.sh / launch_target.sh — second virtio-serial port for
the agent socket; SLOT env support so multiple VMs run without
socket / port collisions; PORT_BASE on launch_target so multiple
target VMs hostfwd different host ports.
vm/setup_bridge.sh — creates host-only br-malware (10.200.0.1/24,
no NAT). Idempotent.
Fleet:
orchestrator/fleet.py — capacity detector (cores / RAM / load
headroom) + concurrent-slot runner. Per-slot ENV selects the
sample. FleetCapacity dataclass round-trips into meta.json so
"this episode ran with 6 concurrent VMs" is auditable post-hoc.
tools/run_fleet.py — CLI: --capacity report; --waves N runs N
waves of (max_concurrent) episodes each, every slot with a
different sample.
etc/cis490-orchestrator.service — now drives the fleet runner with
Restart=always so each invocation runs one wave and respawns,
giving a continuous stream.
Samples:
samples/manifest.toml — six profiles spanning the five major
behaviour shapes. Each entry is real OR mimic (sha256 distinguishes).
samples/manifest.py — strict TOML loader (rejects dups, unknown
categories) + deterministic select(host_id, slot, episode_index)
so different hosts on the network walk the catalog in different
orders without any coordinator.
EpisodeRunner:
orchestrator/episode.py — optional qmp_socket + guest_agent_socket
fields on EpisodeConfig; when set, additional collector threads
run alongside proc_qemu. EpisodeResult now carries rows_qmp +
rows_guest counters.
Tier-3 setup automation:
scripts/install-msfrpcd.sh — installs metasploit-framework where
the package manager has it, generates a strong password into
/etc/cis490/msfrpc.env, drops a hardened systemd unit bound to
127.0.0.1:55553. After this, run_tier3_demo.py works zero-touch
once MSFRPC_PASSWORD is sourced.
scripts/fetch-metasploitable2.sh — accepts IMAGE_URL + IMAGE_SHA256
from the operator (Rapid7 download is registration-walled), pulls,
verifies, converts vmdk → qcow2, lands at vm/images/.
Tests: 82 pass (was 51). New suites:
tests/test_qmp.py — fake QMP server, capability handshake,
blockstats, async-event interleaving,
5-failure backoff
tests/test_guest_agent.py — fake virtio socket, JSON-lines read +
re-stamp, malformed-line tolerance
tests/test_pcap.py — synthetic pcap with TCP/UDP/ARP frames,
bucketize correctness across windows
tests/test_fleet.py — capacity math (8-core idle / low-RAM /
high-load / Pi5 / 1-core box), manifest
selection determinism + diversity
What's queued for the next commit (already discussed in convo):
- MSFExploitDriver v2: map sample.profile → distinct in-session
workload so Tier-3 episodes don't all produce the same yes-loop
envelope. Critical for ML to learn varied malware shapes.
- Real-sample fetch from MalwareBazaar by sha256.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
298 lines
10 KiB
Python
298 lines
10 KiB
Python
"""EpisodeRunner — single-episode driver.
|
|
|
|
Writes the full per-episode directory shape from ``docs/data-model.md``:
|
|
|
|
data/episodes/<ulid>/
|
|
meta.json
|
|
events.jsonl
|
|
labels.jsonl
|
|
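telemetry-proc.jsonl
telemetry-qmp.jsonl      (only when ``qmp_socket`` is configured)
telemetry-guest.jsonl    (only when ``guest_agent_socket`` is configured)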
|
|
done.marker
|
|
|
|
Two modes:
|
|
|
|
1. **Single-phase** (no ``phase_schedule`` set) — labels the whole window
|
|
``clean``. Useful for v0 sanity tests against any pid.
|
|
|
|
2. **Scheduled** (``phase_schedule`` set on the config) — walks a list of
|
|
``(phase, duration_s)`` tuples, emitting one label per transition. An
|
|
optional ``on_phase`` callback fires at each transition, used to drive an
|
|
external load mimic (or, later, the real exploit/sample driver).
|
|
|
|
Real VM bring-up will arrive as a third mode that boots a guest, fires an
|
|
exploit, and adapts the schedule based on observed events
|
|
(``session_open``, ``sample_executed``, ...).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import threading
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Callable
|
|
|
|
from collectors import guest_agent, proc_qemu, qmp
|
|
|
|
from .ulid import new_ulid
|
|
|
|
|
|
# Module logger, registered under the ``cis490.orchestrator`` name.
log = logging.getLogger("cis490.orchestrator")

# Value written to the ``schema_version`` key of every episode's meta.json.
SCHEMA_VERSION = 1

# Ordered ``(phase, duration_s)`` pairs walked by a scheduled episode.
PhaseSchedule = list[tuple[str, float]]
# Callback invoked with the phase name at each phase transition.
OnPhase = Callable[[str], None]
|
|
|
|
|
|
@dataclass
class EpisodeConfig:
    """Settings for a single episode run by ``EpisodeRunner``.

    The first three fields are required; everything else has a default.
    """

    # Pid sampled by the proc collector and probed for liveness at the end.
    target_pid: int
    # Length of the single-phase wait; also recorded in meta.json as
    # ``schedule.baseline_seconds``.
    duration_s: float
    # Episodes are written under ``<data_root>/episodes/<episode_id>/``.
    data_root: Path

    # Sampling interval handed to the proc collector.
    interval_ms: int = 100
    # Recorded verbatim in the ``vm`` section of meta.json.
    image_name: str = "(stub)"
    # Recorded in meta.json and in the ``snapshot_load`` event.
    snapshot_name: str = "(none-stub)"
    # Explicit episode id; when left as None the runner generates a ULID.
    episode_id: str | None = None

    # When set, walk this schedule and ignore duration_s for sleep timing.
    # ``duration_s`` still goes in meta.schedule for record-keeping.
    phase_schedule: PhaseSchedule | None = None

    # Optional: paths to QEMU sockets exposed by the launcher. When
    # set, EpisodeRunner spins up additional collector threads.
    qmp_socket: Path | None = None
    qmp_interval_ms: int = 1000  # QMP queries are heavier than /proc reads
    guest_agent_socket: Path | None = None
|
|
|
|
|
|
@dataclass
class EpisodeResult:
    """Summary returned by ``EpisodeRunner.run()`` for one finished episode."""

    # Identifier and on-disk location of the episode.
    episode_id: str
    episode_dir: Path

    # Row counts reported by each collector's run loop (0 when the
    # collector was not enabled or did not report).
    rows_proc: int
    rows_qmp: int = 0
    rows_guest: int = 0

    # True when the target pid no longer existed at the end of the run.
    pid_disappeared: bool = False
    # Monotonic time elapsed between the start of run() and episode end.
    duration_observed_s: float = 0.0
    # Phase names in the order they were entered.
    phases_observed: list[str] = field(default_factory=list)
|
|
|
|
|
|
class EpisodeRunner:
    """Drive one episode: run the collectors, walk the phases, write the files.

    Everything is written under ``<data_root>/episodes/<episode_id>/``:
    ``meta.json``, ``events.jsonl``, ``labels.jsonl``, one telemetry file per
    active collector, and ``done.marker`` once ``run()`` has finished.
    """

    # Seconds run() waits for each collector thread after signalling stop.
    _JOIN_TIMEOUT_S = 3.0

    def __init__(
        self,
        cfg: EpisodeConfig,
        on_phase: OnPhase | None = None,
    ) -> None:
        """Bind the config, pick the episode id and create the episode dir.

        Args:
            cfg: Episode settings.
            on_phase: Optional callback fired with the phase name at each
                scheduled phase transition.
        """
        self.cfg = cfg
        self.on_phase = on_phase
        self.episode_id = cfg.episode_id or new_ulid()
        self.episode_dir: Path = cfg.data_root / "episodes" / self.episode_id
        # Create the dir up front so external drivers can call
        # emit_event() between construction and run() — e.g. an exploit
        # driver that writes a driver_setup event before the schedule
        # walks. The dir is otherwise empty until run() opens files.
        self.episode_dir.mkdir(parents=True, exist_ok=True)
        # 0 means "run() has not started yet"; emit_event() relies on that.
        self._t_mono_origin_ns: int = 0
        self._stop = threading.Event()

    # ---- public ---------------------------------------------------------

    def run(self) -> EpisodeResult:
        """Execute the episode and return its summary.

        Writes the initial ``meta.json``, starts the collector threads, then
        either walks ``cfg.phase_schedule`` or labels the whole window
        ``clean`` and waits ``cfg.duration_s``. Collectors are always
        signalled to stop and joined, even if the phase walk raises. After
        that the ``episode_end`` event, the final ``meta.json`` and
        ``done.marker`` are written.
        """
        self._t_mono_origin_ns = time.monotonic_ns()
        started_at_wall = datetime.now(timezone.utc).isoformat()

        meta = self._initial_meta(started_at_wall)
        self._write_meta(meta)

        self.emit_event("snapshot_load", snapshot=self.cfg.snapshot_name)

        # Each collector thread stores its final row count under its own key.
        rows: dict[str, int] = {"proc": 0, "qmp": 0, "guest": 0}
        threads = self._collector_threads(rows)
        for t in threads:
            t.start()

        phases_observed: list[str] = []
        try:
            if self.cfg.phase_schedule:
                phases_observed = self._walk_schedule()
            else:
                self._emit_label(0, "clean", prev=None, reason="snapshot_loaded")
                phases_observed = ["clean"]
                self._stop.wait(timeout=self.cfg.duration_s)
        finally:
            self._stop.set()
            for t in threads:
                t.join(timeout=self._JOIN_TIMEOUT_S)
                if t.is_alive():
                    # The thread never stored its count, so the recorded
                    # figure for this collector cannot be trusted; say so
                    # instead of silently reporting it.
                    log.warning(
                        "collector thread %s still running %.1fs after stop; "
                        "its row count is not final",
                        t.name,
                        self._JOIN_TIMEOUT_S,
                    )

        pid_alive = _pid_alive(self.cfg.target_pid)
        self.emit_event("episode_end", target_pid_alive=pid_alive)
        end_mono_ns = time.monotonic_ns() - self._t_mono_origin_ns
        duration_s = end_mono_ns / 1_000_000_000

        meta["ended_at_wall"] = datetime.now(timezone.utc).isoformat()
        meta["result"] = {
            "phases_observed": phases_observed,
            "rows_proc": rows["proc"],
            "rows_qmp": rows["qmp"],
            "rows_guest": rows["guest"],
            "pid_alive_at_end": pid_alive,
            "duration_observed_s": duration_s,
        }
        self._write_meta(meta)
        (self.episode_dir / "done.marker").touch()

        log.info(
            "episode %s complete: proc=%d qmp=%d guest=%d duration=%.2fs phases=%s",
            self.episode_id,
            rows["proc"], rows["qmp"], rows["guest"],
            duration_s,
            phases_observed,
        )
        return EpisodeResult(
            episode_id=self.episode_id,
            episode_dir=self.episode_dir,
            rows_proc=rows["proc"],
            rows_qmp=rows["qmp"],
            rows_guest=rows["guest"],
            pid_disappeared=not pid_alive,
            duration_observed_s=duration_s,
            phases_observed=phases_observed,
        )

    def stop(self) -> None:
        """Signal the phase walk and all collectors to stop."""
        self._stop.set()

    # ---- internals ------------------------------------------------------

    def _collector_threads(self, rows: dict[str, int]) -> list[threading.Thread]:
        """Build (but do not start) one daemon thread per enabled collector.

        The proc collector always runs; the QMP and guest-agent collectors
        are added only when their socket path is configured.
        """
        cfg = self.cfg
        origin_ns = self._t_mono_origin_ns

        def _proc() -> int:
            return proc_qemu.run_loop(
                pid=cfg.target_pid,
                output_path=self.episode_dir / "telemetry-proc.jsonl",
                t_mono_origin_ns=origin_ns,
                interval_ms=cfg.interval_ms,
                stop_event=self._stop,
            )

        def _qmp() -> int:
            return qmp.run_loop(
                socket_path=cfg.qmp_socket,
                output_path=self.episode_dir / "telemetry-qmp.jsonl",
                t_mono_origin_ns=origin_ns,
                interval_ms=cfg.qmp_interval_ms,
                stop_event=self._stop,
            )

        def _guest() -> int:
            return guest_agent.run_loop(
                socket_path=cfg.guest_agent_socket,
                output_path=self.episode_dir / "telemetry-guest.jsonl",
                t_mono_origin_ns=origin_ns,
                stop_event=self._stop,
            )

        # (rows key, thread name, run loop)
        jobs = [("proc", "proc_qemu", _proc)]
        if cfg.qmp_socket is not None:
            jobs.append(("qmp", "qmp", _qmp))
        if cfg.guest_agent_socket is not None:
            jobs.append(("guest", "guest_agent", _guest))

        return [
            threading.Thread(
                target=self._guarded(loop, rows, key), daemon=True, name=name
            )
            for key, name, loop in jobs
        ]

    @staticmethod
    def _guarded(
        loop: Callable[[], int], rows: dict[str, int], key: str
    ) -> Callable[[], None]:
        """Wrap *loop* so its return value lands in ``rows[key]``.

        A crash inside the collector is reported through the module logger
        rather than escaping the thread; ``rows[key]`` is then left as is.
        """

        def _target() -> None:
            try:
                rows[key] = loop()
            except Exception:
                log.exception("%s collector crashed; row count not recorded", key)

        return _target

    def _walk_schedule(self) -> list[str]:
        """Step through ``cfg.phase_schedule`` and return the phases entered.

        Each phase gets a label row and a ``phase_transition`` event, then
        the optional ``on_phase`` callback, then a wait of that phase's
        duration. The walk ends early once the stop event is set.
        """
        observed: list[str] = []
        prev: str | None = None
        for phase, dur in self.cfg.phase_schedule or []:
            if self._stop.is_set():
                break
            t_mono = time.monotonic_ns() - self._t_mono_origin_ns
            self._emit_label(t_mono, phase, prev=prev, reason="scheduled")
            self.emit_event("phase_transition", to=phase, prev=prev)
            if self.on_phase is not None:
                try:
                    self.on_phase(phase)
                except Exception:
                    # A failing driver callback must not abort the episode.
                    log.exception("on_phase callback raised; continuing")
            observed.append(phase)
            prev = phase
            self._stop.wait(timeout=dur)
        return observed

    def _initial_meta(self, started_at_wall: str) -> dict:
        """Return the meta.json skeleton; fields not known yet are None."""
        return {
            "episode_id": self.episode_id,
            "schema_version": SCHEMA_VERSION,
            "started_at_wall": started_at_wall,
            "ended_at_wall": None,
            "host_fingerprint": {
                "kernel": os.uname().release,
                "qemu_version": None,
            },
            "vm": {
                "image_name": self.cfg.image_name,
                "image_sha256": None,
                "snapshot_name": self.cfg.snapshot_name,
                "vcpus": None,
                "ram_mib": None,
                "target_pid": self.cfg.target_pid,
            },
            "exploit": None,
            "sample": None,
            "schedule": {
                "baseline_seconds": self.cfg.duration_s,
                "interval_ms": self.cfg.interval_ms,
                "phase_schedule": self.cfg.phase_schedule,
            },
            "result": None,
        }

    def _write_meta(self, meta: dict) -> None:
        """Write *meta* to meta.json via a ``.partial`` file and a rename."""
        path = self.episode_dir / "meta.json"
        tmp = path.with_suffix(".json.partial")
        with tmp.open("w") as f:
            json.dump(meta, f, indent=2, sort_keys=True)
            f.write("\n")
        # Readers see either the old file or the complete new one.
        os.replace(tmp, path)

    def _append_jsonl(self, filename: str, row: dict) -> None:
        """Append *row* as one sorted-key JSON line to ``episode_dir/filename``."""
        with (self.episode_dir / filename).open("a") as f:
            f.write(json.dumps(row, sort_keys=True) + "\n")

    def emit_event(self, event: str, **extra) -> None:
        """Append a row to events.jsonl. Public so external drivers
        (e.g. the MSF exploit driver) can stamp their own events with
        the same monotonic clock the orchestrator is using.

        Before ``run()`` has set the clock origin, ``t_mono_ns`` is 0.
        """
        origin_ns = self._t_mono_origin_ns
        t_mono_ns = time.monotonic_ns() - origin_ns if origin_ns else 0
        self._append_jsonl(
            "events.jsonl",
            {
                "t_mono_ns": t_mono_ns,
                "t_wall_ns": time.time_ns(),
                "event": event,
                **extra,
            },
        )

    def _emit_label(
        self, t_mono_ns: int, phase: str, prev: str | None, reason: str
    ) -> None:
        """Append one phase-label row to labels.jsonl."""
        self._append_jsonl(
            "labels.jsonl",
            {
                "t_mono_ns": t_mono_ns,
                "t_wall_ns": time.time_ns(),
                "phase": phase,
                "prev": prev,
                "reason": reason,
            },
        )
|
|
|
|
|
|
def _pid_alive(pid: int) -> bool:
    """Return whether a process with id *pid* currently exists.

    Probes with ``os.kill(pid, 0)``: signal 0 runs the existence and
    permission checks without delivering anything to the target.
    """
    # pid 0 and negative pids address a process group (or every process)
    # rather than one process, so the probe would succeed without saying
    # anything about a specific target. They are never a valid target pid.
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # Pid exists but we can't signal it.
        return True
    except OverflowError:
        # Too large to be a pid at all, so nothing can be running under it.
        return False
    return True
|