CIS490/tools/run_real_vm_demo.py
max d86502d950 workload audit trail: meta.sample + per-phase events + pre-kill probe
The elliott-lab episode showed a median of 20% CPU in every phase because
the in-guest workload silently never fired — and there was no signal
in events.jsonl to detect that from outside, so a trainer would
treat the labels as ground truth and learn "all phases look identical".
This commit closes the audit gap so the failure is visible in meta:

orchestrator/episode.py
  EpisodeConfig.sample: Sample | None — the manifest entry that
  drove this episode's workload selection. Stamped into meta.sample
  as {name, family, category, profile, kind, sha256} so trainers
  can join cleanly without re-deriving from events. None means the
  v1 yes-loop fallback path ran (and the trainer should treat the
  episode with appropriate skepticism).

tools/vm_load_controller.py
  VMLoadController gains an emit_event callable. Every phase now
  emits a workload_* event into the runner's events.jsonl:
    workload_setup        login + initial cleanup OK
    workload_killed       clean / dormant. Dormant carries a
                          `pre_kill_probe` dict from inside the
                          guest (`pgrep -c yes`, `pgrep -c sh`,
                          /proc/loadavg) so the trainer can detect
                          the elliott-lab failure mode where the
                          workload never actually ran.
    workload_armed        armed handshake fired
    workload_infecting    dd urandom / payload write fired
    workload_started      infected_running command sent
    workload_failed       any of the above raised inside SerialClient
                          (timeout, EOF, partial login). The runner
                          would have silently swallowed the
                          exception via its on_phase try/except;
                          the audit row makes the failure detectable.
  Exceptions in shell calls surface as workload_failed events but
  do NOT propagate, matching the runner's existing on_phase
  contract.

tools/run_real_vm_demo.py
  Wires the controller's emit_event to the runner's emit_event via
  a small forward-reference closure (controller is built before
  runner; runner.emit_event needs to be the sink). Sample also
  flows into EpisodeConfig.sample so meta.sample matches what the
  controller actually ran.

Tests: 119 (was 106). New cases:
  tests/test_vm_load_controller.py  (11 tests against a FakeSerial)
    - setup emits workload_setup
    - infected_running runs the v1 yes-loop AND emits workload_started
    - dormant probes BEFORE killing and stamps pre_kill_probe
    - dormant probe records "yes=0" (the elliott-lab fingerprint)
    - clean / armed / infecting all emit their respective events
    - serial.run() exception → workload_failed event, no propagation
    - sample-with-profile dispatches to exploits.workloads command
      (NOT the v1 yes-loop)
    - missing emit_event callback is a no-op (back-compat)
  tests/test_episode.py  (2 new)
    - meta.sample carries name/family/category/profile/kind/sha256
      when EpisodeConfig.sample is set
    - meta.sample stays null in the v1 fallback path

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 02:12:34 -05:00

236 lines
8.3 KiB
Python

"""Tier-2: real VM, real workload, labeled phases.
Boots the Alpine cidata VM, logs in over the serial console, drives the
guest through an XMRig-shaped phase schedule (clean → armed → infecting →
infected_running → dormant → re-entry), and lets the orchestrator's host
/proc collector sample the qemu-system pid throughout.
Compared to ``run_envelope_demo.py``: same phase schedule, same labels,
same telemetry shape — but the load is now generated by ``yes`` and
``dd`` running *inside* a real Alpine guest, not by a Python program on
the host. Tier-3 replaces the controller with an MSF-driven exploit
fire.
"""
from __future__ import annotations
import argparse
import logging
import os
import signal
import subprocess
import sys
import time
from pathlib import Path
# Allow running as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
sys.path.insert(0, str(Path(__file__).resolve().parent))
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
from samples.manifest import SampleManifest # noqa: E402
from vm_load_controller import VMLoadController # noqa: E402
from vm_serial import SerialClient # noqa: E402
# Phase schedule as (phase_name, duration_in_seconds) pairs, in run order.
# The durations sum to 85 s; main() uses that sum as the episode duration.
# Same shape as run_envelope_demo so plots are comparable.
DEFAULT_SCHEDULE = [
    ("clean", 10.0),
    ("armed", 2.0),
    ("infecting", 3.0),
    ("infected_running", 25.0),
    ("dormant", 15.0),
    # Second infected_running/dormant pair is the "re-entry" leg.
    ("infected_running", 20.0),
    ("dormant", 5.0),
    ("clean", 5.0),
]
def _wait_for_socket(path: Path, timeout_s: float) -> None:
    """Block until the UNIX stream socket at *path* accepts a connection.

    The path is polled about every 0.2 s. A socket file that exists but
    refuses connections (for example one left behind by a dead VM) counts
    as "not ready yet", so the loop keeps polling until the deadline.

    Args:
        path: Filesystem path of the UNIX domain socket to probe.
        timeout_s: Maximum time to keep polling, in seconds.

    Raises:
        TimeoutError: No connection attempt succeeded before the deadline.
    """
    import socket as _sk

    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if path.exists():
            try:
                # The context manager closes the probe socket on every
                # path. Previously a failed connect() skipped close(), so
                # each poll against a stale socket file leaked a descriptor.
                with _sk.socket(_sk.AF_UNIX, _sk.SOCK_STREAM) as probe:
                    probe.settimeout(0.5)
                    probe.connect(str(path))
                return
            except OSError:
                # Not live (yet): fall through to the sleep and retry.
                pass
        time.sleep(0.2)
    raise TimeoutError(f"socket {path} never came alive within {timeout_s}s")
def _read_qemu_pid(pid_file: Path, timeout_s: float) -> int:
    """Poll *pid_file* until it is non-empty and return its content as an int.

    Raises:
        TimeoutError: The file stayed missing or empty for *timeout_s* seconds.
        ValueError: The file content is not an integer.
    """
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if pid_file.exists():
            text = pid_file.read_text().strip()
            if text:
                return int(text)
        time.sleep(0.2)
    raise TimeoutError(
        f"pid file {pid_file} was not written within {timeout_s}s"
    )


def _terminate_process_group(proc: subprocess.Popen, log: logging.Logger) -> None:
    """SIGTERM *proc*'s process group, escalating to SIGKILL after 5 s.

    A group that has already disappeared is not an error at either step,
    so cleanup never masks the outcome of the episode itself.
    """
    log.info("shutting down VM (pid=%d)", proc.pid)
    try:
        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
    except ProcessLookupError:
        pass
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        try:
            os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
        except ProcessLookupError:
            # The group exited between the timeout and the kill.
            pass


def main() -> int:
    """Boot the demo VM, run one labeled episode against it, and clean up.

    Steps: parse CLI arguments, optionally resolve a sample from the
    manifest, recreate the run directory, start ``vm/launch_demo.sh``,
    wait for the serial socket and the qemu pid file, log in over the
    serial console, drive the guest through ``DEFAULT_SCHEDULE`` via
    ``VMLoadController`` while ``EpisodeRunner`` samples the qemu pid,
    then print a summary. Unless ``--keep-vm`` is given, the launcher's
    process group is terminated on the way out, on success or failure.

    Returns:
        0 on success; 2 when ``--sample`` names a sample that is not in
        the manifest.

    Raises:
        TimeoutError: The serial socket or the pid file never appeared.
    """
    parser = argparse.ArgumentParser(prog="run_real_vm_demo")
    parser.add_argument("--data-root", default="data")
    parser.add_argument("--interval-ms", type=int, default=100)
    parser.add_argument(
        "--run-dir",
        # Per-slot defaults so the fleet runner's parallel calls don't
        # collide on the same /tmp dir (which would have rmtree'd each
        # other's pidfiles mid-boot — see CIS490 history). Resolution
        # order:
        #   1) explicit --run-dir CLI flag
        #   2) RUN_DIR env (set by the fleet runner)
        #   3) /tmp/cis490-vm-<SLOT>  (SLOT defaults to 0)
        default=(
            os.environ.get("RUN_DIR")
            or f"/tmp/cis490-vm-{os.environ.get('SLOT', '0')}"
        ),
        help="QEMU run dir (sockets + pidfile go here)",
    )
    parser.add_argument(
        "--keep-vm",
        action="store_true",
        help="leave the VM running after the episode finishes",
    )
    parser.add_argument(
        "--boot-timeout",
        type=float,
        default=120.0,
        help="how long to wait for serial login prompt",
    )
    parser.add_argument(
        "--sample",
        default=os.environ.get("SAMPLE_NAME"),
        help="Pick a workload profile from the manifest by name. Fleet runner "
        "passes this via SAMPLE_NAME env. If unset, runs the v1 yes-loop.",
    )
    parser.add_argument(
        "--manifest",
        default=str(Path(__file__).resolve().parent.parent / "samples" / "manifest.toml"),
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    log = logging.getLogger("cis490.run_real_vm_demo")

    repo_root = Path(__file__).resolve().parent.parent
    launcher = repo_root / "vm" / "launch_demo.sh"

    # Resolve sample if requested.
    sample = None
    if args.sample:
        manifest = SampleManifest.load(args.manifest)
        sample = next((s for s in manifest.samples if s.name == args.sample), None)
        if sample is None:
            log.error("sample %r not in manifest %s", args.sample, args.manifest)
            return 2
        log.info("using sample=%s profile=%s kind=%s",
                 sample.name, sample.profile, sample.kind)

    run_dir = Path(args.run_dir)
    # Wipe any stale sockets/pidfile from a previous run.
    if run_dir.exists():
        import shutil
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)

    serial_sock = run_dir / "serial.sock"
    pid_file = run_dir / "qemu.pid"

    log.info("booting VM via %s (RUN_DIR=%s)", launcher, run_dir)
    env = os.environ.copy()
    env["RUN_DIR"] = str(run_dir)
    # start_new_session puts the launcher in its own process group so the
    # whole group can be signalled during cleanup.
    qemu = subprocess.Popen(
        [str(launcher)],
        cwd=str(repo_root),
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )

    try:
        _wait_for_socket(serial_sock, timeout_s=15.0)

        # Wait for the pid file to be non-empty. A timeout now raises a
        # clear TimeoutError instead of falling through to a cryptic
        # FileNotFoundError / ValueError from reading the file.
        qemu_pid = _read_qemu_pid(pid_file, timeout_s=15.0)
        log.info("qemu pid = %d", qemu_pid)

        # Cloud-init's runcmd (password setup, sshd hardening) needs some
        # time after boot. Wait long enough that the credentials we'll
        # send are actually valid.
        log.info("waiting 35s for cloud-init runcmd to settle...")
        time.sleep(35.0)

        log.info("connecting serial + logging in (boot timeout %.0fs)",
                 args.boot_timeout)
        serial = SerialClient(str(serial_sock))
        serial.connect()
        try:
            serial.login(boot_timeout_s=args.boot_timeout)

            # Bind the controller to the runner's event log so workload
            # success/failure shows up alongside phase_transition events.
            # Sample also goes into EpisodeConfig below so meta.sample
            # records what was supposed to run.
            runner_for_emit = {"runner": None}

            def _forward_event(ev, **kw):
                """Relay a controller event to the runner once it exists."""
                bound = runner_for_emit["runner"]
                if bound is None:
                    # The controller is built (and set up) before the
                    # runner, so anything emitted this early has no sink.
                    # Log it rather than drop it silently, so the gap in
                    # the audit trail is visible to the operator.
                    log.warning(
                        "workload event %r emitted before the runner was "
                        "bound; it was not forwarded",
                        ev,
                    )
                    return None
                return bound.emit_event(ev, **kw)

            controller = VMLoadController(
                serial,
                sample=sample,
                emit_event=_forward_event,
            )
            controller.setup()

            qmp_sock = run_dir / "qmp.sock"
            agent_sock = run_dir / "agent.sock"
            cfg = EpisodeConfig(
                target_pid=qemu_pid,
                duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
                interval_ms=args.interval_ms,
                data_root=Path(args.data_root),
                phase_schedule=DEFAULT_SCHEDULE,
                image_name="alpine-3.21-cloudinit",
                snapshot_name="baseline-v1",
                qmp_socket=qmp_sock if qmp_sock.exists() else None,
                guest_agent_socket=agent_sock if agent_sock.exists() else None,
                bridge_iface=os.environ.get("BRIDGE") or None,
                sample=sample,
            )
            runner = EpisodeRunner(cfg, on_phase=controller.set_phase)
            # Connect the controller's event sink to the runner now that
            # both exist. (Forward-reference closure pattern keeps the
            # constructor argument order natural.)
            runner_for_emit["runner"] = runner

            result = runner.run()
            controller.teardown()
        finally:
            # Close the serial connection on failure as well as success.
            serial.close()

        print()
        print(f"episode_id = {result.episode_id}")
        print(f"path = {result.episode_dir}")
        print(f"rows_proc = {result.rows_proc}")
        print(f"phases = {result.phases_observed}")
        print()
        print("To plot:")
        print(f" uv run python tools/plot_envelope.py {result.episode_dir}")
        return 0
    finally:
        if not args.keep_vm:
            _terminate_process_group(qemu, log)
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())