The elliott-lab episode showed the same ~20% median CPU in every phase
because the in-guest workload silently never fired — and there was no
signal in events.jsonl to detect that from outside, so a trainer would
treat the labels as ground truth and learn "all phases look identical".
This commit closes the audit gap so the failure is visible in meta and in events.jsonl:
orchestrator/episode.py
EpisodeConfig.sample: Sample | None — the manifest entry that
drove this episode's workload selection. Stamped into meta.sample
as {name, family, category, profile, kind, sha256} so trainers
can join cleanly without re-deriving from events. None means the
v1 yes-loop fallback path ran (and the trainer should treat the
episode with appropriate skepticism).
tools/vm_load_controller.py
VMLoadController gains an emit_event callable. Every phase now
emits a workload_* event into the runner's events.jsonl:
workload_setup login + initial cleanup OK
workload_killed clean / dormant. Dormant carries a
`pre_kill_probe` dict from inside the
guest (`pgrep -c yes`, `pgrep -c sh`,
/proc/loadavg) so the trainer can detect
the elliott-lab failure mode where the
workload never actually ran.
workload_armed armed handshake fired
workload_infecting dd urandom / payload write fired
workload_started infected_running command sent
workload_failed any of the above raised inside SerialClient
(timeout, EOF, partial login). The runner
would have silently swallowed the
exception via its on_phase try/except;
the audit row makes the failure detectable.
Exceptions in shell calls surface as workload_failed events but
do NOT propagate, matching the runner's existing on_phase
contract.
tools/run_real_vm_demo.py
Wires the controller's emit_event to the runner's emit_event via
a small forward-reference closure (controller is built before
runner; runner.emit_event needs to be the sink). Sample also
flows into EpisodeConfig.sample so meta.sample matches what the
controller actually ran.
Tests: 119 (was 106). New cases:
tests/test_vm_load_controller.py (11 tests against a FakeSerial)
- setup emits workload_setup
- infected_running runs the v1 yes-loop AND emits workload_started
- dormant probes BEFORE killing and stamps pre_kill_probe
- dormant probe records "yes=0" (the elliott-lab fingerprint)
- clean / armed / infecting all emit their respective events
- serial.run() exception → workload_failed event, no propagation
- sample-with-profile dispatches to exploits.workloads command
(NOT the v1 yes-loop)
- missing emit_event callback is a no-op (back-compat)
tests/test_episode.py (2 new)
- meta.sample carries name/family/category/profile/kind/sha256
when EpisodeConfig.sample is set
- meta.sample stays null in the v1 fallback path
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
236 lines
8.3 KiB
Python
236 lines
8.3 KiB
Python
"""Tier-2: real VM, real workload, labeled phases.
|
|
|
|
Boots the Alpine cidata VM, logs in over the serial console, drives the
|
|
guest through an XMRig-shaped phase schedule (clean → armed → infecting →
|
|
infected_running → dormant → re-entry), and lets the orchestrator's host
|
|
/proc collector sample the qemu-system pid throughout.
|
|
|
|
Compared to ``run_envelope_demo.py``: same phase schedule, same labels,
|
|
same telemetry shape — but the load is now generated by ``yes`` and
|
|
``dd`` running *inside* a real Alpine guest, not by a Python program on
|
|
the host. Tier-3 replaces the controller with an MSF-driven exploit
|
|
fire.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Allow running as a script.
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
|
|
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
|
|
from samples.manifest import SampleManifest # noqa: E402
|
|
from vm_load_controller import VMLoadController # noqa: E402
|
|
from vm_serial import SerialClient # noqa: E402
|
|
|
|
|
|
# Phase timeline for one episode, as (phase label, duration in seconds)
# pairs in execution order. Deliberately the same shape as
# run_envelope_demo so plots from the two demos are comparable.
DEFAULT_SCHEDULE: list[tuple[str, float]] = [
    ("clean", 10.0),
    ("armed", 2.0),
    ("infecting", 3.0),
    ("infected_running", 25.0),
    ("dormant", 15.0),
    # Second active burst after the first dormant window.
    ("infected_running", 20.0),
    ("dormant", 5.0),
    ("clean", 5.0),
]
|
|
|
|
|
|
def _wait_for_socket(path: Path, timeout_s: float) -> None:
    """Block until a Unix stream socket at *path* accepts a connection.

    The path merely existing is not enough: a socket file left behind by a
    dead VM is still on disk, so every attempt performs a real ``connect()``.

    Args:
        path: Filesystem path of the Unix domain socket to probe.
        timeout_s: Maximum time to keep polling, in seconds.

    Raises:
        TimeoutError: No connection succeeded before the deadline.
    """
    import socket as _sk

    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if path.exists():
            try:
                # The context manager closes the probe socket on every
                # path, including a failed connect(); without it each
                # unsuccessful poll would leak one file descriptor.
                with _sk.socket(_sk.AF_UNIX, _sk.SOCK_STREAM) as probe:
                    probe.settimeout(0.5)
                    probe.connect(str(path))
                return
            except OSError:
                # Not live (yet): stale file, refused, or timed out.
                # Deliberately best-effort; keep polling.
                pass
        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        time.sleep(max(0.0, min(0.2, remaining)))
    raise TimeoutError(f"socket {path} never came alive within {timeout_s}s")
|
|
|
|
|
|
def main() -> int:
    """Boot the demo VM, run one labeled episode against it, then clean up.

    Steps, in order: parse CLI arguments, optionally resolve ``--sample``
    against the manifest, recreate the run directory, start
    ``vm/launch_demo.sh``, wait for the serial socket and the QEMU pid
    file, log in over the serial console, run an ``EpisodeRunner`` over
    ``DEFAULT_SCHEDULE`` with ``VMLoadController.set_phase`` as the phase
    callback, and print a short summary.

    Environment variables read: ``RUN_DIR`` and ``SLOT`` (default run
    directory), ``SAMPLE_NAME`` (default for ``--sample``), and ``BRIDGE``
    (passed to the episode config as ``bridge_iface``).

    Returns:
        0 on success; 2 when ``--sample`` names an entry that is not in the
        manifest.

    Raises:
        TimeoutError: The serial socket or the QEMU pid file did not become
            available within 15 seconds.
    """
    parser = argparse.ArgumentParser(prog="run_real_vm_demo")
    parser.add_argument("--data-root", default="data")
    parser.add_argument("--interval-ms", type=int, default=100)
    parser.add_argument(
        "--run-dir",
        # Per-slot defaults so the fleet runner's parallel calls don't
        # collide on the same /tmp dir (which would have rmtree'd each
        # other's pidfiles mid-boot — see CIS490 history). Resolution
        # order:
        #   1) explicit --run-dir CLI flag
        #   2) RUN_DIR env (set by the fleet runner)
        #   3) /tmp/cis490-vm-<SLOT>  (SLOT defaults to 0)
        default=(
            os.environ.get("RUN_DIR")
            or f"/tmp/cis490-vm-{os.environ.get('SLOT', '0')}"
        ),
        help="QEMU run dir (sockets + pidfile go here)",
    )
    parser.add_argument(
        "--keep-vm",
        action="store_true",
        help="leave the VM running after the episode finishes",
    )
    parser.add_argument(
        "--boot-timeout",
        type=float,
        default=120.0,
        help="how long to wait for serial login prompt",
    )
    parser.add_argument(
        "--sample",
        default=os.environ.get("SAMPLE_NAME"),
        help="Pick a workload profile from the manifest by name. Fleet runner "
        "passes this via SAMPLE_NAME env. If unset, runs the v1 yes-loop.",
    )
    parser.add_argument(
        "--manifest",
        default=str(Path(__file__).resolve().parent.parent / "samples" / "manifest.toml"),
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    log = logging.getLogger("cis490.run_real_vm_demo")

    repo_root = Path(__file__).resolve().parent.parent
    launcher = repo_root / "vm" / "launch_demo.sh"

    # Resolve sample if requested. Done before the VM is started so an
    # unknown name exits with code 2 without booting anything.
    sample = None
    if args.sample:
        manifest = SampleManifest.load(args.manifest)
        sample = next((s for s in manifest.samples if s.name == args.sample), None)
        if sample is None:
            log.error("sample %r not in manifest %s", args.sample, args.manifest)
            return 2
        log.info("using sample=%s profile=%s kind=%s",
                 sample.name, sample.profile, sample.kind)

    run_dir = Path(args.run_dir)
    # Wipe any stale sockets/pidfile from a previous run.
    if run_dir.exists():
        import shutil

        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)
    serial_sock = run_dir / "serial.sock"
    pid_file = run_dir / "qemu.pid"

    log.info("booting VM via %s (RUN_DIR=%s)", launcher, run_dir)
    env = os.environ.copy()
    env["RUN_DIR"] = str(run_dir)
    qemu = subprocess.Popen(
        [str(launcher)],
        cwd=str(repo_root),
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # Own session => own process group, so the shutdown path can
        # signal the whole group with killpg().
        start_new_session=True,
    )

    try:
        _wait_for_socket(serial_sock, timeout_s=15.0)

        # Wait for the pid file to be non-empty. Fail with an explicit
        # timeout rather than an opaque FileNotFoundError/ValueError from
        # parsing a missing or empty file.
        pid_text = ""
        deadline = time.monotonic() + 15.0
        while time.monotonic() < deadline:
            try:
                pid_text = pid_file.read_text().strip()
            except FileNotFoundError:
                pid_text = ""
            if pid_text:
                break
            time.sleep(0.2)
        else:
            raise TimeoutError(f"pid file {pid_file} never populated within 15s")
        qemu_pid = int(pid_text)
        log.info("qemu pid = %d", qemu_pid)

        # Cloud-init's runcmd (password setup, sshd hardening) needs some
        # time after boot. Wait long enough that the credentials we'll
        # send are actually valid.
        log.info("waiting 35s for cloud-init runcmd to settle...")
        time.sleep(35.0)

        log.info("connecting serial + logging in (boot timeout %.0fs)",
                 args.boot_timeout)
        serial = SerialClient(str(serial_sock))
        serial.connect()
        try:
            serial.login(boot_timeout_s=args.boot_timeout)

            # Bind the controller to the runner's event log so workload
            # success/failure shows up alongside phase_transition events.
            # Sample also goes into EpisodeConfig below so meta.sample
            # records what was supposed to run.
            runner_for_emit = {"runner": None}

            def _forward_event(ev, **kw):
                """Relay a controller event to the runner once it exists."""
                attached = runner_for_emit["runner"]
                if attached is None:
                    # The controller is built (and set up) before the
                    # runner, so there is no sink yet. Say so instead of
                    # discarding the event silently.
                    log.warning(
                        "dropping controller event %r: no runner attached yet", ev
                    )
                    return None
                return attached.emit_event(ev, **kw)

            controller = VMLoadController(
                serial,
                sample=sample,
                emit_event=_forward_event,
            )
            # NOTE(review): setup() runs before the runner is attached, so
            # any event it emits takes the "dropped" branch above. Confirm
            # whether setup events are meant to reach the runner's log; if
            # so, setup has to happen after the runner can accept events.
            controller.setup()

            qmp_sock = run_dir / "qmp.sock"
            agent_sock = run_dir / "agent.sock"
            cfg = EpisodeConfig(
                target_pid=qemu_pid,
                duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
                interval_ms=args.interval_ms,
                data_root=Path(args.data_root),
                phase_schedule=DEFAULT_SCHEDULE,
                image_name="alpine-3.21-cloudinit",
                snapshot_name="baseline-v1",
                qmp_socket=qmp_sock if qmp_sock.exists() else None,
                guest_agent_socket=agent_sock if agent_sock.exists() else None,
                bridge_iface=os.environ.get("BRIDGE") or None,
                sample=sample,
            )

            runner = EpisodeRunner(cfg, on_phase=controller.set_phase)
            # Connect the controller's event sink to the runner now that
            # both exist. (Forward-reference closure pattern keeps the
            # constructor argument order natural.)
            runner_for_emit["runner"] = runner
            result = runner.run()

            controller.teardown()
        finally:
            # Release the serial connection even when login, setup, the
            # episode, or teardown raised.
            serial.close()

        print()
        print(f"episode_id  = {result.episode_id}")
        print(f"path        = {result.episode_dir}")
        print(f"rows_proc   = {result.rows_proc}")
        print(f"phases      = {result.phases_observed}")
        print()
        print("To plot:")
        print(f"  uv run python tools/plot_envelope.py {result.episode_dir}")
        return 0
    finally:
        if not args.keep_vm:
            log.info("shutting down VM (pid=%d)", qemu.pid)
            try:
                os.killpg(os.getpgid(qemu.pid), signal.SIGTERM)
            except ProcessLookupError:
                pass
            try:
                qemu.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # Escalate. The group may vanish between the timeout and
                # the kill, which must not raise out of this cleanup path.
                try:
                    os.killpg(os.getpgid(qemu.pid), signal.SIGKILL)
                except ProcessLookupError:
                    pass
                # Reap the child so it does not linger as a zombie.
                try:
                    qemu.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    log.warning(
                        "VM launcher (pid=%d) still alive after SIGKILL", qemu.pid
                    )
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|