CIS490/tools/run_real_vm_demo.py
Maximus Gorog 7216ec09bd Tier 2: real Alpine VM, real workload, real envelope
End-to-end now drives a real KVM guest through the full XMRig-shaped
phase schedule with the workload running INSIDE the guest. Telemetry is
host-side /proc/<qemu_pid>; the load is busybox `yes` (sustained CPU
saturation) and `dd if=/dev/urandom` (disk burst on infecting), driven
over the serial console at every phase transition. The plotted envelope
shows clean idle → armed → infecting (disk spike) → infected_running
(100% CPU plateau) → dormant → re-entry → final clean.

Components:

  vm/launch_demo.sh              now boots Alpine 3.21 nocloud-cloudinit
                                 (Cirros 0.6.x's cirros-init blocks on the
                                 EC2 metadata service for ~17 min before
                                 falling through to NoCloud — abandoned).
                                 Mounts a cidata ISO as a second drive.

  tools/build_cidata.py          pure-Python NoCloud ISO builder (pycdlib).
                                 Sets root password and ssh_pwauth via
                                 runcmd so we don't depend on a specific
                                 cloud-init version's plain_text_passwd
                                 handling.

  tools/vm_serial.py             serial-console client (stdlib socket).
                                 Idempotent login (detects already-in-shell
                                 state), sentinel-bracketed run() that
                                 distinguishes shell output from the TTY
                                 echo of input by requiring a leading
                                 \r\n boundary on the marker.

  tools/vm_load_controller.py    in-guest load controller. set_phase()
                                 dispatches the per-phase shell command
                                 over the serial connection.

  tools/run_real_vm_demo.py      ties it all together: boot VM, wait for
                                 cloud-init runcmd, log in, run the
                                 EpisodeRunner with on_phase=controller,
                                 shut down VM.

Deps: paramiko, pycdlib added.

docs/sources.md updated with Alpine cloud image (sha512 pinned), and
the new Python deps.

README leads with the tier-2 plot now (real VM, real workload). The
previous synthetic plot is moved below with explicit "host-side mimic,
not a VM" labelling. Tier-2 status flipped to ✅ in the tier table.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 08:38:53 -06:00

181 lines
5.8 KiB
Python

"""Tier-2: real VM, real workload, labeled phases.
Boots the Alpine cidata VM, logs in over the serial console, drives the
guest through an XMRig-shaped phase schedule (clean → armed → infecting →
infected_running → dormant → re-entry), and lets the orchestrator's host
/proc collector sample the qemu-system pid throughout.
Compared to ``run_envelope_demo.py``: same phase schedule, same labels,
same telemetry shape — but the load is now generated by ``yes`` and
``dd`` running *inside* a real Alpine guest, not by a Python program on
the host. Tier-3 replaces the controller with an MSF-driven exploit
fire.
"""
from __future__ import annotations
import argparse
import logging
import os
import signal
import subprocess
import sys
import time
from pathlib import Path
# Allow running as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
sys.path.insert(0, str(Path(__file__).resolve().parent))
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
from vm_load_controller import VMLoadController # noqa: E402
from vm_serial import SerialClient # noqa: E402
# Phase timeline for one episode, as (phase_label, duration_seconds) pairs in
# playback order. Mirrors the schedule in run_envelope_demo so that the two
# plots can be compared side by side. What happens inside the guest at each
# transition is up to the on_phase callback handed to the EpisodeRunner.
DEFAULT_SCHEDULE: list[tuple[str, float]] = [
    ("clean", 10.0),             # initial baseline
    ("armed", 2.0),
    ("infecting", 3.0),
    ("infected_running", 25.0),  # first active stretch
    ("dormant", 15.0),
    ("infected_running", 20.0),  # re-entry
    ("dormant", 5.0),
    ("clean", 5.0),              # final baseline
]
def _wait_for_socket(path: Path, timeout_s: float) -> None:
import socket as _sk
deadline = time.monotonic() + timeout_s
while time.monotonic() < deadline:
if path.exists():
try:
# Verify it's actually live, not a leftover from a dead VM.
t = _sk.socket(_sk.AF_UNIX, _sk.SOCK_STREAM)
t.settimeout(0.5)
t.connect(str(path))
t.close()
return
except OSError:
pass
time.sleep(0.2)
raise TimeoutError(f"socket {path} never came alive within {timeout_s}s")
def main() -> int:
parser = argparse.ArgumentParser(prog="run_real_vm_demo")
parser.add_argument("--data-root", default="data")
parser.add_argument("--interval-ms", type=int, default=100)
parser.add_argument(
"--run-dir",
default="/tmp/cis490-vm",
help="QEMU run dir (sockets + pidfile go here)",
)
parser.add_argument(
"--keep-vm",
action="store_true",
help="leave the VM running after the episode finishes",
)
parser.add_argument(
"--boot-timeout",
type=float,
default=120.0,
help="how long to wait for serial login prompt",
)
args = parser.parse_args()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
log = logging.getLogger("cis490.run_real_vm_demo")
repo_root = Path(__file__).resolve().parent.parent
launcher = repo_root / "vm" / "launch_demo.sh"
run_dir = Path(args.run_dir)
# Wipe any stale sockets/pidfile from a previous run.
if run_dir.exists():
import shutil
shutil.rmtree(run_dir)
run_dir.mkdir(parents=True, exist_ok=True)
serial_sock = run_dir / "serial.sock"
pid_file = run_dir / "qemu.pid"
log.info("booting VM via %s (RUN_DIR=%s)", launcher, run_dir)
env = os.environ.copy()
env["RUN_DIR"] = str(run_dir)
qemu = subprocess.Popen(
[str(launcher)],
cwd=str(repo_root),
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
try:
_wait_for_socket(serial_sock, timeout_s=15.0)
# Wait for the pid file to be non-empty.
deadline = time.monotonic() + 15.0
while time.monotonic() < deadline:
if pid_file.exists() and pid_file.read_text().strip():
break
time.sleep(0.2)
qemu_pid = int(pid_file.read_text().strip())
log.info("qemu pid = %d", qemu_pid)
# Cloud-init's runcmd (password setup, sshd hardening) needs some
# time after boot. Wait long enough that the credentials we'll
# send are actually valid.
log.info("waiting 35s for cloud-init runcmd to settle...")
time.sleep(35.0)
log.info("connecting serial + logging in (boot timeout %.0fs)",
args.boot_timeout)
serial = SerialClient(str(serial_sock))
serial.connect()
serial.login(boot_timeout_s=args.boot_timeout)
controller = VMLoadController(serial)
controller.setup()
cfg = EpisodeConfig(
target_pid=qemu_pid,
duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
interval_ms=args.interval_ms,
data_root=Path(args.data_root),
phase_schedule=DEFAULT_SCHEDULE,
image_name="alpine-3.21-cloudinit",
snapshot_name="baseline-v1",
)
result = EpisodeRunner(cfg, on_phase=controller.set_phase).run()
controller.teardown()
serial.close()
print()
print(f"episode_id = {result.episode_id}")
print(f"path = {result.episode_dir}")
print(f"rows_proc = {result.rows_proc}")
print(f"phases = {result.phases_observed}")
print()
print("To plot:")
print(f" uv run python tools/plot_envelope.py {result.episode_dir}")
return 0
finally:
if not args.keep_vm:
log.info("shutting down VM (pid=%d)", qemu.pid)
try:
os.killpg(os.getpgid(qemu.pid), signal.SIGTERM)
except ProcessLookupError:
pass
try:
qemu.wait(timeout=5)
except subprocess.TimeoutExpired:
os.killpg(os.getpgid(qemu.pid), signal.SIGKILL)
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())