End-to-end now drives a real KVM guest through the full XMRig-shaped
phase schedule with the workload running INSIDE the guest. Telemetry is
host-side /proc/<qemu_pid>; the load is busybox `yes` (sustained CPU
saturation) and `dd if=/dev/urandom` (disk burst on infecting), driven
over the serial console at every phase transition. The plotted envelope
shows clean idle → armed → infecting (disk spike) → infected_running
(100% CPU plateau) → dormant → re-entry → final clean.
Components:
vm/launch_demo.sh: now boots Alpine 3.21 nocloud-cloudinit (Cirros 0.6.x's cirros-init blocks on the EC2 metadata service for ~17 min before falling through to NoCloud — abandoned). Mounts a cidata ISO as a second drive.
tools/build_cidata.py: pure-Python NoCloud ISO builder (pycdlib). Sets root password and ssh_pwauth via runcmd so we don't depend on a specific cloud-init version's plain_text_passwd handling.
tools/vm_serial.py: serial-console client (stdlib socket). Idempotent login (detects already-in-shell state), sentinel-bracketed run() that distinguishes shell output from the TTY echo of input by requiring a leading \r\n boundary on the marker.
tools/vm_load_controller.py: in-guest load controller. set_phase() dispatches the per-phase shell command over the serial connection.
tools/run_real_vm_demo.py: ties it all together: boot VM, wait for cloud-init runcmd, log in, run the EpisodeRunner with on_phase=controller, shut down VM.
Deps: paramiko, pycdlib added.
docs/sources.md updated with Alpine cloud image (sha512 pinned), and
the new Python deps.
README leads with the tier-2 plot now (real VM, real workload). The
previous synthetic plot is moved below with explicit "host-side mimic,
not a VM" labelling. Tier-2 status flipped to ✅ in the tier table.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
181 lines
5.8 KiB
Python
181 lines
5.8 KiB
Python
"""Tier-2: real VM, real workload, labeled phases.

Boots the Alpine cidata VM, logs in over the serial console, drives the
guest through an XMRig-shaped phase schedule (clean → armed → infecting →
infected_running → dormant → re-entry), and lets the orchestrator's host
/proc collector sample the qemu-system pid throughout.

Compared to ``run_envelope_demo.py``: same phase schedule, same labels,
same telemetry shape — but the load is now generated by ``yes`` and
``dd`` running *inside* a real Alpine guest, not by a Python program on
the host. Tier-3 replaces the controller with an MSF-driven exploit
fire.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Allow running as a script.
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
|
|
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
|
|
from vm_load_controller import VMLoadController # noqa: E402
|
|
from vm_serial import SerialClient # noqa: E402
|
|
|
|
|
|
# Phase timeline; kept the same shape as run_envelope_demo so the plots from
# both demos are comparable. Each entry is ``(phase label, seconds)``: main()
# sums the second element to obtain the episode duration and passes the list
# through unchanged as the episode's phase schedule.
DEFAULT_SCHEDULE: list[tuple[str, float]] = [
    ("clean", 10.0),
    ("armed", 2.0),
    ("infecting", 3.0),
    ("infected_running", 25.0),
    ("dormant", 15.0),
    ("infected_running", 20.0),
    ("dormant", 5.0),
    ("clean", 5.0),
]
|
|
|
|
|
|
def _wait_for_socket(path: Path, timeout_s: float) -> None:
    """Block until a UNIX stream socket at *path* accepts a connection.

    The path existing is not enough: a socket file left behind by a dead
    process still exists but refuses connections, so each poll performs a
    real ``connect()`` probe and keeps retrying while that fails.

    Args:
        path: Filesystem path of the UNIX-domain socket to probe.
        timeout_s: Maximum number of seconds to keep polling.

    Raises:
        TimeoutError: If no probe connection succeeded before the deadline.
    """
    import socket as _sk

    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if path.exists():
            try:
                # The context manager closes the probe socket on every path,
                # including a failed connect(); otherwise each retry against
                # a stale socket file would leak a file descriptor.
                with _sk.socket(_sk.AF_UNIX, _sk.SOCK_STREAM) as probe:
                    probe.settimeout(0.5)
                    probe.connect(str(path))
                return
            except OSError:
                # Not live (yet): refused, timed out, or vanished. Retry.
                pass
        time.sleep(0.2)
    raise TimeoutError(f"socket {path} never came alive within {timeout_s}s")
|
|
|
|
|
|
def _read_qemu_pid(pid_file: Path, timeout_s: float) -> int:
    """Poll *pid_file* until it contains text, then return it parsed as an int.

    Raises:
        TimeoutError: If the file is still missing or empty after
            ``timeout_s`` seconds.
        ValueError: If the file's content is not an integer.
    """
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            text = pid_file.read_text().strip()
        except FileNotFoundError:
            text = ""
        if text:
            return int(text)
        time.sleep(0.2)
    raise TimeoutError(
        f"pid file {pid_file} was not written within {timeout_s}s"
    )


def _shutdown_process_group(proc: subprocess.Popen, log: logging.Logger) -> None:
    """Stop *proc*'s process group: SIGTERM, wait up to 5 s, then SIGKILL."""
    log.info("shutting down VM (pid=%d)", proc.pid)
    try:
        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
    except ProcessLookupError:
        # Already gone; nothing to signal.
        pass
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        try:
            os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
        except ProcessLookupError:
            # Exited between the timeout and the SIGKILL.
            pass
        # Reap the child so it does not linger as a zombie.
        proc.wait()


def main() -> int:
    """Boot the demo VM, run one labeled episode against it, and clean up.

    Steps, in order:
      1. Parse CLI options and configure logging.
      2. Recreate the run directory and start ``vm/launch_demo.sh`` with
         ``RUN_DIR`` exported, in its own session.
      3. Wait for the serial socket and the qemu pid file.
      4. Sleep 35 s for cloud-init, then log in over the serial console.
      5. Run an ``EpisodeRunner`` over ``DEFAULT_SCHEDULE`` with the load
         controller's ``set_phase`` as the phase callback.
      6. Print a summary of the episode result.

    The controller is torn down and the serial client closed even when the
    episode fails. The launcher's process group is terminated on every exit
    path unless ``--keep-vm`` was given.

    Returns:
        0 on success. Failures propagate as exceptions.
    """
    parser = argparse.ArgumentParser(prog="run_real_vm_demo")
    parser.add_argument("--data-root", default="data")
    parser.add_argument("--interval-ms", type=int, default=100)
    parser.add_argument(
        "--run-dir",
        default="/tmp/cis490-vm",
        help="QEMU run dir (sockets + pidfile go here)",
    )
    parser.add_argument(
        "--keep-vm",
        action="store_true",
        help="leave the VM running after the episode finishes",
    )
    parser.add_argument(
        "--boot-timeout",
        type=float,
        default=120.0,
        help="how long to wait for serial login prompt",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    log = logging.getLogger("cis490.run_real_vm_demo")

    repo_root = Path(__file__).resolve().parent.parent
    launcher = repo_root / "vm" / "launch_demo.sh"
    run_dir = Path(args.run_dir)
    # Wipe any stale sockets/pidfile from a previous run.
    if run_dir.exists():
        import shutil

        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)
    serial_sock = run_dir / "serial.sock"
    pid_file = run_dir / "qemu.pid"

    log.info("booting VM via %s (RUN_DIR=%s)", launcher, run_dir)
    env = os.environ.copy()
    env["RUN_DIR"] = str(run_dir)
    # start_new_session puts the launcher in its own process group so the
    # whole group can be signalled during shutdown.
    qemu = subprocess.Popen(
        [str(launcher)],
        cwd=str(repo_root),
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )

    try:
        _wait_for_socket(serial_sock, timeout_s=15.0)
        # Fail with an explicit timeout if the pid file never fills in,
        # rather than a confusing read/parse error.
        qemu_pid = _read_qemu_pid(pid_file, timeout_s=15.0)
        log.info("qemu pid = %d", qemu_pid)

        # Cloud-init's runcmd (password setup, sshd hardening) needs some
        # time after boot. Wait long enough that the credentials we'll
        # send are actually valid.
        log.info("waiting 35s for cloud-init runcmd to settle...")
        time.sleep(35.0)

        log.info("connecting serial + logging in (boot timeout %.0fs)",
                 args.boot_timeout)
        serial = SerialClient(str(serial_sock))
        serial.connect()
        try:
            serial.login(boot_timeout_s=args.boot_timeout)

            controller = VMLoadController(serial)
            controller.setup()
            try:
                cfg = EpisodeConfig(
                    target_pid=qemu_pid,
                    duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
                    interval_ms=args.interval_ms,
                    data_root=Path(args.data_root),
                    phase_schedule=DEFAULT_SCHEDULE,
                    image_name="alpine-3.21-cloudinit",
                    snapshot_name="baseline-v1",
                )
                result = EpisodeRunner(
                    cfg, on_phase=controller.set_phase
                ).run()
            finally:
                # Tear down even if the episode failed, so a VM kept alive
                # with --keep-vm is not left with the controller set up.
                controller.teardown()
        finally:
            serial.close()

        print()
        print(f"episode_id = {result.episode_id}")
        print(f"path = {result.episode_dir}")
        print(f"rows_proc = {result.rows_proc}")
        print(f"phases = {result.phases_observed}")
        print()
        print("To plot:")
        print(f" uv run python tools/plot_envelope.py {result.episode_dir}")
        return 0
    finally:
        if not args.keep_vm:
            _shutdown_process_group(qemu, log)
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())