CIS490/tools/run_tier3_demo.py
Elliott Kolden 667f042707 Tier-3 bring-up: 9 bugs fixed on elliott-ThinkPad (2026-05-01)
Root causes and fixes documented in TIER3-BRINGUP.md. Summary:

1. BRIDGE env var leaked into Tier-3 subprocess → target VM used tap
   instead of SLIRP; fix: env.pop("BRIDGE") in fleet _run_slot.

2. usable_modules filter conditioned on BRIDGE presence → bridge-requiring
   modules selected on SLIRP runs; fix: always filter requires_bridge.

3. cmd/unix/interact creates no session.list entry → session_open_timeout
   every episode; fix: switch samba_usermap_script to cmd/unix/bind_perl.

4. Per-slot LPORT hostfwd used wrong guest port (host:5444→guest:4444);
   fix: extra_host_port:extra_host_port mapping so guest binds the
   per-slot LPORT directly.

5. vsftpd backdoor port 6200 hardcoded → collision across concurrent slots;
   fix: requires_bridge=true filters it from SLIRP fleet runs.

6. SLIRP false-positive in _wait_for_tcp → exploit fires before Samba
   boots (~60 s too early); fix: replace TCP probe with serial console
   _wait_for_serial_login that waits for actual "login:" prompt.

7. Stale QEMU survives orchestrator restart (start_new_session=True) →
   holds hostfwd ports, new QEMU silently fails; fix: kill by pgid from
   old pidfile before rmtree.

8. PORT_BASE default used privileged port 21; fix: default to 2021+slot*100.

9. msfrpcd 6.x returns bytes for all string values even with raw=False;
   fix: MSFRpcClient._str() recursive decoder applied to all responses.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-02 12:26:19 -06:00

376 lines
13 KiB
Python

"""Tier-3: real VM, real exploit, honest ``armed -> infecting`` transition.

Boots the vulnerable target VM, drives an msfrpcd-fired exploit module
against it, and lets the orchestrator's host /proc collector sample
the qemu-system pid throughout. Compared to ``run_real_vm_demo.py``:
the workload that crosses the ``armed -> infecting`` boundary is now
generated by an actual exploit landing a session, not by a script in
the guest.

Prereqs:
  - vm/images/<target>.qcow2 (e.g. Metasploitable2)
  - msfrpcd running locally:
        msfrpcd -P <password> -U msf -a 127.0.0.1 -p 55553
  - ``msgpack`` python package installed (added to runtime deps)

Run:
    MSFRPC_PASSWORD=<pass> uv run python tools/run_tier3_demo.py \\
        --module vsftpd_234_backdoor \\
        --data-root data
"""
from __future__ import annotations
import argparse
import logging
import os
import signal
import subprocess
import sys
import time
from pathlib import Path
# Allow running as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from collectors import qmp # noqa: E402
from exploits.driver import DriverConfig, MSFExploitDriver # noqa: E402
from exploits.modules import load_module_config # noqa: E402
from exploits.msfrpc import MSFRpcClient, MSFRpcConfig # noqa: E402
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
from samples.manifest import SampleManifest # noqa: E402
# Phase timeline for one episode, as ordered (phase name, duration in seconds)
# pairs. The envelope shape matches Tier 2 so the resulting plots can be
# compared side by side; the armed/infecting windows are slightly longer
# because firing a real exploit and establishing a session takes hundreds of
# milliseconds to a few seconds.
DEFAULT_SCHEDULE: list[tuple[str, float]] = [
    ("clean", 10.0),
    ("armed", 3.0),
    ("infecting", 5.0),
    ("infected_running", 25.0),
    ("dormant", 15.0),
    ("infected_running", 20.0),
    ("dormant", 5.0),
    ("clean", 5.0),
]
def _wait_for_path(path: Path, timeout_s: float) -> None:
    """Block until *path* exists and contains non-whitespace text.

    The file is polled roughly every 0.2 s. It is always checked at least
    once (even when ``timeout_s`` is zero or negative) and once more when the
    deadline expires, so a file that shows up during the final sleep is not
    missed.

    Args:
        path: File to wait for (e.g. a pidfile written by another process).
        timeout_s: Maximum time to wait, in seconds.

    Raises:
        TimeoutError: *path* was still missing or blank when the deadline
            passed.
    """
    deadline = time.monotonic() + timeout_s
    while True:
        # EAFP: read directly instead of exists()+read_text(), so a file that
        # disappears between the two calls cannot crash the poll loop.
        try:
            if path.read_text().strip():
                return
        except (FileNotFoundError, NotADirectoryError):
            pass  # not created yet; keep polling
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(0.2, remaining))
    raise TimeoutError(f"{path} never appeared within {timeout_s}s")
def _wait_for_tcp(host: str, port: int, timeout_s: float) -> None:
    """Legacy TCP probe — only reliable when the guest speaks first on connect.

    Kept for reference; replaced by _wait_for_serial_login for SLIRP guests.

    Repeatedly tries to open a TCP connection to ``host:port``. Once a
    connection is established it waits up to 0.5 s for a first byte (the
    result of that read is ignored) and returns. At least one probe is always
    attempted, and the pause between probes is capped to the time remaining
    so the function does not sleep past its deadline.

    Args:
        host: Address to connect to.
        port: TCP port to connect to.
        timeout_s: Maximum time to keep probing, in seconds.

    Raises:
        TimeoutError: No connection succeeded before the deadline; the
            message includes the last socket error observed.
    """
    import socket

    deadline = time.monotonic() + timeout_s
    last_err: Exception | None = None
    while True:
        try:
            with socket.create_connection((host, port), timeout=1.0) as conn:
                conn.settimeout(0.5)
                try:
                    conn.recv(1)
                except socket.timeout:
                    # Silence after connect is fine: the port accepted us.
                    pass
                return
        except OSError as e:
            last_err = e
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(1.0, remaining))
    raise TimeoutError(
        f"target service {host}:{port} not reachable within {timeout_s}s "
        f"(last: {last_err})"
    )
def _wait_for_serial_login(
    serial_sock: Path,
    timeout_s: float,
    prompt: bytes = b"login:",
) -> None:
    """Wait for a shell login prompt on the QEMU serial console.

    SLIRP completes the TCP handshake before the guest OS boots, making
    TCP-based readiness probes on port 139/445 unreliable (they return
    immediately even when Samba isn't running yet). The serial console is
    authoritative: we connect right after QEMU writes its pidfile (before
    the guest produces any output) and stream boot messages until the
    "login:" prompt appears.

    QEMU's serial chardev is ``server=on,wait=off``: the socket is created
    at QEMU startup. Data written before a client connects is discarded, so
    we must connect before the prompt appears. Since the pidfile is written
    after QEMU finishes device init (well before the guest kernel loads), we
    reliably connect in time.

    Matching is case-insensitive on both sides: the console output and
    *prompt* are both lowercased before comparison. Only a short tail of the
    stream is retained (enough to match a prompt split across two reads and
    to report the last bytes on failure), so memory and per-chunk search cost
    stay constant regardless of how much boot output the guest prints.

    Args:
        serial_sock: Path of the UNIX stream socket exposing the serial
            console.
        timeout_s: Overall budget in seconds, covering both the wait for the
            socket file to appear and the wait for the prompt.
        prompt: Byte string to look for in the console output.

    Raises:
        TimeoutError: The socket file never appeared, or the prompt was not
            seen before the deadline (or before the console stream ended).
        OSError: Connecting to the socket failed.
    """
    import socket as _socket

    deadline = time.monotonic() + timeout_s
    while not serial_sock.exists():
        if time.monotonic() >= deadline:
            raise TimeoutError(f"serial socket {serial_sock} never appeared")
        time.sleep(0.2)

    needle = prompt.lower()
    # Keep at least len(needle) bytes so a prompt straddling two recv() calls
    # is still found, and at least 200 for the diagnostic in the error below.
    keep = max(200, len(needle))
    tail = b""
    with _socket.socket(_socket.AF_UNIX, _socket.SOCK_STREAM) as sock:
        sock.settimeout(2.0)
        sock.connect(str(serial_sock))
        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Cap each read so we never block past the overall deadline.
            sock.settimeout(min(2.0, remaining))
            try:
                chunk = sock.recv(4096)
            except _socket.timeout:
                continue
            if not chunk:
                # Peer closed the console; no more output will arrive.
                break
            window = tail + chunk
            if needle in window.lower():
                return
            tail = window[-keep:]
    raise TimeoutError(
        f"login prompt not seen on serial console within {timeout_s}s "
        f"(last {min(200, len(tail))} bytes: {tail[-200:]!r})"
    )
def _terminate_stale_qemu(pid_file: Path) -> None:
    """Best-effort SIGTERM of the process group named by a leftover pidfile."""
    try:
        stale_pid = int(pid_file.read_text().strip())
        stale_pgid = os.getpgid(stale_pid)
    except (OSError, ValueError):
        # No pidfile, unparsable contents, or the process is already gone.
        return
    if stale_pgid == os.getpgrp():
        # The pid may have been recycled into our own process group; never
        # signal ourselves.
        return
    try:
        os.killpg(stale_pgid, signal.SIGTERM)
    except OSError:
        return
    # Give the old process a moment to exit and release its resources.
    time.sleep(1.5)


def _shutdown_vm(qemu: subprocess.Popen, log: logging.Logger) -> None:
    """SIGTERM the launcher's process group, escalating to SIGKILL after 5 s."""
    log.info("shutting down VM (pid=%d)", qemu.pid)
    try:
        os.killpg(os.getpgid(qemu.pid), signal.SIGTERM)
    except ProcessLookupError:
        pass
    try:
        qemu.wait(timeout=5)
    except subprocess.TimeoutExpired:
        try:
            os.killpg(os.getpgid(qemu.pid), signal.SIGKILL)
        except ProcessLookupError:
            # Exited between the timeout and the escalation. Swallow it so
            # cleanup never masks the episode's own result or exception.
            pass


def main() -> int:
    """Run one Tier-3 episode: boot the target VM, fire the exploit, record.

    Steps, in order:
      1. Parse CLI arguments (several default from environment variables).
      2. Load the exploit module config and, optionally, a sample from the
         manifest.
      3. Clear the run directory (terminating any QEMU recorded in a stale
         pidfile first) and start the VM launcher script.
      4. Wait for the QEMU pidfile and then for a login prompt on the serial
         console.
      5. Attempt a ``baseline-v1`` savevm over QMP (best-effort).
      6. Run the episode with the MSF exploit driver attached to the runner's
         phase callback, then print a summary.
      7. Unless ``--keep-vm`` was given, shut the VM down.

    Returns:
        0 on success; 2 when ``MSFRPC_PASSWORD`` is unset, the module config
        file does not exist, or the requested sample is not in the manifest.

    Raises:
        TimeoutError: The pidfile or the serial login prompt did not appear
            in time. Other exceptions from the episode run propagate
            unchanged after VM cleanup.
    """
    parser = argparse.ArgumentParser(prog="run_tier3_demo")
    parser.add_argument("--data-root", default="data")
    parser.add_argument("--interval-ms", type=int, default=100)
    parser.add_argument(
        "--module",
        default="vsftpd_234_backdoor",
        help="Module config name in exploits/modules/<name>.toml",
    )
    parser.add_argument(
        "--target-ip",
        default="127.0.0.1",
        help="Address the exploit module sets RHOSTS to. With the SLIRP "
        "launcher (default), the guest's vulnerable port is hostfwd'd to "
        "loopback; on a host-only bridge, this is the guest's bridge IP.",
    )
    parser.add_argument(
        "--target-port",
        type=int,
        default=21,
        help="Probe port to wait on before firing the exploit",
    )
    parser.add_argument(
        "--run-dir",
        # Per-slot defaults so the fleet runner's parallel calls don't
        # collide on the same /tmp dir. See run_real_vm_demo.py for
        # the same fix.
        default=(
            os.environ.get("RUN_DIR")
            or f"/tmp/cis490-target-{os.environ.get('SLOT', '0')}"
        ),
        help="QEMU run dir (sockets + pidfile)",
    )
    parser.add_argument(
        "--msfrpc-host", default=os.environ.get("MSFRPC_HOST", "127.0.0.1"),
    )
    parser.add_argument(
        "--msfrpc-port", type=int,
        default=int(os.environ.get("MSFRPC_PORT", "55553")),
    )
    parser.add_argument(
        "--msfrpc-user", default=os.environ.get("MSFRPC_USER", "msf"),
    )
    parser.add_argument(
        "--keep-vm",
        action="store_true",
        help="leave the VM running after the episode finishes",
    )
    parser.add_argument(
        "--target-boot-timeout",
        type=float,
        default=180.0,
        help="how long to wait for the guest's vulnerable service to listen",
    )
    parser.add_argument(
        "--sample",
        default=os.environ.get("SAMPLE_NAME"),
        help="Pick a workload profile from the manifest by name. Fleet runner "
        "passes this via SAMPLE_NAME env. Without it, falls back to the v1 yes-loop.",
    )
    parser.add_argument(
        "--manifest",
        default=str(Path(__file__).resolve().parent.parent / "samples" / "manifest.toml"),
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    log = logging.getLogger("cis490.run_tier3_demo")

    # The RPC password is taken from the environment only (no CLI flag).
    msfrpc_password = os.environ.get("MSFRPC_PASSWORD")
    if not msfrpc_password:
        log.error("MSFRPC_PASSWORD env var must be set")
        return 2

    repo_root = Path(__file__).resolve().parent.parent
    launcher = repo_root / "vm" / "launch_target.sh"
    modules_dir = repo_root / "exploits" / "modules"
    module_path = modules_dir / f"{args.module}.toml"
    if not module_path.exists():
        log.error("no module config at %s", module_path)
        return 2
    module = load_module_config(module_path)
    log.info("module loaded: %s (%s)", module.name, module.module_path)

    sample = None
    if args.sample:
        manifest = SampleManifest.load(args.manifest)
        sample = next((s for s in manifest.samples if s.name == args.sample), None)
        if sample is None:
            log.error("sample %r not in manifest %s", args.sample, args.manifest)
            return 2
        log.info("sample=%s profile=%s kind=%s",
                 sample.name, sample.profile, sample.kind)

    run_dir = Path(args.run_dir)
    # Kill any QEMU still holding this slot's run_dir from a previous wave.
    # QEMU is started with start_new_session=True so it survives orchestrator
    # SIGTERM without explicit cleanup here.
    _terminate_stale_qemu(run_dir / "qemu.pid")
    if run_dir.exists():
        import shutil
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)
    pid_file = run_dir / "qemu.pid"

    log.info("booting target VM via %s (RUN_DIR=%s)", launcher, run_dir)
    env = os.environ.copy()
    env["RUN_DIR"] = str(run_dir)
    qemu = subprocess.Popen(
        [str(launcher)],
        cwd=str(repo_root),
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )
    try:
        try:
            _wait_for_path(pid_file, timeout_s=15.0)
        except TimeoutError:
            # Launcher output goes to /dev/null, so surface an early exit
            # here; otherwise the only symptom is a generic pidfile timeout.
            exit_code = qemu.poll()
            if exit_code is not None:
                log.error(
                    "launcher %s exited with code %s before writing %s",
                    launcher, exit_code, pid_file,
                )
            raise
        qemu_pid = int(pid_file.read_text().strip())
        serial_sock = run_dir / "serial.sock"
        log.info("qemu pid = %d; waiting for login prompt on serial console (timeout %.0fs)",
                 qemu_pid, args.target_boot_timeout)
        _wait_for_serial_login(serial_sock, timeout_s=args.target_boot_timeout)
        log.info("target guest OS ready (login prompt seen on serial console)")

        # Pre-exploit savevm so EpisodeConfig.revert_at_{start,end}
        # has a known-good baseline to load. Best-effort — we still
        # run the episode if savevm fails (just without revert
        # support). See run_real_vm_demo.py for the same pattern.
        qmp_sock = run_dir / "qmp.sock"
        if qmp_sock.exists():
            try:
                qmp_client = qmp.QMPClient(qmp_sock)
                qmp_client.connect()
                try:
                    out = qmp_client.savevm("baseline-v1")
                    log.info("savevm baseline-v1 OK: %s", out.strip()[:160])
                finally:
                    qmp_client.close()
            except Exception as e:
                # Deliberately broad: a failed snapshot must not abort the run.
                log.warning("savevm failed; revert_at_start unusable: %s", e)

        client = MSFRpcClient(
            MSFRpcConfig(
                host=args.msfrpc_host,
                port=args.msfrpc_port,
                user=args.msfrpc_user,
                password=msfrpc_password,
            )
        )
        cfg = EpisodeConfig(
            target_pid=qemu_pid,
            duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
            interval_ms=args.interval_ms,
            data_root=Path(args.data_root),
            phase_schedule=DEFAULT_SCHEDULE,
            image_name=module.name + "-target",
            snapshot_name="baseline-v1",
            sample=sample,
            exploit_meta={
                "framework": "metasploit",
                "module": module.module_path,
                "module_type": module.module_type,
                "module_name": module.name,
                "payload": module.payload_path,
                "rport": module.options.get("RPORT"),
                "rhost_template": module.options.get("RHOSTS"),
            },
        )
        runner = EpisodeRunner(cfg)
        driver = MSFExploitDriver(
            client=client,
            module=module,
            cfg=DriverConfig(
                target_ip=args.target_ip,
                # Override RPORT when target_port is an unprivileged host port
                # (i.e. fleet runner remapped the guest's privileged port to a
                # loopback port > 1024). When target_port == module RPORT the
                # caller wants direct guest access; leave RPORT unchanged.
                target_port=args.target_port if args.target_port > 1024 else None,
                sample_store_root=repo_root / "samples" / "store",
            ),
            emit_event=runner.emit_event,
            sample=sample,
        )
        # The driver is notified of each phase change through the runner's
        # on_phase hook.
        runner.on_phase = driver.set_phase
        driver.setup()
        try:
            result = runner.run()
        finally:
            driver.teardown()

        print()
        print(f"episode_id = {result.episode_id}")
        print(f"path = {result.episode_dir}")
        print(f"rows_proc = {result.rows_proc}")
        print(f"phases = {result.phases_observed}")
        print(f"module = {module.module_path}")
        print()
        print("To plot:")
        print(f" uv run python tools/plot_envelope.py {result.episode_dir}")
        return 0
    finally:
        if not args.keep_vm:
            _shutdown_vm(qemu, log)
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())