Root causes and fixes documented in TIER3-BRINGUP.md. Summary:
1. BRIDGE env var leaked into Tier-3 subprocess → target VM used tap
instead of SLIRP; fix: env.pop("BRIDGE") in fleet _run_slot.
2. usable_modules filter conditioned on BRIDGE presence → bridge-requiring
modules selected on SLIRP runs; fix: always filter requires_bridge.
3. cmd/unix/interact creates no session.list entry → session_open_timeout
every episode; fix: switch samba_usermap_script to cmd/unix/bind_perl.
4. Per-slot LPORT hostfwd used wrong guest port (host:5444→guest:4444);
fix: extra_host_port:extra_host_port mapping so guest binds the
per-slot LPORT directly.
5. vsftpd backdoor port 6200 hardcoded → collision across concurrent slots;
fix: requires_bridge=true filters it from SLIRP fleet runs.
6. SLIRP false-positive in _wait_for_tcp → exploit fires before Samba
boots (~60 s too early); fix: replace TCP probe with serial console
_wait_for_serial_login that waits for actual "login:" prompt.
7. Stale QEMU survives orchestrator restart (start_new_session=True) →
holds hostfwd ports, new QEMU silently fails; fix: kill by pgid from
old pidfile before rmtree.
8. PORT_BASE default used privileged port 21; fix: default to 2021+slot*100.
9. msfrpcd 6.x returns bytes for all string values even with raw=False;
fix: MSFRpcClient._str() recursive decoder applied to all responses.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
376 lines
13 KiB
Python
376 lines
13 KiB
Python
"""Tier-3: real VM, real exploit, honest ``armed -> infecting`` transition.

Boots the vulnerable target VM, drives an msfrpcd-fired exploit module
against it, and lets the orchestrator's host /proc collector sample
the qemu-system pid throughout. Compared to ``run_real_vm_demo.py``:
the workload that crosses the ``armed -> infecting`` boundary is now
generated by an actual exploit landing a session, not by a script in
the guest.

Prereqs:
  - vm/images/<target>.qcow2 (e.g. Metasploitable2)
  - msfrpcd running locally:
      msfrpcd -P <password> -U msf -a 127.0.0.1 -p 55553
  - ``msgpack`` python package installed (added to runtime deps)

Run:
    MSFRPC_PASSWORD=<pass> uv run python tools/run_tier3_demo.py \\
        --module vsftpd_234_backdoor \\
        --data-root data
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Allow running as a script: put the repository root (the parent of this
# file's directory) at the front of sys.path so the project-local packages
# imported below (collectors, exploits, orchestrator, samples) can be found.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
from collectors import qmp # noqa: E402
|
|
from exploits.driver import DriverConfig, MSFExploitDriver # noqa: E402
|
|
from exploits.modules import load_module_config # noqa: E402
|
|
from exploits.msfrpc import MSFRpcClient, MSFRpcConfig # noqa: E402
|
|
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
|
|
from samples.manifest import SampleManifest # noqa: E402
|
|
|
|
|
|
# Ordered (phase, seconds) pairs for one episode. The envelope shape is
# the same as Tier 2's so the resulting plots can be compared directly;
# the armed and infecting windows are slightly longer because firing a
# real exploit and establishing its session takes hundreds of
# milliseconds to a few seconds.
DEFAULT_SCHEDULE: list[tuple[str, float]] = [
    ("clean", 10.0),
    ("armed", 3.0),
    ("infecting", 5.0),
    ("infected_running", 25.0),
    ("dormant", 15.0),
    ("infected_running", 20.0),
    ("dormant", 5.0),
    ("clean", 5.0),
]
|
|
|
|
|
|
def _wait_for_path(path: Path, timeout_s: float) -> None:
|
|
deadline = time.monotonic() + timeout_s
|
|
while time.monotonic() < deadline:
|
|
if path.exists() and path.read_text().strip():
|
|
return
|
|
time.sleep(0.2)
|
|
raise TimeoutError(f"{path} never appeared within {timeout_s}s")
|
|
|
|
|
|
def _wait_for_tcp(host: str, port: int, timeout_s: float) -> None:
|
|
"""Legacy TCP probe — only reliable when the guest speaks first on connect.
|
|
Kept for reference; replaced by _wait_for_serial_login for SLIRP guests."""
|
|
import socket
|
|
deadline = time.monotonic() + timeout_s
|
|
last_err: Exception | None = None
|
|
while time.monotonic() < deadline:
|
|
try:
|
|
with socket.create_connection((host, port), timeout=1.0) as s:
|
|
s.settimeout(0.5)
|
|
try:
|
|
s.recv(1)
|
|
except socket.timeout:
|
|
pass
|
|
return
|
|
except OSError as e:
|
|
last_err = e
|
|
time.sleep(1.0)
|
|
raise TimeoutError(
|
|
f"target service {host}:{port} not reachable within {timeout_s}s "
|
|
f"(last: {last_err})"
|
|
)
|
|
|
|
|
|
def _wait_for_serial_login(
|
|
serial_sock: "Path",
|
|
timeout_s: float,
|
|
prompt: bytes = b"login:",
|
|
) -> None:
|
|
"""Wait for a shell login prompt on the QEMU serial console.
|
|
|
|
SLIRP completes the TCP handshake before the guest OS boots, making
|
|
TCP-based readiness probes on port 139/445 unreliable (they return
|
|
immediately even when Samba isn't running yet). The serial console is
|
|
authoritative: we connect right after QEMU writes its pidfile (before
|
|
the guest produces any output) and stream boot messages until the
|
|
"login:" prompt appears.
|
|
|
|
QEMU's serial chardev is ``server=on,wait=off``: the socket is created
|
|
at QEMU startup. Data written before a client connects is discarded, so
|
|
we must connect before the prompt appears. Since the pidfile is written
|
|
after QEMU finishes device init (well before the guest kernel loads), we
|
|
reliably connect in time.
|
|
"""
|
|
import socket as _socket
|
|
|
|
deadline = time.monotonic() + timeout_s
|
|
while not serial_sock.exists():
|
|
if time.monotonic() >= deadline:
|
|
raise TimeoutError(f"serial socket {serial_sock} never appeared")
|
|
time.sleep(0.2)
|
|
|
|
buf = b""
|
|
sock = _socket.socket(_socket.AF_UNIX, _socket.SOCK_STREAM)
|
|
sock.settimeout(2.0)
|
|
try:
|
|
sock.connect(str(serial_sock))
|
|
while time.monotonic() < deadline:
|
|
try:
|
|
chunk = sock.recv(4096)
|
|
if not chunk:
|
|
break
|
|
buf += chunk
|
|
if prompt in buf.lower():
|
|
return
|
|
except _socket.timeout:
|
|
pass
|
|
finally:
|
|
sock.close()
|
|
|
|
raise TimeoutError(
|
|
f"login prompt not seen on serial console within {timeout_s}s "
|
|
f"(last {min(200, len(buf))} bytes: {buf[-200:]!r})"
|
|
)
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser; several defaults come from the environment."""
    parser = argparse.ArgumentParser(prog="run_tier3_demo")
    parser.add_argument("--data-root", default="data")
    parser.add_argument("--interval-ms", type=int, default=100)
    parser.add_argument(
        "--module",
        default="vsftpd_234_backdoor",
        help="Module config name in exploits/modules/<name>.toml",
    )
    parser.add_argument(
        "--target-ip",
        default="127.0.0.1",
        help="Address the exploit module sets RHOSTS to. With the SLIRP "
        "launcher (default), the guest's vulnerable port is hostfwd'd to "
        "loopback; on a host-only bridge, this is the guest's bridge IP.",
    )
    # NOTE(review): the help text below predates the switch to the serial
    # console readiness check; in main() this value is only used to decide
    # whether to override the module's RPORT.
    parser.add_argument(
        "--target-port",
        type=int,
        default=21,
        help="Probe port to wait on before firing the exploit",
    )
    parser.add_argument(
        "--run-dir",
        # Per-slot defaults so the fleet runner's parallel calls don't
        # collide on the same /tmp dir. See run_real_vm_demo.py for
        # the same fix.
        default=(
            os.environ.get("RUN_DIR")
            or f"/tmp/cis490-target-{os.environ.get('SLOT', '0')}"
        ),
        help="QEMU run dir (sockets + pidfile)",
    )
    parser.add_argument(
        "--msfrpc-host", default=os.environ.get("MSFRPC_HOST", "127.0.0.1"),
    )
    parser.add_argument(
        "--msfrpc-port", type=int,
        default=int(os.environ.get("MSFRPC_PORT", "55553")),
    )
    parser.add_argument(
        "--msfrpc-user", default=os.environ.get("MSFRPC_USER", "msf"),
    )
    parser.add_argument(
        "--keep-vm",
        action="store_true",
        help="leave the VM running after the episode finishes",
    )
    # NOTE(review): this timeout is passed to the serial-console login wait
    # in main(); the help text still describes the older service probe.
    parser.add_argument(
        "--target-boot-timeout",
        type=float,
        default=180.0,
        help="how long to wait for the guest's vulnerable service to listen",
    )
    parser.add_argument(
        "--sample",
        default=os.environ.get("SAMPLE_NAME"),
        help="Pick a workload profile from the manifest by name. Fleet runner "
        "passes this via SAMPLE_NAME env. Without it, falls back to the v1 yes-loop.",
    )
    parser.add_argument(
        "--manifest",
        default=str(Path(__file__).resolve().parent.parent / "samples" / "manifest.toml"),
    )
    return parser


def _kill_stale_qemu(run_dir: Path, log: logging.Logger) -> None:
    """SIGTERM the process group recorded in a leftover ``qemu.pid``, if any.

    QEMU is started with start_new_session=True so it survives orchestrator
    SIGTERM; a previous wave's instance may therefore still be alive and
    holding this slot's run_dir. Best-effort: a missing or unparsable
    pidfile, or a pid that no longer exists, is silently ignored.
    """
    old_pid_file = run_dir / "qemu.pid"
    try:
        old_pid = int(old_pid_file.read_text().strip())
        old_pgid = os.getpgid(old_pid)
    except (OSError, ValueError):
        return
    if old_pgid == os.getpgrp():
        # A pidfile containing 0, or a recycled pid that now lives in our
        # own group, would otherwise make us SIGTERM ourselves.
        log.warning(
            "stale pidfile %s points into our own process group; not signalling it",
            old_pid_file,
        )
        return
    try:
        os.killpg(old_pgid, signal.SIGTERM)
    except OSError:
        return
    # Give the old instance a moment to exit and release its sockets/ports.
    time.sleep(1.5)


def _shutdown_vm(qemu: subprocess.Popen, log: logging.Logger) -> None:
    """Terminate the launcher's process group, escalating to SIGKILL after 5 s."""
    log.info("shutting down VM (pid=%d)", qemu.pid)
    try:
        os.killpg(os.getpgid(qemu.pid), signal.SIGTERM)
    except ProcessLookupError:
        pass
    try:
        qemu.wait(timeout=5)
    except subprocess.TimeoutExpired:
        try:
            os.killpg(os.getpgid(qemu.pid), signal.SIGKILL)
        except ProcessLookupError:
            # The group exited between the wait timing out and the kill.
            pass


def main() -> int:
    """Boot the target VM, run one exploit-driven episode, and tear down.

    Steps: parse arguments, validate the environment and module/sample
    configuration, clear any stale QEMU for this run dir, launch the VM via
    ``vm/launch_target.sh``, wait for the guest login prompt on the serial
    console, take a best-effort ``baseline-v1`` snapshot over QMP, then run
    the episode with the exploit driver attached to phase changes. The VM
    is shut down afterwards unless ``--keep-vm`` was given.

    Returns:
        0 when the episode completes; 2 when ``MSFRPC_PASSWORD`` is unset,
        the module config file does not exist, or the requested sample is
        not in the manifest.

    Raises:
        TimeoutError: The pidfile or the serial login prompt did not appear
            in time (propagated from the wait helpers).
    """
    args = _build_arg_parser().parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    log = logging.getLogger("cis490.run_tier3_demo")

    msfrpc_password = os.environ.get("MSFRPC_PASSWORD")
    if not msfrpc_password:
        log.error("MSFRPC_PASSWORD env var must be set")
        return 2

    repo_root = Path(__file__).resolve().parent.parent
    launcher = repo_root / "vm" / "launch_target.sh"
    modules_dir = repo_root / "exploits" / "modules"
    module_path = modules_dir / f"{args.module}.toml"
    if not module_path.exists():
        log.error("no module config at %s", module_path)
        return 2

    module = load_module_config(module_path)
    log.info("module loaded: %s (%s)", module.name, module.module_path)

    sample = None
    if args.sample:
        manifest = SampleManifest.load(args.manifest)
        sample = next((s for s in manifest.samples if s.name == args.sample), None)
        if sample is None:
            log.error("sample %r not in manifest %s", args.sample, args.manifest)
            return 2
        log.info("sample=%s profile=%s kind=%s",
                 sample.name, sample.profile, sample.kind)

    run_dir = Path(args.run_dir)
    # Kill any QEMU still holding this slot's run_dir from a previous wave
    # before wiping the directory, so the old pidfile is still readable.
    _kill_stale_qemu(run_dir, log)
    if run_dir.exists():
        import shutil

        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)
    pid_file = run_dir / "qemu.pid"

    log.info("booting target VM via %s (RUN_DIR=%s)", launcher, run_dir)
    env = os.environ.copy()
    env["RUN_DIR"] = str(run_dir)
    qemu = subprocess.Popen(
        [str(launcher)],
        cwd=str(repo_root),
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # Own session/process group so the whole group can be signalled
        # during shutdown.
        start_new_session=True,
    )

    try:
        _wait_for_path(pid_file, timeout_s=15.0)
        qemu_pid = int(pid_file.read_text().strip())
        serial_sock = run_dir / "serial.sock"
        log.info("qemu pid = %d; waiting for login prompt on serial console (timeout %.0fs)",
                 qemu_pid, args.target_boot_timeout)
        _wait_for_serial_login(serial_sock, timeout_s=args.target_boot_timeout)
        log.info("target guest OS ready (login prompt seen on serial console)")

        # Pre-exploit savevm so EpisodeConfig.revert_at_{start,end}
        # has a known-good baseline to load. Best-effort — we still
        # run the episode if savevm fails (just without revert
        # support). See run_real_vm_demo.py for the same pattern.
        qmp_sock = run_dir / "qmp.sock"
        if qmp_sock.exists():
            try:
                _qmp = qmp.QMPClient(qmp_sock)
                _qmp.connect()
                try:
                    out = _qmp.savevm("baseline-v1")
                    log.info("savevm baseline-v1 OK: %s", out.strip()[:160])
                finally:
                    _qmp.close()
            except Exception as e:
                # Deliberately broad: the snapshot is optional and must not
                # abort the episode.
                log.warning("savevm failed; revert_at_start unusable: %s", e)

        client = MSFRpcClient(
            MSFRpcConfig(
                host=args.msfrpc_host,
                port=args.msfrpc_port,
                user=args.msfrpc_user,
                password=msfrpc_password,
            )
        )

        cfg = EpisodeConfig(
            target_pid=qemu_pid,
            duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
            interval_ms=args.interval_ms,
            data_root=Path(args.data_root),
            phase_schedule=DEFAULT_SCHEDULE,
            image_name=module.name + "-target",
            snapshot_name="baseline-v1",
            sample=sample,
            exploit_meta={
                "framework": "metasploit",
                "module": module.module_path,
                "module_type": module.module_type,
                "module_name": module.name,
                "payload": module.payload_path,
                "rport": module.options.get("RPORT"),
                "rhost_template": module.options.get("RHOSTS"),
            },
        )
        runner = EpisodeRunner(cfg)

        driver = MSFExploitDriver(
            client=client,
            module=module,
            cfg=DriverConfig(
                target_ip=args.target_ip,
                # Override RPORT when target_port is an unprivileged host port
                # (i.e. fleet runner remapped the guest's privileged port to a
                # loopback port > 1024). When target_port == module RPORT the
                # caller wants direct guest access; leave RPORT unchanged.
                target_port=args.target_port if args.target_port > 1024 else None,
                sample_store_root=repo_root / "samples" / "store",
            ),
            emit_event=runner.emit_event,
            sample=sample,
        )
        runner.on_phase = driver.set_phase

        driver.setup()
        try:
            result = runner.run()
        finally:
            driver.teardown()

        print()
        print(f"episode_id = {result.episode_id}")
        print(f"path = {result.episode_dir}")
        print(f"rows_proc = {result.rows_proc}")
        print(f"phases = {result.phases_observed}")
        print(f"module = {module.module_path}")
        print()
        print("To plot:")
        print(f" uv run python tools/plot_envelope.py {result.episode_dir}")
        return 0
    finally:
        if not args.keep_vm:
            _shutdown_vm(qemu, log)
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|