# CIS490/tools/run_tier3_demo.py

"""Tier-3: real VM, real exploit, honest ``armed -> infecting`` transition.

Boots the vulnerable target VM, drives an msfrpcd-fired exploit module
against it, and lets the orchestrator's host /proc collector sample
the qemu-system pid throughout. Compared to ``run_real_vm_demo.py``:
the workload that crosses the ``armed -> infecting`` boundary is now
generated by an actual exploit landing a session, not by a script in
the guest.

Prereqs:
  - vm/images/<target>.qcow2 (e.g. Metasploitable2)
  - msfrpcd running locally:
        msfrpcd -P <password> -U msf -a 127.0.0.1 -p 55553
  - ``msgpack`` python package installed (added to runtime deps)

Run:
    MSFRPC_PASSWORD=<pass> uv run python tools/run_tier3_demo.py \\
        --module vsftpd_234_backdoor \\
        --data-root data
"""

from __future__ import annotations

import argparse
import logging
import os
import signal
import subprocess
import sys
import time
from pathlib import Path

# Allow running as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from collectors import qmp  # noqa: E402
from exploits.driver import DriverConfig, MSFExploitDriver  # noqa: E402
from exploits.modules import load_module_config  # noqa: E402
from exploits.msfrpc import MSFRpcClient, MSFRpcConfig  # noqa: E402
from orchestrator.episode import EpisodeConfig, EpisodeRunner  # noqa: E402
from samples.manifest import SampleManifest  # noqa: E402


# Phase schedule for one episode: a list of (phase name, duration in seconds)
# pairs. main() hands it to EpisodeConfig as ``phase_schedule`` and uses the
# sum of the durations (88 s with the values below) as the episode length.
# Presumably the runner walks the entries in list order — confirm against
# orchestrator.episode.
#
# Same envelope shape as Tier 2 so plots are comparable. Slightly more
# armed/infecting time because real exploit fire + session establishment
# takes hundreds of ms to a few seconds.
DEFAULT_SCHEDULE = [
    ("clean",            10.0),
    ("armed",             3.0),
    ("infecting",         5.0),
    ("infected_running", 25.0),
    ("dormant",          15.0),
    ("infected_running", 20.0),
    ("dormant",           5.0),
    ("clean",             5.0),
]


def _wait_for_path(path: Path, timeout_s: float) -> None:
    """Block until *path* exists and contains non-whitespace text.

    The file is polled roughly every 0.2 s. It is always probed at least
    once, and once more right at the deadline, so an already-populated file
    is accepted even when ``timeout_s`` is zero or negative.

    Args:
        path: File to watch (e.g. a pidfile written by another process).
        timeout_s: Maximum number of seconds to keep polling.

    Raises:
        TimeoutError: *path* was still missing or blank at the deadline.
    """
    poll_s = 0.2
    deadline = time.monotonic() + timeout_s
    while True:
        try:
            # EAFP: read directly instead of exists()+read_text(), which can
            # race with the writer creating or replacing the file.
            if path.read_text().strip():
                return
        except FileNotFoundError:
            # Not created yet (or removed between polls); keep waiting.
            pass
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline so the final probe happens on time.
        time.sleep(min(poll_s, remaining))
    raise TimeoutError(f"{path} never appeared within {timeout_s}s")


def _wait_for_tcp(host: str, port: int, timeout_s: float) -> None:
    """Legacy readiness probe: wait until a TCP connect to host:port succeeds.

    Only trustworthy when a successful connect really means the guest service
    is up; for SLIRP guests ``_wait_for_serial_login`` is used instead. Kept
    for reference.

    Each attempt connects with a 1 s timeout, then waits up to 0.5 s for a
    single byte from the peer. Silence within that window still counts as
    success. Any ``OSError`` during the attempt is remembered and the probe
    retries after a 1 s pause.

    Raises:
        TimeoutError: no attempt succeeded before ``timeout_s`` elapsed; the
            message carries the most recent error, if any.
    """
    import socket

    give_up_at = time.monotonic() + timeout_s
    last_err: Exception | None = None

    while time.monotonic() < give_up_at:
        try:
            conn = socket.create_connection((host, port), timeout=1.0)
            with conn:
                conn.settimeout(0.5)
                try:
                    conn.recv(1)
                except socket.timeout:
                    # Peer accepted but stayed quiet: good enough.
                    pass
        except OSError as exc:
            last_err = exc
            time.sleep(1.0)
        else:
            return

    raise TimeoutError(
        f"target service {host}:{port} not reachable within {timeout_s}s "
        f"(last: {last_err})"
    )


def _wait_for_serial_login(
    serial_sock: "Path",
    timeout_s: float,
    prompt: bytes = b"login:",
) -> None:
    """Wait for a shell login prompt on the QEMU serial console.

    SLIRP completes the TCP handshake before the guest OS boots, making
    TCP-based readiness probes on port 139/445 unreliable (they return
    immediately even when Samba isn't running yet). The serial console is
    authoritative: we connect right after QEMU writes its pidfile (before
    the guest produces any output) and stream boot messages until the
    prompt appears.

    QEMU's serial chardev is ``server=on,wait=off``: the socket is created
    at QEMU startup. Data written before a client connects is discarded, so
    we must connect before the prompt appears. Since the pidfile is written
    after QEMU finishes device init (well before the guest kernel loads), we
    reliably connect in time.

    Args:
        serial_sock: Path of the UNIX stream socket backing the console.
        timeout_s: Overall budget in seconds, covering both the wait for the
            socket file to appear and the wait for the prompt.
        prompt: Byte string to look for. Matching is ASCII case-insensitive
            on both sides, so ``b"Login:"`` and ``b"login:"`` are equivalent.

    Raises:
        TimeoutError: the socket file never appeared, the prompt was not
            seen before the deadline, or the console was closed before the
            prompt was seen (the message says which).
    """
    import socket as _socket

    deadline = time.monotonic() + timeout_s
    while not serial_sock.exists():
        if time.monotonic() >= deadline:
            raise TimeoutError(f"serial socket {serial_sock} never appeared")
        time.sleep(0.2)

    # Normalise the needle too: the console data is lower-cased before the
    # search, so a prompt containing uppercase would otherwise never match.
    needle = prompt.lower()
    # Bytes to carry over between reads so a prompt split across two recv()
    # calls is still found without rescanning everything received so far.
    carry = max(len(needle) - 1, 0)
    # Only a bounded tail is retained (for the search overlap and for the
    # diagnostic in the error message), so a chatty boot can't grow memory.
    keep = max(200, carry)
    tail = b""
    console_closed = False

    with _socket.socket(_socket.AF_UNIX, _socket.SOCK_STREAM) as sock:
        sock.settimeout(2.0)
        sock.connect(str(serial_sock))
        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Clamp the read timeout so a quiet console can't hold us past
            # the overall deadline.
            sock.settimeout(min(2.0, remaining))
            try:
                chunk = sock.recv(4096)
            except _socket.timeout:
                continue
            if not chunk:
                # EOF: the other end closed the console; waiting is futile.
                console_closed = True
                break
            overlap = tail[-carry:] if carry else b""
            if needle in (overlap + chunk).lower():
                return
            tail = (tail + chunk)[-keep:]

    shown = tail[-200:]
    if console_closed:
        reason = "serial console closed before login prompt was seen"
    else:
        reason = f"login prompt not seen on serial console within {timeout_s}s"
    raise TimeoutError(f"{reason} (last {len(shown)} bytes: {shown!r})")


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser; several defaults come from env vars."""
    parser = argparse.ArgumentParser(prog="run_tier3_demo")
    parser.add_argument("--data-root", default="data")
    parser.add_argument("--interval-ms", type=int, default=100)
    parser.add_argument(
        "--module",
        default="vsftpd_234_backdoor",
        help="Module config name in exploits/modules/<name>.toml",
    )
    parser.add_argument(
        "--target-ip",
        default="127.0.0.1",
        help="Address the exploit module sets RHOSTS to. With the SLIRP "
        "launcher (default), the guest's vulnerable port is hostfwd'd to "
        "loopback; on a host-only bridge, this is the guest's bridge IP.",
    )
    parser.add_argument(
        "--target-port",
        type=int,
        default=21,
        # Readiness is now detected on the serial console, so this port is
        # no longer probed; it only decides whether RPORT gets overridden.
        help="Port of the vulnerable service as seen from the host. Values "
        "above 1024 are treated as a remapped loopback port and override "
        "the module's RPORT; otherwise the module's own RPORT is used.",
    )
    parser.add_argument(
        "--run-dir",
        # Per-slot defaults so the fleet runner's parallel calls don't
        # collide on the same /tmp dir. See run_real_vm_demo.py for
        # the same fix.
        default=(
            os.environ.get("RUN_DIR")
            or f"/tmp/cis490-target-{os.environ.get('SLOT', '0')}"
        ),
        help="QEMU run dir (sockets + pidfile)",
    )
    parser.add_argument(
        "--msfrpc-host", default=os.environ.get("MSFRPC_HOST", "127.0.0.1"),
    )
    parser.add_argument(
        "--msfrpc-port", type=int,
        default=int(os.environ.get("MSFRPC_PORT", "55553")),
    )
    parser.add_argument(
        "--msfrpc-user", default=os.environ.get("MSFRPC_USER", "msf"),
    )
    parser.add_argument(
        "--keep-vm",
        action="store_true",
        help="leave the VM running after the episode finishes",
    )
    parser.add_argument(
        "--target-boot-timeout",
        type=float,
        default=180.0,
        help="how long to wait for the guest's login prompt on the serial "
        "console",
    )
    parser.add_argument(
        "--sample",
        default=os.environ.get("SAMPLE_NAME"),
        help="Pick a workload profile from the manifest by name. Fleet runner "
        "passes this via SAMPLE_NAME env. Without it, falls back to the v1 yes-loop.",
    )
    parser.add_argument(
        "--manifest",
        default=str(Path(__file__).resolve().parent.parent / "samples" / "manifest.toml"),
    )
    return parser


def _signal_process_group(pid: int, sig: int) -> bool:
    """Send *sig* to the process group of *pid*; False if it no longer exists."""
    try:
        os.killpg(os.getpgid(pid), sig)
    except ProcessLookupError:
        return False
    return True


def _kill_stale_qemu(pid_file: Path, grace_s: float = 1.5) -> None:
    """Best-effort SIGTERM of a QEMU left over from a previous run.

    Reads the pid from *pid_file* and signals its process group. A missing
    or unparsable pidfile, a dead process, or a refused signal are all
    ignored. The grace sleep only happens when a signal was actually sent.
    """
    try:
        stale_pid = int(pid_file.read_text().strip())
        os.killpg(os.getpgid(stale_pid), signal.SIGTERM)
    except (ValueError, OSError):
        # OSError covers FileNotFoundError, ProcessLookupError and
        # PermissionError; none of them should stop a fresh boot.
        return
    time.sleep(grace_s)


def _save_baseline_snapshot(
    qmp_sock: Path, snapshot_name: str, log: logging.Logger
) -> None:
    """Best-effort ``savevm`` over QMP; failures are logged, never raised."""
    if not qmp_sock.exists():
        # Previously skipped silently, which hid why a later revert failed.
        log.warning(
            "no QMP socket at %s; savevm skipped, revert_at_start unusable",
            qmp_sock,
        )
        return
    try:
        _qmp = qmp.QMPClient(qmp_sock)
        _qmp.connect()
        try:
            out = _qmp.savevm(snapshot_name)
            log.info("savevm %s OK: %s", snapshot_name, out.strip()[:160])
        finally:
            _qmp.close()
    except Exception as e:  # deliberate: the episode still runs without it
        log.warning("savevm failed; revert_at_start unusable: %s", e)


def _shutdown_vm(
    proc: subprocess.Popen, log: logging.Logger, grace_s: float = 5.0
) -> None:
    """SIGTERM the launcher's process group, escalating to SIGKILL.

    Tolerates the group vanishing at any point, so that cleanup running in a
    ``finally`` cannot mask the episode's own result or exception, and reaps
    the child after SIGKILL so it does not linger as a zombie.
    """
    _signal_process_group(proc.pid, signal.SIGTERM)
    try:
        proc.wait(timeout=grace_s)
        return
    except subprocess.TimeoutExpired:
        pass
    _signal_process_group(proc.pid, signal.SIGKILL)
    try:
        proc.wait(timeout=grace_s)
    except subprocess.TimeoutExpired:
        log.warning("VM launcher pid=%d still not reaped after SIGKILL", proc.pid)


def main() -> int:
    """Boot the target VM, fire the configured exploit, record one episode.

    Steps: parse arguments, validate configuration, clear any stale run dir,
    start ``vm/launch_target.sh`` in its own session, wait for the QEMU
    pidfile and the guest login prompt, take a best-effort baseline snapshot,
    then run an ``EpisodeRunner`` whose phase changes are forwarded to an
    ``MSFExploitDriver``. The VM is torn down afterwards unless ``--keep-vm``
    was given.

    Returns:
        0 when the episode completed; 2 when ``MSFRPC_PASSWORD`` is unset or
        the module config / named sample cannot be found.

    Raises:
        TimeoutError: the pidfile or login prompt did not show up in time
            (propagated after VM teardown). Errors from the driver or runner
            propagate the same way.
    """
    args = _build_arg_parser().parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    log = logging.getLogger("cis490.run_tier3_demo")

    msfrpc_password = os.environ.get("MSFRPC_PASSWORD")
    if not msfrpc_password:
        log.error("MSFRPC_PASSWORD env var must be set")
        return 2

    repo_root = Path(__file__).resolve().parent.parent
    launcher = repo_root / "vm" / "launch_target.sh"
    modules_dir = repo_root / "exploits" / "modules"
    module_path = modules_dir / f"{args.module}.toml"
    if not module_path.exists():
        log.error("no module config at %s", module_path)
        return 2

    module = load_module_config(module_path)
    log.info("module loaded: %s (%s)", module.name, module.module_path)

    sample = None
    if args.sample:
        manifest = SampleManifest.load(args.manifest)
        sample = next((s for s in manifest.samples if s.name == args.sample), None)
        if sample is None:
            log.error("sample %r not in manifest %s", args.sample, args.manifest)
            return 2
        log.info("sample=%s profile=%s kind=%s",
                 sample.name, sample.profile, sample.kind)

    run_dir = Path(args.run_dir)
    pid_file = run_dir / "qemu.pid"
    # Kill any QEMU still holding this slot's run_dir from a previous wave.
    # QEMU is started with start_new_session=True so it survives orchestrator
    # SIGTERM without explicit cleanup here.
    _kill_stale_qemu(pid_file)
    if run_dir.exists():
        import shutil
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)

    log.info("booting target VM via %s (RUN_DIR=%s)", launcher, run_dir)
    env = os.environ.copy()
    env["RUN_DIR"] = str(run_dir)
    qemu = subprocess.Popen(
        [str(launcher)],
        cwd=str(repo_root),
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # Own session/process group, so teardown can signal the whole group.
        start_new_session=True,
    )

    try:
        _wait_for_path(pid_file, timeout_s=15.0)
        qemu_pid = int(pid_file.read_text().strip())
        serial_sock = run_dir / "serial.sock"
        log.info("qemu pid = %d; waiting for login prompt on serial console (timeout %.0fs)",
                 qemu_pid, args.target_boot_timeout)
        _wait_for_serial_login(serial_sock, timeout_s=args.target_boot_timeout)
        log.info("target guest OS ready (login prompt seen on serial console)")

        # Pre-exploit savevm so EpisodeConfig.revert_at_{start,end}
        # has a known-good baseline to load. Best-effort — we still
        # run the episode if savevm fails (just without revert
        # support). See run_real_vm_demo.py for the same pattern.
        snapshot_name = "baseline-v1"
        _save_baseline_snapshot(run_dir / "qmp.sock", snapshot_name, log)

        client = MSFRpcClient(
            MSFRpcConfig(
                host=args.msfrpc_host,
                port=args.msfrpc_port,
                user=args.msfrpc_user,
                password=msfrpc_password,
            )
        )

        cfg = EpisodeConfig(
            # The host /proc collector samples the QEMU pid from the pidfile,
            # not the launcher script's pid.
            target_pid=qemu_pid,
            duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
            interval_ms=args.interval_ms,
            data_root=Path(args.data_root),
            phase_schedule=DEFAULT_SCHEDULE,
            image_name=module.name + "-target",
            snapshot_name=snapshot_name,
            sample=sample,
            exploit_meta={
                "framework": "metasploit",
                "module": module.module_path,
                "module_type": module.module_type,
                "module_name": module.name,
                "payload": module.payload_path,
                "rport": module.options.get("RPORT"),
                "rhost_template": module.options.get("RHOSTS"),
            },
        )
        runner = EpisodeRunner(cfg)

        driver = MSFExploitDriver(
            client=client,
            module=module,
            cfg=DriverConfig(
                target_ip=args.target_ip,
                # Override RPORT when target_port is an unprivileged host port
                # (i.e. fleet runner remapped the guest's privileged port to a
                # loopback port > 1024). When target_port == module RPORT the
                # caller wants direct guest access; leave RPORT unchanged.
                # NOTE(review): port 1024 itself is unprivileged but is not
                # treated as remapped here — confirm that is intended.
                target_port=args.target_port if args.target_port > 1024 else None,
                sample_store_root=repo_root / "samples" / "store",
            ),
            emit_event=runner.emit_event,
            sample=sample,
        )
        runner.on_phase = driver.set_phase

        driver.setup()
        try:
            result = runner.run()
        finally:
            driver.teardown()

        print()
        print(f"episode_id = {result.episode_id}")
        print(f"path       = {result.episode_dir}")
        print(f"rows_proc  = {result.rows_proc}")
        print(f"phases     = {result.phases_observed}")
        print(f"module     = {module.module_path}")
        print()
        print("To plot:")
        print(f"  uv run python tools/plot_envelope.py {result.episode_dir}")
        return 0
    finally:
        if not args.keep_vm:
            log.info("shutting down VM (pid=%d)", qemu.pid)
            _shutdown_vm(qemu, log)

if __name__ == "__main__":
    sys.exit(main())