# CIS490/tools/run_tier3_demo.py

"""Tier-3: real VM, real exploit, honest ``armed -> infecting`` transition.

Boots the vulnerable target VM, drives an msfrpcd-fired exploit module
against it, and lets the orchestrator's host /proc collector sample
the qemu-system pid throughout. Compared to ``run_real_vm_demo.py``:
the workload that crosses the ``armed -> infecting`` boundary is now
generated by an actual exploit landing a session, not by a script in
the guest.

Prereqs:
  - vm/images/<target>.qcow2 (e.g. Metasploitable2)
  - msfrpcd running locally:
        msfrpcd -P <password> -U msf -a 127.0.0.1 -p 55553
  - ``msgpack`` python package installed (added to runtime deps)

Run:
    MSFRPC_PASSWORD=<pass> uv run python tools/run_tier3_demo.py \\
        --module vsftpd_234_backdoor \\
        --data-root data
"""

from __future__ import annotations

import argparse
import logging
import os
import signal
import subprocess
import sys
import time
from pathlib import Path

# Allow running as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from exploits.driver import DriverConfig, MSFExploitDriver  # noqa: E402
from exploits.modules import load_module_config  # noqa: E402
from exploits.msfrpc import MSFRpcClient, MSFRpcConfig  # noqa: E402
from orchestrator.episode import EpisodeConfig, EpisodeRunner  # noqa: E402
from samples.manifest import SampleManifest  # noqa: E402


# Same envelope shape as Tier 2 so plots are comparable. Slightly more
# armed/infecting time because real exploit fire + session establishment
# takes hundreds of ms to a few seconds.
#
# Each entry is a (phase name, duration) pair. main() sums the durations to
# obtain the episode's ``duration_s`` (88.0 in total for this schedule) and
# passes the list unchanged as ``phase_schedule``. Phases run in the order
# listed; a phase name may appear more than once.
DEFAULT_SCHEDULE = [
    ("clean",            10.0),
    ("armed",             3.0),
    ("infecting",         5.0),
    ("infected_running", 25.0),
    ("dormant",          15.0),
    ("infected_running", 20.0),
    ("dormant",           5.0),
    ("clean",             5.0),
]


def _wait_for_path(path: Path, timeout_s: float) -> None:
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if path.exists() and path.read_text().strip():
            return
        time.sleep(0.2)
    raise TimeoutError(f"{path} never appeared within {timeout_s}s")


def _wait_for_tcp(host: str, port: int, timeout_s: float) -> None:
    import socket
    deadline = time.monotonic() + timeout_s
    last_err: Exception | None = None
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return
        except OSError as e:
            last_err = e
            time.sleep(1.0)
    raise TimeoutError(
        f"target service {host}:{port} not reachable within {timeout_s}s "
        f"(last: {last_err})"
    )


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Tier-3 demo.

    The msfrpc connection options and ``--sample`` take their defaults from
    the ``MSFRPC_HOST`` / ``MSFRPC_PORT`` / ``MSFRPC_USER`` / ``SAMPLE_NAME``
    environment variables, read at the time this function is called.
    """
    parser = argparse.ArgumentParser(prog="run_tier3_demo")
    parser.add_argument("--data-root", default="data")
    parser.add_argument("--interval-ms", type=int, default=100)
    parser.add_argument(
        "--module",
        default="vsftpd_234_backdoor",
        help="Module config name in exploits/modules/<name>.toml",
    )
    parser.add_argument(
        "--target-ip",
        default="127.0.0.1",
        help="Address the exploit module sets RHOSTS to. With the SLIRP "
        "launcher (default), the guest's vulnerable port is hostfwd'd to "
        "loopback; on a host-only bridge, this is the guest's bridge IP.",
    )
    parser.add_argument(
        "--target-port",
        type=int,
        default=21,
        help="Probe port to wait on before firing the exploit",
    )
    parser.add_argument(
        "--run-dir",
        default="/tmp/cis490-target",
        help="QEMU run dir (sockets + pidfile)",
    )
    parser.add_argument(
        "--msfrpc-host", default=os.environ.get("MSFRPC_HOST", "127.0.0.1"),
    )
    parser.add_argument(
        "--msfrpc-port", type=int,
        default=int(os.environ.get("MSFRPC_PORT", "55553")),
    )
    parser.add_argument(
        "--msfrpc-user", default=os.environ.get("MSFRPC_USER", "msf"),
    )
    parser.add_argument(
        "--keep-vm",
        action="store_true",
        help="leave the VM running after the episode finishes",
    )
    parser.add_argument(
        "--target-boot-timeout",
        type=float,
        default=180.0,
        help="how long to wait for the guest's vulnerable service to listen",
    )
    parser.add_argument(
        "--sample",
        default=os.environ.get("SAMPLE_NAME"),
        help="Pick a workload profile from the manifest by name. Fleet runner "
        "passes this via SAMPLE_NAME env. Without it, falls back to the v1 yes-loop.",
    )
    parser.add_argument(
        "--manifest",
        default=str(Path(__file__).resolve().parent.parent / "samples" / "manifest.toml"),
    )
    return parser


def _shutdown_vm(
    proc: subprocess.Popen,
    log: logging.Logger,
    grace_s: float = 5.0,
) -> None:
    """Stop the launcher's whole process group: SIGTERM, then SIGKILL.

    Best-effort by design: it runs from a ``finally`` clause, so a process
    (group) that is already gone must not raise and mask the original error.
    The launcher is always reaped so no zombie is left behind.

    Args:
        proc: The launcher process, started in its own session.
        log: Logger used for progress / warning messages.
        grace_s: Seconds to wait after each signal before escalating.
    """
    log.info("shutting down VM (pid=%d)", proc.pid)
    try:
        # Resolve the group id once, while the pid still refers to our child.
        pgid = os.getpgid(proc.pid)
    except ProcessLookupError:
        pgid = None

    if pgid is not None:
        try:
            os.killpg(pgid, signal.SIGTERM)
        except ProcessLookupError:
            pass
    try:
        proc.wait(timeout=grace_s)
        return
    except subprocess.TimeoutExpired:
        pass

    # Graceful stop did not work; escalate. The group may vanish between the
    # timeout and this call, hence the guard.
    if pgid is not None:
        try:
            os.killpg(pgid, signal.SIGKILL)
        except ProcessLookupError:
            pass
    try:
        proc.wait(timeout=grace_s)  # reap after SIGKILL
    except subprocess.TimeoutExpired:
        log.warning("VM launcher (pid=%d) still alive after SIGKILL", proc.pid)


def main() -> int:
    """Run one Tier-3 episode end to end.

    Steps: parse the CLI, load the exploit module config (and, optionally, a
    sample from the manifest), recreate the run dir, boot the target VM via
    ``vm/launch_target.sh``, wait for the QEMU pidfile and for the target TCP
    port, then run an ``EpisodeRunner`` with an ``MSFExploitDriver`` hooked in
    as phase callback. Finally print a short summary. Unless ``--keep-vm`` is
    given, the VM launcher's process group is stopped on the way out, whether
    the episode succeeded or not.

    Returns:
        0 after a completed episode; 2 when ``MSFRPC_PASSWORD`` is unset, the
        module config file is missing, or the requested sample is not in the
        manifest.

    Raises:
        TimeoutError: the pidfile or the target service did not show up in
            time (propagated from the wait helpers).
    """
    args = _build_arg_parser().parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    log = logging.getLogger("cis490.run_tier3_demo")

    # The password is only accepted via the environment, never via argv.
    msfrpc_password = os.environ.get("MSFRPC_PASSWORD")
    if not msfrpc_password:
        log.error("MSFRPC_PASSWORD env var must be set")
        return 2

    repo_root = Path(__file__).resolve().parent.parent
    launcher = repo_root / "vm" / "launch_target.sh"
    modules_dir = repo_root / "exploits" / "modules"
    module_path = modules_dir / f"{args.module}.toml"
    if not module_path.exists():
        log.error("no module config at %s", module_path)
        return 2

    module = load_module_config(module_path)
    log.info("module loaded: %s (%s)", module.name, module.module_path)

    sample = None
    if args.sample:
        manifest = SampleManifest.load(args.manifest)
        sample = next((s for s in manifest.samples if s.name == args.sample), None)
        if sample is None:
            log.error("sample %r not in manifest %s", args.sample, args.manifest)
            return 2
        log.info("sample=%s profile=%s kind=%s",
                 sample.name, sample.profile, sample.kind)

    # Start from an empty run dir so a stale pidfile from an earlier run can
    # never be mistaken for the VM we are about to boot.
    run_dir = Path(args.run_dir)
    if run_dir.exists():
        import shutil
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)
    pid_file = run_dir / "qemu.pid"

    log.info("booting target VM via %s (RUN_DIR=%s)", launcher, run_dir)
    env = os.environ.copy()
    env["RUN_DIR"] = str(run_dir)
    # start_new_session puts the launcher in its own process group, which is
    # what lets the shutdown path signal the group as a whole.
    qemu = subprocess.Popen(
        [str(launcher)],
        cwd=str(repo_root),
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )

    try:
        _wait_for_path(pid_file, timeout_s=15.0)
        qemu_pid = int(pid_file.read_text().strip())
        log.info("qemu pid = %d; waiting for service on %s:%d (timeout %.0fs)",
                 qemu_pid, args.target_ip, args.target_port,
                 args.target_boot_timeout)
        _wait_for_tcp(args.target_ip, args.target_port, args.target_boot_timeout)
        log.info("target service is up")

        client = MSFRpcClient(
            MSFRpcConfig(
                host=args.msfrpc_host,
                port=args.msfrpc_port,
                user=args.msfrpc_user,
                password=msfrpc_password,
            )
        )

        cfg = EpisodeConfig(
            target_pid=qemu_pid,
            duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
            interval_ms=args.interval_ms,
            data_root=Path(args.data_root),
            phase_schedule=DEFAULT_SCHEDULE,
            image_name=module.name + "-target",
            snapshot_name="qcow2-snapshot-on",
        )
        runner = EpisodeRunner(cfg)

        driver = MSFExploitDriver(
            client=client,
            module=module,
            cfg=DriverConfig(
                target_ip=args.target_ip,
                sample_store_root=repo_root / "samples" / "store",
            ),
            emit_event=runner.emit_event,
            sample=sample,
        )
        # The runner notifies the driver on every phase change.
        runner.on_phase = driver.set_phase

        driver.setup()
        try:
            result = runner.run()
        finally:
            driver.teardown()

        print()
        print(f"episode_id = {result.episode_id}")
        print(f"path       = {result.episode_dir}")
        print(f"rows_proc  = {result.rows_proc}")
        print(f"phases     = {result.phases_observed}")
        print(f"module     = {module.module_path}")
        print()
        print("To plot:")
        print(f"  uv run python tools/plot_envelope.py {result.episode_dir}")
        return 0
    finally:
        if not args.keep_vm:
            _shutdown_vm(qemu, log)


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())