CIS490/exploits/driver.py

"""Tier-3 exploit driver.

Plugged into ``EpisodeRunner`` as the ``on_phase`` callback. Translates
the closed phase enum into msfrpc actions:

  clean             — idle. (no-op; exploit hasn't fired yet)
  armed             — module loaded + options applied; module fires
                      with ``module.execute``. Driver records the fire
                      timestamp via ``emit_event`` so the labeler can
                      align ``armed`` with what's actually happening.
  infecting         — poll for a new session; on session_open, run a
                      one-shot landing command (``id`` or similar) so
                      we have a clear "session is responsive" event.
  infected_running  — start observable workload inside the session.
  dormant           — kill the workload, leave the session alive.
  reverting         — kill session, snapshot revert handled by caller.

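The lifecycle events the driver writes match the schema in
``docs/data-model.md``: ``exploit_fire``, ``session_open``,
``sample_executed``, ``session_dormant``, ``session_killed``. It also
emits ``driver_setup``, ``session_open_timeout`` and
``session_landing_probe`` for bookkeeping.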

The driver does NOT author exploits or pick payloads at runtime — those
choices live in ``exploits/modules/*.toml``. The driver is a pure
adapter between the phase machine and msfrpc.
"""

from __future__ import annotations

import logging
import time
from dataclasses import dataclass
from typing import Callable

from pathlib import Path

from samples.manifest import Sample

from .modules import ModuleConfig
from .msfrpc import MSFRpcClient, wait_for_new_session
from .workloads import Workload, real_binary_workload, workload_for


# Module-level logger; every driver method below logs through it.
log = logging.getLogger("cis490.exploits.driver")

# Event sink handed to the driver by its caller. The driver always invokes it
# as ``emit(event_name, **fields)`` and ignores whatever it returns.
EmitEvent = Callable[..., None]


@dataclass
class DriverConfig:
    """Per-episode settings consumed by ``MSFExploitDriver``.

    Only ``target_ip`` is required; every other field has a default.
    """

    # Address handed to the module's option renderer as ``target_ip``.
    target_ip: str
    # Timeout (seconds) passed to the new-session wait during ``infecting``.
    session_open_timeout_s: float = 30.0
    # v1 fallback workload: these two commands are used only when the
    # driver has no Sample-derived workload (Sample-driven runs take
    # their start/stop commands from exploits.workloads instead). Kept
    # so pre-Sample callers continue to work without changes.
    workload_cmd: str = "yes > /dev/null"
    workload_kill_cmd: str = "pkill yes; true"
    # Root directory on the lab host holding staged real-malware
    # binaries; ``None`` disables the real-binary workload path.
    sample_store_root: Path | None = None


class MSFExploitDriver:
    """Phase-to-msfrpc adapter. One instance per episode.

    When constructed with a ``Sample``, the driver dispatches the
    ``infected_running`` / ``dormant`` workload through
    ``exploits.workloads`` so the in-session behaviour matches the
    sample's profile (cpu-saturate, scan-and-dial, io-walk, bursty-c2,
    low-and-slow, shell-resident). Without a sample, falls back to
    the v1 single-command workload — useful for the very first
    Tier-3 smoke runs.

    ``teardown`` can be reached both directly and via
    ``set_phase("reverting")``. It is idempotent: once it has completed,
    further calls are no-ops until ``setup`` runs again, so a session is
    stopped (and ``session_killed`` emitted) at most once.
    """

    def __init__(
        self,
        client: MSFRpcClient,
        module: ModuleConfig,
        cfg: DriverConfig,
        emit_event: EmitEvent,
        *,
        sample: Sample | None = None,
    ) -> None:
        """Store collaborators and resolve the workload for ``sample``.

        ``emit_event`` is called as ``emit_event(name, **fields)`` for
        every event the driver records.
        """
        self.client = client
        self.module = module
        self.cfg = cfg
        self.emit = emit_event
        self.sample = sample
        # Needs self.cfg, so it must come after the assignments above.
        self.workload: Workload | None = self._resolve_workload(sample)

        self._sessions_seen_at_arm: set[int] = set()
        self._session_id: int | None = None
        self._job_id: int | str | None = None
        self._fired = False
        # Set once teardown() has completed; cleared again by setup().
        self._torn_down = False

    def _resolve_workload(self, sample: Sample | None) -> Workload | None:
        """Pick the best workload for this sample:
          1. real binary (if staged at samples/store/<sha256>) → upload + exec
          2. profile mimic from exploits.workloads
          3. None → driver v1 fallback (yes-loop)
        """
        if sample is None:
            return None
        if sample.kind == "real" and self.cfg.sample_store_root is not None:
            bin_path = sample.binary_path(self.cfg.sample_store_root)
            if bin_path is not None:
                try:
                    payload = bin_path.read_bytes()
                    return real_binary_workload(payload, sample=sample)
                except OSError as e:
                    # Unreadable binary is not fatal: use the profile mimic.
                    log.warning("could not read real sample %s: %s; falling back", bin_path, e)
        return workload_for(sample)

    # ---- lifecycle ------------------------------------------------------

    def setup(self) -> None:
        """Authenticate and snapshot the pre-existing session set so we
        can recognize a *new* session as the one we just opened.

        Also re-arms ``teardown`` and emits a ``driver_setup`` event
        describing the module, target and (if any) sample/workload.
        """
        self.client.login()
        # A fresh login means there is something for teardown() to undo again.
        self._torn_down = False
        self._sessions_seen_at_arm = set(self.client.session_list().keys())
        self.emit(
            "driver_setup",
            module=self.module.module_path,
            payload=self.module.payload_path,
            target_ip=self.cfg.target_ip,
            preexisting_sessions=sorted(self._sessions_seen_at_arm),
            sample=self.sample.name if self.sample else None,
            sample_kind=self.sample.kind if self.sample else None,
            sample_sha256=self.sample.sha256 if self.sample else None,
            workload_profile=self.workload.profile if self.workload else None,
        )

    def teardown(self) -> None:
        """Stop the session and job (best effort), then log out.

        Idempotent: after one completed call, further calls do nothing
        until ``setup`` is called again. Failures from ``session_stop`` /
        ``job_stop`` are logged and swallowed; an exception from
        ``logout`` propagates, and a later retry only repeats the logout.
        """
        if self._torn_down:
            return

        # Forget the ids up front so no later callback (or a retried
        # teardown) ever targets a session/job we already tried to stop.
        session_id, self._session_id = self._session_id, None
        job_id, self._job_id = self._job_id, None

        if session_id is not None:
            try:
                self.client.session_stop(session_id)
                self.emit("session_killed", session_id=session_id)
            except Exception:
                log.exception("session.stop on %s", session_id)
        if job_id is not None:
            try:
                self.client.job_stop(job_id)
            except Exception:
                log.debug(
                    "job.stop on %s (often already gone)", job_id, exc_info=True,
                )
        self.client.logout()
        self._torn_down = True

    # ---- phase callback -------------------------------------------------

    def set_phase(self, phase: str) -> None:
        """``on_phase`` callback: run the action bound to ``phase``.

        ``clean`` is a no-op; an unrecognized phase is logged and ignored.
        """
        log.info("driver phase -> %s", phase)
        if phase == "clean":
            return
        if phase == "armed":
            self._fire()
        elif phase == "infecting":
            self._await_session()
        elif phase == "infected_running":
            self._start_workload()
        elif phase == "dormant":
            self._stop_workload()
        elif phase == "reverting":
            self.teardown()
        else:
            log.warning("unknown phase: %s", phase)

    # ---- actions --------------------------------------------------------

    def _fire(self) -> None:
        """Execute the configured module once and remember its job id.

        ``exploit_fire`` is emitted *before* ``module_execute`` is called.
        The ``PASSWORD`` option is left out of the emitted event but is
        still sent to the module. Repeat calls after a successful fire
        are skipped.
        """
        if self._fired:
            log.debug("module already fired; skipping re-fire")
            return
        opts = self.module.render_options(target_ip=self.cfg.target_ip)
        self.emit(
            "exploit_fire",
            module=self.module.module_path,
            options={k: v for k, v in opts.items() if k != "PASSWORD"},
        )
        resp = self.client.module_execute(
            self.module.module_type, self.module.module_path, opts,
        )
        self._job_id = resp.get("job_id")
        self._fired = True

    def _await_session(self) -> None:
        """Wait for a session that was not in the ``setup`` snapshot.

        Delegates the wait to ``wait_for_new_session``, which is treated
        as returning ``None`` on timeout or a ``(session_id, info)`` pair.
        On timeout emits ``session_open_timeout``; on success records the
        session id, emits ``session_open`` and runs a best-effort ``id``
        landing probe. Does nothing if a session is already recorded.
        """
        if self._session_id is not None:
            return
        result = wait_for_new_session(
            self.client,
            seen=self._sessions_seen_at_arm,
            timeout_s=self.cfg.session_open_timeout_s,
        )
        if result is None:
            self.emit(
                "session_open_timeout",
                module=self.module.module_path,
                timeout_s=self.cfg.session_open_timeout_s,
            )
            log.warning(
                "no session opened within %.1fs", self.cfg.session_open_timeout_s,
            )
            return
        sid, info = result
        self._session_id = sid
        self.emit(
            "session_open",
            session_id=sid,
            session_type=info.get("type"),
            tunnel_peer=info.get("tunnel_peer"),
        )
        # Landing probe so we have a known-good RTT marker on the wire.
        # Any failure here is logged only; the session stays recorded.
        try:
            self.client.session_shell_write(sid, "id")
            time.sleep(0.5)
            out = self.client.session_shell_read(sid)
            self.emit("session_landing_probe", session_id=sid, output=out.strip()[:256])
        except Exception:
            log.exception("landing probe on session %s", sid)

    def _start_workload(self) -> None:
        """Start the workload in the open session and emit ``sample_executed``.

        Uses the resolved ``Workload`` when there is one, otherwise the
        v1 ``cfg.workload_cmd`` wrapped so it runs detached in the
        background. Skipped (with a warning) when no session is recorded.
        """
        if self._session_id is None:
            log.warning("infected_running with no session — skipping workload")
            return
        if self.workload is not None:
            # Driver v2 — profile-matched workload.
            self.client.session_shell_write(self._session_id, self.workload.start_cmd)
            self.emit(
                "sample_executed",
                session_id=self._session_id,
                profile=self.workload.profile,
                description=self.workload.description,
                sample=self.sample.name if self.sample else None,
            )
        else:
            # Driver v1 fallback.
            self.client.session_shell_write(
                self._session_id,
                f"nohup sh -c {_shquote(self.cfg.workload_cmd)} </dev/null "
                f">/dev/null 2>&1 & disown",
            )
            self.emit(
                "sample_executed",
                session_id=self._session_id,
                command=self.cfg.workload_cmd,
            )

    def _stop_workload(self) -> None:
        """Send the workload's stop command and emit ``session_dormant``.

        The session itself is left open. Silently skipped when no session
        is recorded.
        """
        if self._session_id is None:
            return
        if self.workload is not None:
            self.client.session_shell_write(self._session_id, self.workload.stop_cmd)
        else:
            self.client.session_shell_write(
                self._session_id, self.cfg.workload_kill_cmd,
            )
        self.emit(
            "session_dormant",
            session_id=self._session_id,
            profile=self.workload.profile if self.workload else None,
        )


def _shquote(s: str) -> str:
    # Minimal POSIX single-quote escaping. The workload command is set
    # by us, not by anything user-controlled, so we just need to handle
    # embedded single quotes correctly for completeness.
    return "'" + s.replace("'", "'\\''") + "'"