# CIS490/tools/vm_load_controller.py

"""In-guest load controller for tier-2 episodes.

Drives a real Alpine VM through the same phase schedule the orchestrator
follows, but the load this time is generated *inside* the guest by busybox
``yes`` / ``dd`` / a small marker file. The host /proc collector still
samples the qemu-system process from outside — what's "real" here is the
workload itself, not the orchestrator's view of it.

Phase commands (all run via the SerialClient):

  clean             — kill any running load, idle.
  armed             — small disk write (handshake-shape).
  infecting         — disk burst: 512 KiB urandom write to /tmp/payload.
  infected_running  — background ``yes > /dev/null`` for sustained CPU.
  dormant           — kill background load (back to idle).

Designed to mimic the envelope of an XMRig-class compromise without
running real malware. Tier-3 will replace this with msf-driven exploit
fire and a real sample.
"""

from __future__ import annotations

import logging
import sys
from pathlib import Path
from typing import Callable

from vm_serial import SerialClient

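# Allow running as a script: put the directory two levels above this file on
# sys.path so the packages that sit beside tools/ (exploits, samples) import.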
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from exploits.workloads import Workload, workload_for  # noqa: E402
from samples.manifest import Sample  # noqa: E402


# Module-level logger; the name is a fixed string rather than __name__.
log = logging.getLogger("cis490.vm_load_controller")


# Type of the event sink accepted by VMLoadController. The controller calls
# it as emit(event_name, **fields) and ignores whatever it returns.
EmitEvent = Callable[..., None]


class VMLoadController:
    """Drives a real Alpine guest through the phase schedule for
    Tier 2 (no exploit). Workload is chosen by ``sample.profile`` —
    same profile catalog as the Tier-3 driver so a fleet wave
    produces matched envelopes whether or not an exploit fires.

    When ``workload_for(sample)`` yields no workload, falls back to the
    original cpu-saturate yes-loop (the original Tier-2 demo behaviour),
    reported in events under the profile label ``v1-yes``.

    Every recognised set_phase call emits an event into the runner's
    events.jsonl so we can audit (a) whether the workload command
    actually got sent, (b) whether the guest acknowledged it, and
    (c) whether the expected process is running afterwards. Without
    those events, silent failures (login partial, command swallowed by
    tty) produce well-labeled but information-less episodes — see
    CIS490 history where every phase median'd 20% CPU on elliott-lab.
    An unrecognised phase only logs a warning and emits nothing.

    Args:
        serial: Client whose ``run(cmd)`` executes a shell command in
            the guest. ``_probe`` additionally parses the value that
            ``run`` returns as text.
        sample: Optional sample; handed to ``workload_for`` and its
            ``name`` is stamped into emitted events.
        emit_event: Optional sink called as ``emit_event(name, **fields)``.
            Defaults to a no-op.
    """

    def __init__(
        self,
        serial: SerialClient,
        sample: Sample | None = None,
        emit_event: EmitEvent | None = None,
    ) -> None:
        self.s = serial
        self.sample = sample
        self.workload: Workload | None = workload_for(sample)
        # No-op default so callers don't have to thread an emitter.
        self.emit: EmitEvent = emit_event or (lambda *a, **kw: None)

    def setup(self) -> None:
        """Reset the guest to a known idle state and announce the profile.

        Kills any running load, removes the scratch files the phases
        write, then emits ``workload_setup``. Errors from the serial
        client are not caught here and propagate to the caller.
        """
        # Kill any pre-existing load and clear scratch space.
        self._kill_load()
        self.s.run("rm -f /tmp/payload /tmp/armed.log; echo setup-ok")
        self.emit(
            "workload_setup",
            profile=self._profile_label(),
            sample=self.sample.name if self.sample else None,
        )

    def teardown(self) -> None:
        """Kill whatever load is still running; emits no event."""
        self._kill_load()

    # ---- phases ---------------------------------------------------------

    def set_phase(self, phase: str) -> None:
        """Run the guest-side commands for *phase* and emit an audit event.

        Recognised phases are ``clean``, ``armed``, ``infecting``,
        ``infected_running`` and ``dormant``; anything else is logged as
        a warning and otherwise ignored. Any exception raised while
        handling a phase is logged and converted into a
        ``workload_failed`` event instead of being re-raised.
        """
        log.info("vm phase -> %s (profile=%s)",
                 phase, self.workload.profile if self.workload else "v1")
        try:
            if phase == "clean":
                self._kill_load()
                self._emit_phase("workload_killed", phase)
            elif phase == "armed":
                # Small marker-file write only; no load is started.
                self.s.run("echo armed-handshake-$(date +%s) > /tmp/armed.log")
                self._emit_phase("workload_armed", phase)
            elif phase == "infecting":
                # 128 blocks of 4 KiB = 512 KiB of urandom written to disk.
                self.s.run(
                    "dd if=/dev/urandom of=/tmp/payload bs=4k count=128 2>/dev/null && "
                    "chmod +x /tmp/payload"
                )
                self._emit_phase("workload_infecting", phase)
            elif phase == "infected_running":
                # Clear any earlier load first so at most one is running.
                self._kill_load()
                if self.workload is not None:
                    self.s.run(self.workload.start_cmd)
                else:
                    self.s.run(
                        "nohup sh -c 'yes > /dev/null' </dev/null >/dev/null 2>&1 & disown"
                    )
                self._emit_phase("workload_started", phase)
            elif phase == "dormant":
                # Probe BEFORE we kill so we see whether the workload
                # was actually running. If the probe says nothing was
                # running, the previous infected_running was a no-op
                # and the trainer should filter this episode.
                probe = self._probe()
                self._kill_load()
                self._emit_phase("workload_killed", phase, pre_kill_probe=probe)
            else:
                log.warning("unknown phase: %s", phase)
        except Exception as e:
            # Don't propagate — the runner already swallows on_phase
            # exceptions. But DO record so the episode is filterable.
            log.exception("set_phase(%s) failed", phase)
            self.emit(
                "workload_failed",
                phase=phase,
                error=str(e)[:200],
                profile=self._profile_label(),
            )

    # ---- internals ------------------------------------------------------

    def _profile_label(self) -> str:
        """Return the profile name stamped into events.

        This is the workload's ``profile`` when a workload is set, and
        ``"v1-yes"`` for the built-in yes-loop fallback.
        """
        return self.workload.profile if self.workload else "v1-yes"

    def _kill_load(self) -> None:
        """Stop the profile's workload, then sweep v1 leftovers.

        The sweep is issued even when the profile's ``stop_cmd`` raises,
        so a failed stop still attempts to clear stray ``yes`` /
        ``stress-ng`` processes. The stop error continues to propagate
        afterwards; if the sweep raises as well, the sweep's exception
        is the one the caller sees, chained to the stop error.
        """
        try:
            if self.workload is not None:
                self.s.run(self.workload.stop_cmd)
        finally:
            # Always sweep the v1 leftover commands too, in case we just
            # switched profiles mid-fleet-run.
            self.s.run("pkill yes 2>/dev/null; pkill stress-ng 2>/dev/null; true")

    def _probe(self) -> dict:
        """Ask the guest what's actually running. Returns a small dict
        the caller stamps into the event so trainers can detect the
        "workload didn't fire" case from meta alone.

        Counts processes via ``pgrep <name> | wc -l`` rather than
        ``pgrep -c <name>``: the latter is a procps-ng/util-linux flag
        and is NOT supported by busybox's pgrep (Alpine guests). On
        busybox, ``pgrep -c`` exits 1 with a usage banner, the
        ``|| echo 0`` fallback always fires, and the probe reports
        false zeros. See spectral/CIS490#15 — this caused 244 episodes
        from elliott-thinkpad and k-gamingcom to be incorrectly
        labelled workload-silent even when the workload was running.

        Returns:
            A mapping built from every ``key=value`` line of the
            command output (both sides stripped, values kept as
            strings). If anything raises, the result is instead
            ``{"probe_error": <first 120 chars of the error>}``.
        """
        try:
            out = self.s.run(
                "echo yes=$(pgrep yes 2>/dev/null | wc -l); "
                "echo sh=$(pgrep sh 2>/dev/null | wc -l); "
                "echo loadavg=$(awk '{print $1}' /proc/loadavg)"
            )
            stats: dict = {}
            for line in out.splitlines():
                line = line.strip()
                # Skip anything that is not a key=value pair.
                if "=" not in line:
                    continue
                # Split on the first "=" only; the value may contain more.
                k, _, v = line.partition("=")
                stats[k.strip()] = v.strip()
            return stats
        except Exception as e:
            # Best effort: a failed probe must not abort the phase change.
            return {"probe_error": str(e)[:120]}

    def _emit_phase(self, event: str, phase: str, **extra) -> None:
        """Emit *event* tagged with the phase, profile label and sample name.

        Any ``extra`` keyword arguments are forwarded to the sink as
        additional event fields.
        """
        self.emit(
            event,
            phase=phase,
            profile=self._profile_label(),
            sample=self.sample.name if self.sample else None,
            **extra,
        )