# CIS490/tools/vm_load_controller.py

"""In-guest load controller for tier-2 episodes.

Drives a real Alpine VM through the same phase schedule the orchestrator
follows, but the load this time is generated *inside* the guest by busybox
``yes`` / ``dd`` / a small marker file. The host /proc collector still
samples the qemu-system process from outside — what's "real" here is the
workload itself, not the orchestrator's view of it.

Phase commands (all run via the SerialClient):

  clean             — kill any running load, idle.
  armed             — small disk write (handshake-shape).
  infecting         — disk burst: 512 KiB urandom write to /tmp/payload.
  infected_running  — background load for sustained CPU: the workload
                      profile's start command, or ``yes > /dev/null``
                      when no workload profile applies.
  dormant           — kill background load (back to idle).

Designed to mimic the envelope of an XMRig-class compromise without
running real malware. Tier-3 will replace this with msf-driven exploit
fire and a real sample.
"""

from __future__ import annotations

import logging
import sys
from pathlib import Path

from vm_serial import SerialClient

# Allow running as a script: put the parent of tools/ on sys.path so the
# packages that sit beside tools/ (exploits, samples) can be imported.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from exploits.workloads import Workload, workload_for  # noqa: E402
from samples.manifest import Sample  # noqa: E402


log = logging.getLogger("cis490.vm_load_controller")


class VMLoadController:
    """Drives a real Alpine guest through the phase schedule for
    Tier 2 (no exploit). Workload is chosen by ``sample.profile`` —
    same profile catalog as the Tier-3 driver so a fleet wave
    produces matched envelopes whether or not an exploit fires.

    Without a workload (``self.workload is None``) the controller falls
    back to the original cpu-saturate yes-loop (the original Tier-2
    demo behaviour).

    Every guest command is issued through ``serial.run(...)``; return
    values of those calls are ignored.
    """

    def __init__(self, serial: SerialClient, sample: Sample | None = None) -> None:
        """Bind the controller to a serial client and resolve the workload.

        Args:
            serial: Client whose ``run(cmd)`` executes a shell command
                line inside the guest.
            sample: Optional sample; passed to ``workload_for`` to pick
                the workload used for the ``infected_running`` phase.
        """
        self.s = serial
        self.sample = sample
        self.workload: Workload | None = workload_for(sample)

    def setup(self) -> None:
        """Stop any pre-existing load and clear the scratch files."""
        self._kill_load()
        self.s.run("rm -f /tmp/payload /tmp/armed.log; echo setup-ok")

    def teardown(self) -> None:
        """Stop any load this controller (or a previous run) started."""
        self._kill_load()

    # ---- phases ---------------------------------------------------------

    def set_phase(self, phase: str) -> None:
        """Apply the guest-side action for ``phase``.

        Recognised phases:

        * ``clean`` / ``dormant`` — stop the background load.
        * ``armed`` — write a timestamped marker to ``/tmp/armed.log``.
        * ``infecting`` — write 128 x 4 KiB of urandom to ``/tmp/payload``
          and mark it executable.
        * ``infected_running`` — stop any load, then start the workload's
          ``start_cmd`` (or the v1 ``yes`` loop when there is no workload).

        Any other value is logged as a warning and otherwise ignored.
        """
        profile = self.workload.profile if self.workload is not None else "v1"
        log.info("vm phase -> %s (profile=%s)", phase, profile)

        if phase in ("clean", "dormant"):
            self._kill_load()
        elif phase == "armed":
            self.s.run("echo armed-handshake-$(date +%s) > /tmp/armed.log")
        elif phase == "infecting":
            self.s.run(
                "dd if=/dev/urandom of=/tmp/payload bs=4k count=128 2>/dev/null && "
                "chmod +x /tmp/payload"
            )
        elif phase == "infected_running":
            # Never stack a second load on top of one that is still running.
            self._kill_load()
            if self.workload is not None:
                self.s.run(self.workload.start_cmd)
            else:
                # ``disown`` is a bash builtin that busybox ash (the stock
                # Alpine shell) does not provide; unguarded it prints
                # "disown: not found" and leaves a 127 exit status on every
                # transition. nohup plus the redirections already detach
                # the job, so disown is best-effort only.
                self.s.run(
                    "nohup sh -c 'yes > /dev/null' </dev/null >/dev/null 2>&1 & "
                    "disown 2>/dev/null || true"
                )
        else:
            log.warning("unknown phase: %s", phase)

    # ---- internals ------------------------------------------------------

    def _kill_load(self) -> None:
        """Run the workload's stop command (if any), then sweep v1 leftovers."""
        if self.workload is not None:
            self.s.run(self.workload.stop_cmd)
        # Always sweep the v1 leftover commands too, in case we just
        # switched profiles mid-fleet-run. The trailing ``true`` keeps the
        # command line's exit status zero when nothing matched.
        self.s.run("pkill yes 2>/dev/null; pkill stress-ng 2>/dev/null; true")