CIS490/tests/test_fleet.py

"""Tests for fleet capacity calculation + sample manifest selection.

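Capacity is unit-tested via deterministic monkeypatching of the /proc
readers and os.cpu_count so the math is exercised independently of the
host running the suite. Sample selection has its own tests covering the
"different hosts pick different samples" property. The fleet dispatch
tests further down replace subprocess.run with a recorder, so they check
which runner would be invoked without launching anything.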
"""

from __future__ import annotations

from pathlib import Path

import pytest

from orchestrator import fleet
from samples.manifest import Sample, SampleManifest


REPO_ROOT = Path(__file__).resolve().parent.parent


# ---------------------------------------------------------------------------
# Capacity
# ---------------------------------------------------------------------------


def _patch_capacity_inputs(
    monkeypatch,
    *,
    cores: int,
    ram_total_mib: int,
    ram_available_mib: int,
    load_1m: float = 0.0,
) -> None:
    """Stub the host probes that the capacity tests depend on.

    Replaces ``cpu_count`` on ``fleet.os`` plus ``fleet._read_meminfo``
    and ``fleet._read_loadavg`` with lambdas returning the given values.
    The two RAM arguments are in MiB and are scaled by 1024 * 1024
    before being placed in the stubbed meminfo mapping.
    """
    scale = 1024 * 1024
    stub_meminfo = {
        "MemTotal": ram_total_mib * scale,
        "MemAvailable": ram_available_mib * scale,
    }
    monkeypatch.setattr(fleet.os, "cpu_count", lambda: cores)
    # Hand out a fresh copy on every call so callers never share state.
    monkeypatch.setattr(fleet, "_read_meminfo", lambda: dict(stub_meminfo))
    monkeypatch.setattr(fleet, "_read_loadavg", lambda: load_1m)


def test_capacity_8core_idle_box(monkeypatch) -> None:
    """An idle 8-core host with ample RAM is limited by its core count."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=8,
        ram_total_mib=16384,
        ram_available_mib=14000,
    )
    cap = fleet.detect_capacity(ram_per_vm_mib=320)

    # One core is held back (8 // 8 = 1), which leaves seven for VMs.
    assert (cap.cores_total, cap.cores_reserved, cap.max_by_cores) == (8, 1, 7)
    # RAM is plentiful and the box is idle, so cores are the binding limit.
    assert cap.max_concurrent == 7
    assert "binding=cores" in cap.rationale


def test_capacity_low_ram_caps_below_cores(monkeypatch) -> None:
    """With 8 cores but only 2 GiB available, RAM is the tighter limit."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=8,
        ram_total_mib=4096,
        ram_available_mib=2048,
    )
    cap = fleet.detect_capacity(ram_per_vm_mib=320)

    # Expected arithmetic:
    #   headroom   = max(1024, 4096 // 8) = 1024
    #   max_by_ram = (2048 - 1024) // 320 = 3
    expected_vms = 3
    assert cap.max_by_ram == expected_vms
    assert cap.max_concurrent == expected_vms


def test_capacity_high_load_halves_concurrency(monkeypatch) -> None:
    """A busy 8-core host gets its core-based limit halved."""
    # load_1m / cores = 7 / 8 = 0.875, which is above the 0.75 cut-off.
    _patch_capacity_inputs(
        monkeypatch,
        cores=8,
        ram_total_mib=16384,
        ram_available_mib=14000,
        load_1m=7.0,
    )
    cap = fleet.detect_capacity(ram_per_vm_mib=320)

    # max_by_cores is 7, so max_by_load = max(1, 7 // 2) = 3.
    expected_vms = 3
    assert cap.max_by_load == expected_vms
    assert cap.max_concurrent == expected_vms


def test_capacity_pi5_class(monkeypatch) -> None:
    """A 4-core, ~8 GiB host keeps one core back and runs 3 VMs at once."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=4,
        ram_total_mib=7951,
        ram_available_mib=5223,
    )
    cap = fleet.detect_capacity(ram_per_vm_mib=320)

    assert cap.cores_total == 4
    assert cap.max_concurrent == 3


def test_capacity_minimal_box(monkeypatch) -> None:
    """A single-core host with 1 GiB of RAM gets zero concurrent VMs."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=1,
        ram_total_mib=1024,
        ram_available_mib=512,
    )
    cap = fleet.detect_capacity(ram_per_vm_mib=320)
    assert cap.max_concurrent == 0


def test_capacity_to_dict_round_trips(monkeypatch) -> None:
    """``to_dict()`` exposes the capacity fields under matching keys."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=4,
        ram_total_mib=8000,
        ram_available_mib=6000,
    )
    cap = fleet.detect_capacity(ram_per_vm_mib=320)
    as_dict = cap.to_dict()

    assert as_dict["cores_total"] == 4
    assert as_dict["max_concurrent"] == cap.max_concurrent
    assert "rationale" in as_dict


# ---------------------------------------------------------------------------
# Sample manifest
# ---------------------------------------------------------------------------


def test_repo_manifest_loads() -> None:
    """The repository's own sample manifest parses and looks sane."""
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    assert len(manifest) >= 4

    # Each entry must carry a non-empty value for every required field.
    for entry in manifest.samples:
        assert all((entry.name, entry.family, entry.category, entry.profile))

    # Only "mimic" samples exist today; revisit once real samples are added.
    non_mimic = [entry for entry in manifest.samples if entry.kind != "mimic"]
    assert not non_mimic


def test_selection_is_deterministic() -> None:
    """Identical (host, slot, episode) inputs yield the very same sample."""
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    query = {"host_id": "lab-1", "slot": 2, "episode_index": 5}
    first_pick = manifest.select(**query)
    second_pick = manifest.select(**query)
    assert first_pick is second_pick


def test_selection_differs_across_hosts() -> None:
    """Two hosts sharing a slot and episode should mostly pick different
    samples. The property is probabilistic, so the test bounds the number
    of collisions over many slots instead of comparing single picks.
    """
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    if len(manifest) < 2:
        pytest.skip("manifest too small for diversity check")

    collisions = sum(
        1
        for slot in range(20)
        if (
            manifest.select(host_id="alice", slot=slot, episode_index=0)
            is manifest.select(host_id="bob", slot=slot, episode_index=0)
        )
    )
    # With N samples the naive collision rate is ~1/N, so 20 trials with
    # N >= 4 give roughly 5 matches or fewer. The bound of 15 (three
    # quarters of the trials) is deliberately loose so the check only
    # fires when host_id has little or no influence on the pick.
    assert collisions < 15, "host_id seed isn't producing variety"


def test_selection_walks_catalog_across_episodes() -> None:
    """Over 200 episodes one host should select every catalog entry at
    least once."""
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    picked_names = {
        manifest.select(host_id="lab-x", slot=0, episode_index=episode).name
        for episode in range(200)
    }
    assert len(picked_names) == len(manifest), f"only saw {len(picked_names)}/{len(manifest)} samples"


def test_manifest_rejects_missing_required_field(tmp_path: Path) -> None:
    """Loading a sample table without ``category`` raises a ValueError
    whose message mentions the missing field."""
    toml_text = (
        '[[sample]]\n'
        'name = "x"\n'
        'family = "y"\n'
        '# missing category\n'
        'profile = "z"\n'
    )
    manifest_file = tmp_path / "bad.toml"
    manifest_file.write_text(toml_text)

    with pytest.raises(ValueError, match="category"):
        SampleManifest.load(manifest_file)


def test_manifest_rejects_unknown_category(tmp_path: Path) -> None:
    """The category value "fish" is rejected with a ValueError that
    mentions ``category``."""
    toml_text = (
        '[[sample]]\n'
        'name = "x"\n'
        'family = "y"\n'
        'category = "fish"\n'
        'profile = "z"\n'
    )
    manifest_file = tmp_path / "bad.toml"
    manifest_file.write_text(toml_text)

    with pytest.raises(ValueError, match="category"):
        SampleManifest.load(manifest_file)


def test_manifest_rejects_duplicate_names(tmp_path: Path) -> None:
    """Two sample tables sharing one name make the load fail with a
    ValueError that mentions ``duplicate``."""
    # The same table is written twice, separated by a blank line.
    one_entry = (
        '[[sample]]\n'
        'name = "x"\nfamily = "y"\ncategory = "rat"\nprofile = "z"\n'
    )
    manifest_file = tmp_path / "dup.toml"
    manifest_file.write_text(one_entry + '\n' + one_entry)

    with pytest.raises(ValueError, match="duplicate"):
        SampleManifest.load(manifest_file)


# ---------------------------------------------------------------------------
# Fleet dispatch — Tier 3 vs Tier 2 selection + per-slot module rotation
# ---------------------------------------------------------------------------


class _RecordingPopen:
    """Replacement for subprocess.run that just records what it would
    have invoked. Returns a returncode-0 result."""
    calls: list[dict] = []

    def __init__(self, args, **kwargs) -> None:
        # Mimic CompletedProcess shape.
        type(self).calls.append({"args": args, "env": kwargs.get("env"), "cwd": kwargs.get("cwd")})
        self.returncode = 0
        self.stdout = b""
        self.stderr = b""


def _fixture_modules() -> dict:
    """Build an in-memory module catalog for the dispatch tests.

    Background: production currently ships no verified Tier-3 modules.
    The samba_usermap_script entry was removed because it never landed
    a session against the configured Metasploitable2 target (PIPELINE.md
    §4.3 admission criteria, default-to-removal). Until §5 step 3 builds
    a target VM and step 4 re-admits modules with a recorded
    `verified_against`, the production catalog is empty by design.

    The Tier-3 dispatch logic still needs coverage, so this fixture
    hand-builds one SLIRP-friendly module and one bridge-required
    module. Because nothing here reads `exploits/modules/*.toml`, the
    production catalog can change without breaking these tests.

    Returns a dict mapping each module's name to its ModuleConfig.
    """
    from exploits.modules import ModuleConfig

    slirp_module = ModuleConfig(
        name="fixture_slirp",
        module_type="exploit",
        module_path="multi/test/slirp_friendly_fixture",
        options={"RHOSTS": "{{ target_ip }}", "RPORT": 139},
        payload_path="cmd/unix/bind_perl",
        payload_options={"LPORT": 4444},
        requires_bridge=False,
        extra_target_ports=(4444,),
    )
    bridge_module = ModuleConfig(
        name="fixture_bridge",
        module_type="exploit",
        module_path="multi/test/bridge_required_fixture",
        options={"RHOSTS": "{{ target_ip }}", "RPORT": 21},
        payload_path="cmd/unix/interact",
        requires_bridge=True,
    )
    catalog = {}
    catalog["fixture_slirp"] = slirp_module
    catalog["fixture_bridge"] = bridge_module
    return catalog


def _fixture_manifest(*, max_tier3_slots: int = 0,
                      max_concurrent_ceiling: int = 0):
    """Construct a canonical Manifest in memory for the fleet tests.

    It has the same shape as the production manifest.toml but never
    reads it, so results do not depend on the on-disk manifest. Any
    per-test variation (ceilings, future schedule variants) is meant to
    come through this builder's keyword arguments, not through CLI
    overrides, which no longer exist (PIPELINE.md §4.1).
    """
    from orchestrator.manifest import (
        CollectorIntervals, FleetPolicy, Manifest, Phase,
    )

    # Positional arguments for each Phase, in schedule order.
    phase_specs = (
        ("clean", 10.0),
        ("armed", 3.0),
        ("infecting", 5.0),
        ("infected_running", 25.0),
        ("dormant", 15.0),
        ("clean", 5.0),
    )
    schedule = tuple(Phase(*spec) for spec in phase_specs)
    policy = FleetPolicy(
        max_concurrent_ceiling=max_concurrent_ceiling,
        max_tier3_slots=max_tier3_slots,
    )
    intervals = CollectorIntervals(
        proc_ms=100,
        qmp_ms=1000,
        perf_ms=100,
        guest_agent_ms=100,
        pcap_snaplen=256,
        netflow_bucket_ms=100,
    )
    collectors = ("proc", "qmp", "perf", "guest_agent", "pcap", "netflow")

    return Manifest(
        schema_version=1,
        name="test-fixture",
        ram_per_vm_mib=320,
        schedule=schedule,
        fleet=policy,
        collectors_active=collectors,
        intervals=intervals,
        catalog=(),
        targets=(),
        samples_manifest_path="samples/manifest.toml",
        repo_root=REPO_ROOT,
        manifest_path=REPO_ROOT / "manifest.toml",
    )


def _fleet_cfg_with_modules(tmp_path: Path, *, max_tier3_slots: int = 0,
                            max_concurrent_ceiling: int = 0):
    """Return a FleetConfig wired to the fixture manifest and modules.

    The experiment comes from `_fixture_manifest`, the module catalog
    from `_fixture_modules`, and the samples from the repository's
    `samples/manifest.toml`. `tmp_path` becomes the data root.
    """
    from orchestrator import fleet
    from samples.manifest import SampleManifest

    experiment = _fixture_manifest(
        max_tier3_slots=max_tier3_slots,
        max_concurrent_ceiling=max_concurrent_ceiling,
    )
    sample_catalog = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    return fleet.FleetConfig(
        host_id="test-host",
        repo_root=REPO_ROOT,
        data_root=tmp_path,
        experiment=experiment,
        samples=sample_catalog,
        modules=_fixture_modules(),
    )


def _patch_subprocess(monkeypatch):
    """Route ``fleet.subprocess.run`` to `_RecordingPopen` and start the
    recorder with an empty call list."""
    from orchestrator import fleet
    monkeypatch.setattr(fleet.subprocess, "run", _RecordingPopen)
    # Rebind to a new list so records from earlier tests are dropped.
    _RecordingPopen.calls = []


def test_fleet_dispatches_to_tier3_when_msfrpcd_listening(monkeypatch, tmp_path) -> None:
    """With msfrpcd reported available and fixture modules in the
    catalog, a slot runs the Tier-3 runner with a catalog module."""
    from orchestrator import fleet
    fleet_cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    _patch_subprocess(monkeypatch)

    outcome = fleet._run_slot(
        fleet_cfg,
        slot=0,
        sample=fleet_cfg.samples.samples[0],
        episode_index=0,
        capacity=fleet.detect_capacity(),
    )

    assert outcome.tier == "tier3", outcome
    assert outcome.module_name in fleet_cfg.modules

    argv = _RecordingPopen.calls[-1]["args"]
    # The recorded command is the Tier-3 runner script ...
    assert any("run_tier3_demo.py" in str(part) for part in argv)
    # ... and it carries the chosen module via --module.
    assert "--module" in argv
    assert outcome.module_name in argv


def test_fleet_falls_back_to_tier2_when_msfrpcd_down(monkeypatch, tmp_path) -> None:
    """When msfrpcd is reported unavailable the slot runs the Tier-2
    runner and reports no module."""
    from orchestrator import fleet
    fleet_cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: False)
    _patch_subprocess(monkeypatch)

    outcome = fleet._run_slot(
        fleet_cfg,
        slot=0,
        sample=fleet_cfg.samples.samples[0],
        episode_index=0,
        capacity=fleet.detect_capacity(),
    )

    assert outcome.tier == "tier2"
    assert outcome.module_name is None
    argv = _RecordingPopen.calls[-1]["args"]
    assert any("run_real_vm_demo.py" in str(part) for part in argv)


def test_fleet_falls_back_to_tier2_when_module_catalog_empty(monkeypatch, tmp_path) -> None:
    """A FleetConfig with no modules yields a Tier-2 result even though
    msfrpcd is reported available."""
    from orchestrator import fleet
    from samples.manifest import SampleManifest

    sample_catalog = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    no_modules: dict = {}  # deliberately empty catalog
    fleet_cfg = fleet.FleetConfig(
        host_id="test-host",
        repo_root=REPO_ROOT,
        data_root=tmp_path,
        experiment=_fixture_manifest(),
        samples=sample_catalog,
        modules=no_modules,
    )
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    _patch_subprocess(monkeypatch)

    outcome = fleet._run_slot(
        fleet_cfg,
        slot=0,
        sample=fleet_cfg.samples.samples[0],
        episode_index=0,
        capacity=fleet.detect_capacity(),
    )
    assert outcome.tier == "tier2"


def test_fleet_empty_module_catalog_falls_back_to_tier2(monkeypatch, tmp_path) -> None:
    """Shipping no modules is the supported way to turn Tier-3 off.

    This test stands in for the former force_tier2 override knob. Per
    PIPELINE.md §14 the closed override list contains only
    CIS490_ALLOW_DIRTY, and per §1 Tier-3 is disabled by admitting no
    modules, not by flipping an orchestrator flag. So an empty catalog
    must give Tier-2 even while msfrpcd is reachable."""
    from orchestrator import fleet
    from samples.manifest import SampleManifest

    config_kwargs = dict(
        host_id="test-host",
        repo_root=REPO_ROOT,
        data_root=tmp_path,
        experiment=_fixture_manifest(),
        samples=SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml"),
        modules={},  # nothing admitted, so Tier-3 has nothing to run
    )
    fleet_cfg = fleet.FleetConfig(**config_kwargs)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    _patch_subprocess(monkeypatch)
    host_capacity = fleet.detect_capacity()

    first_sample = fleet_cfg.samples.samples[0]
    outcome = fleet._run_slot(
        fleet_cfg, slot=0, sample=first_sample, episode_index=0, capacity=host_capacity,
    )
    assert outcome.tier == "tier2"


def test_fleet_skips_requires_bridge_modules_when_no_bridge(monkeypatch, tmp_path) -> None:
    """Fleet must filter out callback-payload modules when BRIDGE is
    unset — otherwise the exploit fires but the session never lands
    and the episode degenerates to a 30 s session_open_timeout.

    Runs 20 episodes on slot 0 with BRIDGE removed from the environment
    and collects every module name reported by a Tier-3 result. The
    test fails if a requires_bridge module shows up, and also if no
    Tier-3 module shows up at all, since the filter would then never
    have been exercised."""
    from orchestrator import fleet
    cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    monkeypatch.delenv("BRIDGE", raising=False)
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()

    # Check the fixture before running anything: without at least one
    # requires_bridge module there is nothing for the picker to filter.
    callback_modules = {
        m.name for m in cfg.modules.values() if m.requires_bridge
    }
    assert callback_modules, "test setup error: expected some require_bridge modules"

    sample = cfg.samples.samples[0]
    seen_modules = set()
    for ep in range(20):
        res = fleet._run_slot(cfg, slot=0, sample=sample, episode_index=ep, capacity=capacity)
        if res.tier == "tier3" and res.module_name:
            seen_modules.add(res.module_name)

    # Guard against a vacuous pass: if every episode fell back to
    # Tier-2, the intersection below would be empty for the wrong reason.
    assert seen_modules, \
        "no Tier-3 module was selected in 20 episodes; bridge filter was never exercised"
    # Every selected module must be callback-free (same-socket).
    assert not (seen_modules & callback_modules), \
        f"selected callback modules without BRIDGE: {seen_modules & callback_modules}"


def test_tier3_strips_bridge_env_even_when_set(monkeypatch, tmp_path) -> None:
    """BRIDGE must not reach the Tier-3 subprocess environment.

    Tier-3 always uses SLIRP+hostfwd, because the rest of the pipeline
    passes target_ip=127.0.0.1 regardless of bridge mode (no guest-IP
    discovery is wired). If BRIDGE leaks into launch_target.sh's env,
    the target VM enters tap mode without matching IP discovery and
    every exploit times out against 127.0.0.1, producing dishonest
    infected_running labels (PIPELINE.md §10). BRIDGE is therefore
    stripped from the Tier-3 env even when the operator set it for
    Tier-2.

    Regression for: 2026-05-03 vsftpd_234_backdoor episode on
    k-gamingcom (commit 4ab5477) — picker selected vsftpd because
    BRIDGE was set, episode timed out, schedule-clock wrote
    `infected_running` for an exploit that never landed."""
    from orchestrator import fleet
    fleet_cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    monkeypatch.setenv("BRIDGE", "br-malware")
    _patch_subprocess(monkeypatch)

    fleet._run_slot(
        fleet_cfg,
        slot=0,
        sample=fleet_cfg.samples.samples[0],
        episode_index=0,
        capacity=fleet.detect_capacity(),
    )

    launched_env = _RecordingPopen.calls[-1]["env"]
    assert "BRIDGE" not in launched_env


def test_tier3_drops_requires_bridge_modules_unconditionally(monkeypatch, tmp_path) -> None:
    """Picker MUST drop requires_bridge modules even when BRIDGE is set,
    because the rest of the pipeline can't actually use them yet (no
    guest-IP discovery for bridge mode). Until that's wired, including
    them produces session_open_timeout + dishonest labels.

    Asserts the picker only ever returns the SLIRP-friendly subset
    across many episodes regardless of BRIDGE state, and that it
    returns at least one module, so the subset check cannot pass on an
    empty set."""
    from orchestrator import fleet
    cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    monkeypatch.setenv("BRIDGE", "br-malware")
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()
    slirp_friendly = {k for k, v in cfg.modules.items() if not v.requires_bridge}
    sample = cfg.samples.samples[0]
    seen = set()
    for ep in range(40):
        res = fleet._run_slot(cfg, slot=0, sample=sample,
                              episode_index=ep, capacity=capacity)
        if res.tier == "tier3" and res.module_name:
            seen.add(res.module_name)
    # An empty set is a subset of anything, so require that Tier-3
    # actually picked something before checking what it picked.
    assert seen, (
        "no Tier-3 module was selected in 40 episodes; "
        "picker behaviour under BRIDGE was never exercised"
    )
    assert seen <= slirp_friendly, (
        f"picker returned bridge-only modules {seen - slirp_friendly}; "
        f"these can't reach the guest with target_ip=127.0.0.1"
    )


def test_fleet_assigns_unique_port_base_per_slot(monkeypatch, tmp_path) -> None:
    """Concurrent Tier-3 slots can't share the host-side hostfwd port
    or all targets stomp on each other's vsftpd:21 → 21 mapping. The
    fleet must shift PORT_BASE per slot.

    Runs slots 0, 1 and 2 for the same sample and episode, then checks
    that the PORT_BASE values in the recorded subprocess environments
    are all different."""
    from orchestrator import fleet
    cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()

    sample = cfg.samples.samples[0]
    slots = (0, 1, 2)
    for slot in slots:
        fleet._run_slot(cfg, slot=slot, sample=sample, episode_index=0, capacity=capacity)

    port_bases = [c["env"]["PORT_BASE"] for c in _RecordingPopen.calls]
    # The uniqueness check is trivially true for an empty or short list,
    # so first require at least one recorded invocation per slot.
    assert len(port_bases) >= len(slots), \
        f"expected a recorded invocation for each of {len(slots)} slots, got {len(port_bases)}"
    assert len(set(port_bases)) == len(port_bases), \
        f"PORT_BASE collision across slots: {port_bases}"


def test_manifest_marks_real_when_sha256_present(tmp_path: Path) -> None:
    """An entry with a ``sha256`` key loads as kind "real"; an entry
    without one loads as kind "mimic"."""
    toml_text = (
        '[[sample]]\n'
        'name = "real-one"\nfamily = "y"\ncategory = "rat"\nprofile = "z"\n'
        'sha256 = "abc123"\n'
        '\n[[sample]]\n'
        'name = "mimic-one"\nfamily = "y"\ncategory = "rat"\nprofile = "z"\n'
    )
    manifest_file = tmp_path / "real.toml"
    manifest_file.write_text(toml_text)

    loaded = SampleManifest.load(manifest_file)
    kind_of = {}
    for entry in loaded.samples:
        kind_of[entry.name] = entry.kind
    assert kind_of["real-one"] == "real"
    assert kind_of["mimic-one"] == "mimic"