Empirical evidence from k-gamingcom (commit 4ab5477, 2026-05-03 22:20Z
vsftpd_234_backdoor episode): the picker selected vsftpd because BRIDGE
was set on that host. The exploit fires against target_ip=127.0.0.1
(SLIRP loopback) but vsftpd's hardcoded port-6200 backdoor is reachable
only at the guest's bridge IP. Result: session_open_timeout, AND a
schedule-clock-driven `infected_running` label was still written for
the failed exploit — exactly the §10 poisoned-training-example pattern.
Until guest-IP discovery for bridge mode is wired (a separate piece of
infrastructure), bridge-only modules can't actually reach their target
even when the operator sets BRIDGE for Tier-2's pcap source. Revert
the picker to its prior conservative form: drop requires_bridge modules
unconditionally regardless of BRIDGE state. Same for the BRIDGE env
strip in the Tier-3 launch path — it was correct as unconditional.
Replaces the two aspirational tests
(test_fleet_uses_all_modules_when_bridge_set,
test_fleet_propagates_bridge_env_to_runner) with their honest negatives
(test_tier3_drops_requires_bridge_modules_unconditionally,
test_tier3_strips_bridge_env_even_when_set). The previous tests asserted
behavior the rest of the pipeline can't deliver; they were false signals.
229 passed.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
410 lines
16 KiB
Python
410 lines
16 KiB
Python
"""Tests for fleet capacity calculation + sample manifest selection.

Capacity is unit-tested via deterministic monkeypatching of /proc and
os.cpu_count so the math is exercised independently of the host
running the suite. Sample selection has its own tests covering the
"different hosts pick different samples" property. The final section
covers fleet dispatch: Tier-3 vs Tier-2 selection and per-slot module
rotation.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from orchestrator import fleet
|
|
from samples.manifest import Sample, SampleManifest
|
|
|
|
|
|
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Capacity
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _patch_capacity_inputs(
    monkeypatch,
    *,
    cores: int,
    ram_total_mib: int,
    ram_available_mib: int,
    load_1m: float = 0.0,
) -> None:
    """Stub the host probes in ``fleet`` with fixed values.

    Replaces ``fleet.os.cpu_count``, ``fleet._read_meminfo`` and
    ``fleet._read_loadavg`` so a capacity test does not depend on the
    machine running the suite. RAM figures are given in MiB and scaled
    by 1024 * 1024 for the fake meminfo mapping.
    """
    mib = 1024 * 1024
    fake_meminfo = {
        "MemTotal": ram_total_mib * mib,
        "MemAvailable": ram_available_mib * mib,
    }
    monkeypatch.setattr(fleet.os, "cpu_count", lambda: cores)
    # Hand out a fresh copy per call, as a real reader would.
    monkeypatch.setattr(fleet, "_read_meminfo", lambda: dict(fake_meminfo))
    monkeypatch.setattr(fleet, "_read_loadavg", lambda: load_1m)
|
|
|
|
|
|
def test_capacity_8core_idle_box(monkeypatch) -> None:
    """Idle 8-core host with ample RAM: the core count is the limit."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=8,
        ram_total_mib=16384,
        ram_available_mib=14000,
    )
    capacity = fleet.detect_capacity(ram_per_vm_mib=320)

    assert capacity.cores_total == 8
    # One core is held back (8 // 8 = 1), leaving seven for VMs.
    assert capacity.cores_reserved == 1
    assert capacity.max_by_cores == 7
    # Neither RAM nor load is tight here, so cores are the binding limit.
    assert capacity.max_concurrent == 7
    assert "binding=cores" in capacity.rationale
|
|
|
|
|
|
def test_capacity_low_ram_caps_below_cores(monkeypatch) -> None:
    """Scarce free RAM on an 8-core host pulls the limit under the core cap."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=8,
        ram_total_mib=4096,
        ram_available_mib=2048,
    )
    capacity = fleet.detect_capacity(ram_per_vm_mib=320)

    # Expected arithmetic: headroom = max(1024, 4096 // 8) = 1024 MiB,
    # so max_by_ram = (2048 - 1024) // 320 = 3.
    assert capacity.max_by_ram == 3
    assert capacity.max_concurrent == 3
|
|
|
|
|
|
def test_capacity_high_load_halves_concurrency(monkeypatch) -> None:
    """A busy host (load/cores above 0.75) gets half the core-based limit."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=8,
        ram_total_mib=16384,
        ram_available_mib=14000,
        load_1m=7.0,  # 7 / 8 = 0.875, above the 0.75 threshold
    )
    capacity = fleet.detect_capacity(ram_per_vm_mib=320)

    # Expected arithmetic: max_by_cores = 7, so
    # max_by_load = max(1, 7 // 2) = 3.
    assert capacity.max_by_load == 3
    assert capacity.max_concurrent == 3
|
|
|
|
|
|
def test_capacity_pi5_class(monkeypatch) -> None:
    """A 4-core, ~8 GiB board keeps one core back and runs three VMs."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=4,
        ram_total_mib=7951,
        ram_available_mib=5223,
    )
    capacity = fleet.detect_capacity(ram_per_vm_mib=320)

    assert capacity.cores_total == 4
    assert capacity.max_concurrent == 3
|
|
|
|
|
|
def test_capacity_minimal_box(monkeypatch) -> None:
    """A single-core host with 1 GiB of RAM must be given zero VM slots."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=1,
        ram_total_mib=1024,
        ram_available_mib=512,
    )
    capacity = fleet.detect_capacity(ram_per_vm_mib=320)

    assert capacity.max_concurrent == 0
|
|
|
|
|
|
def test_capacity_to_dict_round_trips(monkeypatch) -> None:
    """``to_dict()`` exposes the same values as the capacity object."""
    _patch_capacity_inputs(
        monkeypatch,
        cores=4,
        ram_total_mib=8000,
        ram_available_mib=6000,
    )
    capacity = fleet.detect_capacity(ram_per_vm_mib=320)
    as_dict = capacity.to_dict()

    assert as_dict["cores_total"] == 4
    assert as_dict["max_concurrent"] == capacity.max_concurrent
    assert "rationale" in as_dict
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Sample manifest
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_repo_manifest_loads() -> None:
    """The checked-in manifest parses and every entry is well-formed."""
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    assert len(manifest) >= 4

    # Each entry must carry a truthy value for every required field.
    for entry in manifest.samples:
        assert entry.name and entry.family and entry.category and entry.profile

    # The catalog is mimic-only for now; relax this once real samples land.
    assert not any(entry.kind != "mimic" for entry in manifest.samples)
|
|
|
|
|
|
def test_selection_is_deterministic() -> None:
    """Identical (host, slot, episode) inputs yield the very same sample object."""
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    key = {"host_id": "lab-1", "slot": 2, "episode_index": 5}

    first = manifest.select(**key)
    second = manifest.select(**key)

    assert first is second
|
|
|
|
|
|
def test_selection_differs_across_hosts() -> None:
    """Two hosts sharing a slot/episode should mostly get different samples.

    The property is probabilistic, so the check is on how often the two
    hosts collide over a run of slots rather than on any single pick.
    """
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    if len(manifest) < 2:
        pytest.skip("manifest too small for diversity check")

    collisions = sum(
        1
        for slot in range(20)
        if manifest.select(host_id="alice", slot=slot, episode_index=0)
        is manifest.select(host_id="bob", slot=slot, episode_index=0)
    )

    # With N samples the naive collision rate is about 1/N, i.e. roughly
    # 5 of 20 trials at N = 4. The bound is deliberately loose: anything
    # under 15 collisions (three quarters of the trials) passes.
    assert collisions < 15, "host_id seed isn't producing variety"
|
|
|
|
|
|
def test_selection_walks_catalog_across_episodes() -> None:
    """Over 200 episodes one host must pick every sample at least once."""
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")

    picked_names = {
        manifest.select(host_id="lab-x", slot=0, episode_index=episode).name
        for episode in range(200)
    }

    assert len(picked_names) == len(manifest), (
        f"only saw {len(picked_names)}/{len(manifest)} samples"
    )
|
|
|
|
|
|
def test_manifest_rejects_missing_required_field(tmp_path: Path) -> None:
    """Loading an entry without ``category`` raises ValueError naming the field."""
    toml_text = (
        '[[sample]]\n'
        'name = "x"\n'
        'family = "y"\n'
        '# missing category\n'
        'profile = "z"\n'
    )
    manifest_path = tmp_path / "bad.toml"
    manifest_path.write_text(toml_text)

    with pytest.raises(ValueError, match="category"):
        SampleManifest.load(manifest_path)
|
|
|
|
|
|
def test_manifest_rejects_unknown_category(tmp_path: Path) -> None:
    """A ``category`` value of "fish" is rejected with a ValueError mentioning it."""
    toml_text = (
        '[[sample]]\n'
        'name = "x"\n'
        'family = "y"\n'
        'category = "fish"\n'
        'profile = "z"\n'
    )
    manifest_path = tmp_path / "bad.toml"
    manifest_path.write_text(toml_text)

    with pytest.raises(ValueError, match="category"):
        SampleManifest.load(manifest_path)
|
|
|
|
|
|
def test_manifest_rejects_duplicate_names(tmp_path: Path) -> None:
    """Two entries sharing a ``name`` make the load fail with "duplicate"."""
    toml_text = (
        '[[sample]]\n'
        'name = "x"\nfamily = "y"\ncategory = "rat"\nprofile = "z"\n'
        '\n[[sample]]\n'
        'name = "x"\nfamily = "y"\ncategory = "rat"\nprofile = "z"\n'
    )
    manifest_path = tmp_path / "dup.toml"
    manifest_path.write_text(toml_text)

    with pytest.raises(ValueError, match="duplicate"):
        SampleManifest.load(manifest_path)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fleet dispatch — Tier 3 vs Tier 2 selection + per-slot module rotation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class _RecordingPopen:
|
|
"""Replacement for subprocess.run that just records what it would
|
|
have invoked. Returns a returncode-0 result."""
|
|
calls: list[dict] = []
|
|
|
|
def __init__(self, args, **kwargs) -> None:
|
|
# Mimic CompletedProcess shape.
|
|
type(self).calls.append({"args": args, "env": kwargs.get("env"), "cwd": kwargs.get("cwd")})
|
|
self.returncode = 0
|
|
self.stdout = b""
|
|
self.stderr = b""
|
|
|
|
|
|
def _fleet_cfg_with_modules(tmp_path: Path, *, force_tier2: bool = False):
    """Build a ``fleet.FleetConfig`` backed by the repo's real catalogs.

    The sample manifest and exploit-module configs are loaded from the
    checked-in files under ``REPO_ROOT``; ``tmp_path`` is used as the
    data root and ``force_tier2`` is passed straight through.
    """
    # Imported here rather than at module level, as in the original.
    from exploits.modules import load_module_configs

    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    modules = load_module_configs(REPO_ROOT / "exploits" / "modules")
    return fleet.FleetConfig(
        host_id="test-host",
        repo_root=REPO_ROOT,
        data_root=tmp_path,
        manifest=manifest,
        modules=modules,
        force_tier2=force_tier2,
    )
|
|
|
|
|
|
def _patch_subprocess(monkeypatch):
    """Route ``fleet.subprocess.run`` to ``_RecordingPopen``.

    The recorder's call log is rebound to a fresh list first, so each
    test only sees the invocations it triggered itself.
    """
    _RecordingPopen.calls = []
    monkeypatch.setattr(fleet.subprocess, "run", _RecordingPopen)
|
|
|
|
|
|
def test_fleet_dispatches_to_tier3_when_msfrpcd_listening(monkeypatch, tmp_path) -> None:
    """With msfrpcd reported available, a slot launches the Tier-3 runner."""
    config = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()

    result = fleet._run_slot(
        config,
        slot=0,
        sample=config.manifest.samples[0],
        episode_index=0,
        capacity=capacity,
    )

    assert result.tier == "tier3", result
    assert result.module_name in config.modules

    launched = _RecordingPopen.calls[-1]["args"]
    # The command line must point at the Tier-3 runner script...
    assert any("run_tier3_demo.py" in str(arg) for arg in launched)
    # ...and carry the chosen module through as --module <name>.
    assert "--module" in launched
    assert result.module_name in launched
|
|
|
|
|
|
def test_fleet_falls_back_to_tier2_when_msfrpcd_down(monkeypatch, tmp_path) -> None:
    """With msfrpcd reported unavailable, the slot runs the Tier-2 demo instead."""
    config = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: False)
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()

    result = fleet._run_slot(
        config,
        slot=0,
        sample=config.manifest.samples[0],
        episode_index=0,
        capacity=capacity,
    )

    assert result.tier == "tier2"
    assert result.module_name is None

    launched = _RecordingPopen.calls[-1]["args"]
    assert any("run_real_vm_demo.py" in str(arg) for arg in launched)
|
|
|
|
|
|
def test_fleet_falls_back_to_tier2_when_module_catalog_empty(monkeypatch, tmp_path) -> None:
    """An empty module catalog forces Tier-2 even though msfrpcd is available."""
    manifest = SampleManifest.load(REPO_ROOT / "samples" / "manifest.toml")
    config = fleet.FleetConfig(
        host_id="test-host",
        repo_root=REPO_ROOT,
        data_root=tmp_path,
        manifest=manifest,
        modules={},  # deliberately no exploit modules
    )
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()

    result = fleet._run_slot(
        config,
        slot=0,
        sample=config.manifest.samples[0],
        episode_index=0,
        capacity=capacity,
    )

    assert result.tier == "tier2"
|
|
|
|
|
|
def test_fleet_force_tier2_overrides_msfrpcd(monkeypatch, tmp_path) -> None:
    """``force_tier2=True`` wins over an available msfrpcd."""
    config = _fleet_cfg_with_modules(tmp_path, force_tier2=True)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()

    result = fleet._run_slot(
        config,
        slot=0,
        sample=config.manifest.samples[0],
        episode_index=0,
        capacity=capacity,
    )

    assert result.tier == "tier2"
|
|
|
|
|
|
def test_fleet_skips_requires_bridge_modules_when_no_bridge(monkeypatch, tmp_path) -> None:
    """Fleet must filter out callback-payload modules when BRIDGE is
    unset — otherwise the exploit fires but the session never lands
    and the episode degenerates to a 30 s session_open_timeout.

    Runs 20 episodes on slot 0 with BRIDGE removed from the environment
    and checks that no Tier-3 pick is a ``requires_bridge`` module.
    """
    from orchestrator import fleet

    cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    monkeypatch.delenv("BRIDGE", raising=False)
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()
    sample = cfg.manifest.samples[0]

    # Collect both the catalog key and the module's own ``name`` for each
    # bridge-only module: other tests compare ``res.module_name`` against
    # the catalog keys, so matching on ``name`` alone could miss a hit if
    # the two ever differ.
    callback_modules = set()
    for key, module in cfg.modules.items():
        if module.requires_bridge:
            callback_modules.update((key, module.name))
    assert callback_modules, "test setup error: expected some require_bridge modules"

    seen_modules = set()
    for ep in range(20):
        res = fleet._run_slot(cfg, slot=0, sample=sample, episode_index=ep, capacity=capacity)
        if res.tier == "tier3" and res.module_name:
            seen_modules.add(res.module_name)

    # Without this guard the check below passes vacuously whenever every
    # episode falls back to Tier-2 and the picker is never exercised.
    assert seen_modules, "no Tier-3 episode ran; the module filter was never exercised"

    # Every selected module must be callback-free (same-socket).
    leaked = seen_modules & callback_modules
    assert not leaked, f"selected callback modules without BRIDGE: {leaked}"
|
|
|
|
|
|
def test_tier3_strips_bridge_env_even_when_set(monkeypatch, tmp_path) -> None:
    """Tier-3 always uses SLIRP+hostfwd because the rest of the pipeline
    passes target_ip=127.0.0.1 regardless of bridge mode (no guest-IP
    discovery wired). If BRIDGE leaks into launch_target.sh's env, the
    target VM goes into tap mode without the matching IP discovery and
    every exploit times out against 127.0.0.1 — producing dishonest
    infected_running labels (PIPELINE.md §10). Strip BRIDGE from the
    Tier-3 subprocess env even when the operator set it for Tier-2.

    Regression for: 2026-05-03 vsftpd_234_backdoor episode on
    k-gamingcom (commit 4ab5477) — picker selected vsftpd because
    BRIDGE was set, episode timed out, schedule-clock wrote
    `infected_running` for an exploit that never landed."""
    from orchestrator import fleet

    cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    monkeypatch.setenv("BRIDGE", "br-malware")
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()
    sample = cfg.manifest.samples[0]

    res = fleet._run_slot(cfg, slot=0, sample=sample, episode_index=0, capacity=capacity)

    # The env check only means something if the Tier-3 path actually ran.
    assert res.tier == "tier3", res

    env = _RecordingPopen.calls[-1]["env"]
    # env=None makes subprocess.run inherit the parent environment, which
    # has BRIDGE set here, so a missing env is itself a leak. Failing
    # explicitly also avoids a TypeError from `"BRIDGE" not in None`.
    assert env is not None, "Tier-3 runner launched without an explicit env; BRIDGE would be inherited"
    assert "BRIDGE" not in env
|
|
|
|
|
|
def test_tier3_drops_requires_bridge_modules_unconditionally(monkeypatch, tmp_path) -> None:
    """Picker MUST drop requires_bridge modules even when BRIDGE is set,
    because the rest of the pipeline can't actually use them yet (no
    guest-IP discovery for bridge mode). Until that's wired, including
    them produces session_open_timeout + dishonest labels.

    Asserts the picker only ever returns the SLIRP-friendly subset
    across many episodes regardless of BRIDGE state."""
    from orchestrator import fleet

    cfg = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    monkeypatch.setenv("BRIDGE", "br-malware")
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()
    slirp_friendly = {k for k, v in cfg.modules.items() if not v.requires_bridge}
    sample = cfg.manifest.samples[0]

    seen = set()
    for ep in range(40):
        res = fleet._run_slot(cfg, slot=0, sample=sample,
                              episode_index=ep, capacity=capacity)
        if res.tier == "tier3" and res.module_name:
            seen.add(res.module_name)

    # An empty set is a subset of anything, so without this guard the
    # test would pass even if Tier-3 never ran at all.
    assert seen, "no Tier-3 episode ran; the picker was never exercised"
    assert seen <= slirp_friendly, (
        f"picker returned bridge-only modules {seen - slirp_friendly}; "
        "these can't reach the guest with target_ip=127.0.0.1"
    )
|
|
|
|
|
|
def test_fleet_assigns_unique_port_base_per_slot(monkeypatch, tmp_path) -> None:
    """Each slot must launch with its own PORT_BASE.

    Concurrent Tier-3 slots sharing a host-side hostfwd port would
    stomp on each other's vsftpd:21 → 21 mapping, so the value recorded
    in every launch env has to be distinct.
    """
    config = _fleet_cfg_with_modules(tmp_path)
    monkeypatch.setattr(fleet, "_msfrpcd_available", lambda *a, **kw: True)
    _patch_subprocess(monkeypatch)
    capacity = fleet.detect_capacity()
    sample = config.manifest.samples[0]

    for slot in (0, 1, 2):
        fleet._run_slot(config, slot=slot, sample=sample, episode_index=0, capacity=capacity)

    port_bases = [call["env"]["PORT_BASE"] for call in _RecordingPopen.calls]
    distinct = set(port_bases)
    assert len(distinct) == len(port_bases), (
        f"PORT_BASE collision across slots: {port_bases}"
    )
|
|
|
|
|
|
def test_manifest_marks_real_when_sha256_present(tmp_path: Path) -> None:
    """An entry with ``sha256`` loads as kind "real"; one without as "mimic"."""
    toml_text = (
        '[[sample]]\n'
        'name = "real-one"\nfamily = "y"\ncategory = "rat"\nprofile = "z"\n'
        'sha256 = "abc123"\n'
        '\n[[sample]]\n'
        'name = "mimic-one"\nfamily = "y"\ncategory = "rat"\nprofile = "z"\n'
    )
    manifest_path = tmp_path / "real.toml"
    manifest_path.write_text(toml_text)

    manifest = SampleManifest.load(manifest_path)
    kind_of = {entry.name: entry.kind for entry in manifest.samples}

    assert kind_of["real-one"] == "real"
    assert kind_of["mimic-one"] == "mimic"
|