CIS490/tools/run_fleet.py
Elliott Kolden 667f042707 Tier-3 bring-up: 9 bugs fixed on elliott-ThinkPad (2026-05-01)
Root causes and fixes documented in TIER3-BRINGUP.md. Summary:

1. BRIDGE env var leaked into Tier-3 subprocess → target VM used tap
   instead of SLIRP; fix: env.pop("BRIDGE") in fleet _run_slot.

2. usable_modules filter conditioned on BRIDGE presence → bridge-requiring
   modules selected on SLIRP runs; fix: always filter requires_bridge.

3. cmd/unix/interact creates no session.list entry → session_open_timeout
   every episode; fix: switch samba_usermap_script to cmd/unix/bind_perl.

4. Per-slot LPORT hostfwd used wrong guest port (host:5444→guest:4444);
   fix: extra_host_port:extra_host_port mapping so guest binds the
   per-slot LPORT directly.

5. vsftpd backdoor port 6200 hardcoded → collision across concurrent slots;
   fix: requires_bridge=true filters it from SLIRP fleet runs.

6. SLIRP false-positive in _wait_for_tcp → exploit fires before Samba
   boots (~60 s too early); fix: replace TCP probe with serial console
   _wait_for_serial_login that waits for actual "login:" prompt.

7. Stale QEMU survives orchestrator restart (start_new_session=True) →
   holds hostfwd ports, new QEMU silently fails; fix: kill by pgid from
   old pidfile before rmtree.

8. PORT_BASE default used privileged port 21; fix: default to 2021+slot*100.

9. msfrpcd 6.x returns bytes for all string values even with raw=False;
   fix: MSFRpcClient._str() recursive decoder applied to all responses.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-02 12:26:19 -06:00

112 lines
3.8 KiB
Python

"""``cis490-fleet`` — run as many concurrent labeled episodes as the
host can handle, drawing samples from the manifest.

Modes:
    --capacity          Print the resource calculation and exit. No VMs spawned.
    --waves N           Run N waves of episodes (one wave = max_concurrent
                        episodes, each in its own slot). Default: 1.
    --max-concurrent N  Cap concurrency below the auto-detected ceiling.
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import signal
import sys
from pathlib import Path
# Allow running as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from exploits.modules import load_module_configs # noqa: E402
from orchestrator.fleet import ( # noqa: E402
FleetConfig, FleetRunner, capacity_report, detect_capacity,
)
from samples.manifest import SampleManifest # noqa: E402
def _int_at_least(minimum: int):
    """Build an argparse ``type=`` callable that accepts integers >= *minimum*."""

    def parse(text: str) -> int:
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid int value: {text!r}") from None
        if value < minimum:
            raise argparse.ArgumentTypeError(
                f"must be >= {minimum}, got {value}")
        return value

    return parse


def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``cis490-fleet`` command line.

    Parses *argv* (``sys.argv[1:]`` when ``None``), configures logging, and
    then either prints the capacity report (``--capacity``) or builds a
    ``FleetConfig``, runs a ``FleetRunner`` and prints a JSON summary of
    every slot to stdout.

    Returns:
        ``0`` when ``--capacity`` was requested or when every slot in the
        result finished with ``rc == 0`` (an empty slot list also yields
        ``0``); ``1`` otherwise.

    Raises:
        SystemExit: raised by argparse on ``--help`` or on invalid arguments,
            including out-of-range numeric options.
    """
    # The repository root is two levels above this script; it anchors the
    # default manifest / modules paths and is handed to the fleet config.
    repo_root = Path(__file__).resolve().parent.parent

    p = argparse.ArgumentParser(prog="cis490-fleet")
    p.add_argument("--capacity", action="store_true")
    # Counts and sizes are validated at the CLI boundary so a nonsensical
    # value is reported as a usage error instead of reaching the runner.
    p.add_argument("--waves", type=_int_at_least(1), default=1)
    p.add_argument("--max-concurrent", type=_int_at_least(1), default=None)
    p.add_argument("--manifest",
                   default=str(repo_root / "samples" / "manifest.toml"))
    p.add_argument("--modules-dir",
                   default=str(repo_root / "exploits" / "modules"))
    p.add_argument("--data-root", default="data")
    p.add_argument("--host-id",
                   default=os.environ.get("FLEET_HOST_ID") or os.uname().nodename)
    p.add_argument("--ram-per-vm-mib", type=_int_at_least(1), default=320)
    p.add_argument("--require-real-samples", action="store_true")
    p.add_argument("--force-tier2", action="store_true",
                   help="Skip Tier 3 even when msfrpcd is reachable")
    # Zero is allowed here: per the help text, slots >= N fall back to Tier-2.
    p.add_argument("--max-tier3-slots", type=_int_at_least(0), default=None,
                   help="Cap concurrent Tier-3 slots; slots >= N fall back to Tier-2")
    p.add_argument("--log-level", default="INFO")
    args = p.parse_args(argv)

    # Unknown level names fall back to INFO.  The isinstance guard extends the
    # same fallback to names that resolve to a non-level attribute of the
    # logging module, which would otherwise make basicConfig raise.
    level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(level, int):
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    log = logging.getLogger(__name__)

    if args.capacity:
        print(capacity_report())
        return 0

    manifest = SampleManifest.load(args.manifest)
    modules_dir = Path(args.modules_dir)
    if modules_dir.exists():
        modules = load_module_configs(modules_dir)
    else:
        # Still best-effort (run with no modules), but no longer silent.
        log.warning("modules dir %s not found; continuing with no modules loaded",
                    modules_dir)
        modules = {}

    cfg = FleetConfig(
        host_id=args.host_id,
        repo_root=repo_root,
        data_root=Path(args.data_root).resolve(),
        manifest=manifest,
        modules=modules,
        ram_per_vm_mib=args.ram_per_vm_mib,
        max_concurrent_override=args.max_concurrent,
        require_real_samples=args.require_real_samples,
        force_tier2=args.force_tier2,
        max_tier3_slots=args.max_tier3_slots,
    )
    runner = FleetRunner(cfg)

    def _stop(signum, frame):  # noqa: ARG001
        # Forward SIGTERM / SIGINT to the runner's own stop() method.
        runner.stop()

    signal.signal(signal.SIGTERM, _stop)
    signal.signal(signal.SIGINT, _stop)

    # NOTE(review): --waves is forwarded as ``episodes``; confirm that
    # FleetRunner.run interprets this value as a wave count.
    result = runner.run(episodes=args.waves)

    print(json.dumps({
        "host_id": args.host_id,
        "capacity": result.capacity.to_dict(),
        "modules_loaded": sorted(modules.keys()),
        "slots": [
            {
                "slot": s.slot,
                "sample": s.sample_name,
                "sample_kind": s.sample_kind,
                "tier": s.tier,
                "module": s.module_name,
                "rc": s.rc,
                "duration_s": s.duration_s,
                "error": s.error,
            } for s in result.slots
        ],
        "total_duration_s": result.total_duration_s,
    }, indent=2))

    # Exit status reflects whether every slot reported rc == 0.
    return 0 if all(s.rc == 0 for s in result.slots) else 1
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())