Wraps the three remaining 🚧 items from the README so every collector the threat-model promises is actually live, and the Tier-4 path (real-malware fetch + upload + exec) works end-to-end as soon as a sha256 lands in samples/store/. Closes spectral/CIS490#4, #5, #6. == #6 — Bridge pcap wiring == EpisodeConfig grows three optional fields: bridge_iface: str | None # e.g. "br-malware" bridge_ip: str = "10.200.0.1" pcap_snaplen: int = 256 When bridge_iface is set, EpisodeRunner spawns tcpdump for the duration of the schedule (network.pcap), stops it cleanly on episode end, and runs collectors.pcap.bucketize() to produce netflow.jsonl per the 100-ms schema in docs/data-model.md. EpisodeResult + meta.result gain rows_netflow + pcap_bytes counters. vm/launch_demo.sh + launch_target.sh now switch between SLIRP usermode and tap+bridge based on $BRIDGE — operator pre-creates the tap as a bridge member, no sudo from the launcher. run_real_vm_demo.py picks BRIDGE up from env so the fleet runner can opt entire waves into pcap mode by exporting BRIDGE before invocation. == #5 — Source 3 perf collector == collectors/perf_qemu.py shells out to ``perf stat -p <pid> -I 100 -j`` and parses the per-event JSON stream. Aggregates one row per interval across the canonical event set (cycles/instructions/cache-{refs,misses}/ branches/branch-misses/page-faults/context-switches), computes IPC + cache-miss rate. Tolerates missing events (``<not counted>`` / ``<not supported>``) without dropping the row, and skips cleanly when ``perf`` isn't on PATH or the process can't be attached. EpisodeConfig.enable_perf=True opts into the collector — off by default because perf needs CAP_SYS_ADMIN or perf_event_paranoid <= 1. When enabled, runs as a parallel thread alongside the other collectors; EpisodeResult.rows_perf records the count. 
== #4 — Tier 4 (real-malware fetch + upload + exec) == tools/fetch_sample.py: pulls a sample by sha256 from MalwareBazaar (API key from env or samples/.bazaar.token), unzips with the standard "infected" password, verifies the resulting binary's sha256, lands at samples/store/<sha256>. Idempotent — already-staged correct binaries return immediately. samples/manifest.py: Sample.binary_path(store_root) resolves to the staged binary path, or None for mimics / not-yet-fetched real samples. exploits/workloads.py: real_binary_workload(bytes, sample) builds a Workload that base64-uploads the binary into the shell session via a heredoc, decodes + chmods + execs it in the background, captures the PID for clean stop on dormant. Per-profile pid/bin paths so concurrent samples in the same guest don't collide. exploits/driver.py: dispatch order is now: 1) sample.kind == "real" + binary staged at sample_store_root → real_binary_workload (Tier 4) 2) profile mimic from workloads.workload_for() (Tier 3 v2) 3) None → driver v1 fallback yes-loop DriverConfig.sample_store_root is the new field; run_tier3_demo.py wires it to repo_root/samples/store. driver_setup event records sample_sha256 so trainers can join Tier-4 episodes against the manifest by hash. samples/store/.gitkeep added (binaries themselves are gitignored). Tests: 102 pass (was 86). New suites: tests/test_perf_qemu.py — parser + builder + perf-missing fallback tests/test_tier4.py — real_binary_workload base64 round-trip, stop-cmd kills pidfile, per-profile path isolation, driver dispatch chooses real vs mimic correctly, fetcher input validation and cached-fast-path Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
142 lines
4.1 KiB
Python
142 lines
4.1 KiB
Python
"""Fetch a malware sample by sha256 from MalwareBazaar.
|
|
|
|
Lands the binary at ``samples/store/<sha256>`` (gitignored), verifies
|
|
the hash on the way in, and prints the resulting path on stdout.
|
|
|
|
Usage:
|
|
|
|
MALWAREBAZAAR_API_KEY=... uv run python tools/fetch_sample.py <sha256>
|
|
|
|
MalwareBazaar requires a free API key as of late 2023; sign up at
|
|
https://bazaar.abuse.ch and either pass via env or place in
|
|
``samples/.bazaar.token`` (mode 0600, gitignored). The downloaded
|
|
zip is encrypted with the password ``infected`` per the MB convention.
|
|
|
|
The fetcher is intentionally read-only over the network — no upload,
|
|
no metadata posted — so a lab host with a tightly-egress-firewalled
|
|
WG mesh can run it once on a build host and rsync the resulting
|
|
``samples/store/`` directory across the fleet.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import os
|
|
import sys
|
|
import urllib.parse
|
|
import urllib.request
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
|
|
MB_ENDPOINT = "https://mb-api.abuse.ch/api/v1/"
|
|
MB_ZIP_PASSWORD = b"infected"
|
|
|
|
|
|
def _read_api_key(repo_root: Path) -> str | None:
    """Return the MalwareBazaar API key, or ``None`` if none is configured.

    Lookup order:

    1. the ``MALWAREBAZAAR_API_KEY`` environment variable;
    2. the file ``samples/.bazaar.token`` under *repo_root*.

    Surrounding whitespace is stripped from either source. A source that is
    blank after stripping counts as "not configured", so an empty or
    whitespace-only environment variable does not shadow a valid token file.

    Args:
        repo_root: Directory that contains the ``samples/`` folder.

    Returns:
        The stripped key, or ``None`` when neither source yields one.
    """
    env_key = os.environ.get("MALWAREBAZAAR_API_KEY", "").strip()
    if env_key:
        return env_key

    token_path = repo_root / "samples" / ".bazaar.token"
    try:
        # EAFP: read directly instead of exists()-then-read, which can race
        # with the file being removed in between.
        file_key = token_path.read_text(encoding="utf-8").strip()
    except FileNotFoundError:
        return None
    return file_key or None
|
|
|
|
|
|
def fetch_sample(
    sha256: str,
    out_dir: Path,
    api_key: str,
    *,
    timeout_s: float = 60.0,
) -> Path:
    """Download the sample identified by *sha256* and stage it in *out_dir*.

    The call is idempotent: if ``out_dir/<sha256>`` already exists and its
    content hashes to *sha256*, that path is returned without any network
    access. An existing file with the wrong hash is deleted and refetched.

    The extracted bytes are verified in memory and only then published to
    ``out_dir/<sha256>`` via a temporary file and an atomic rename, so an
    unverified or partially written file never appears at the final path.

    Args:
        sha256: 64-character hex digest of the sample (case-insensitive).
        out_dir: Directory to stage the binary in; created if missing.
        api_key: Value sent in the ``Auth-Key`` request header.
        timeout_s: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        Path of the staged binary, ``out_dir / <lower-cased sha256>``.

    Raises:
        ValueError: *sha256* is not 64 hexadecimal characters.
        RuntimeError: the response is not a zip, the zip has no file entry,
            the archive cannot be decrypted/decompressed by the standard
            library, or the extracted content does not hash to *sha256*.

    Errors raised by ``urllib.request.urlopen`` and ``zipfile`` (for example
    ``zipfile.BadZipFile``) propagate unchanged.
    """
    import io  # local import: only this function needs an in-memory file

    if len(sha256) != 64 or not all(c in "0123456789abcdef" for c in sha256.lower()):
        raise ValueError(f"sha256 must be 64 hex chars, got {sha256!r}")
    sha256 = sha256.lower()
    out_dir.mkdir(parents=True, exist_ok=True)

    target = out_dir / sha256
    if target.exists():
        if hashlib.sha256(target.read_bytes()).hexdigest() == sha256:
            return target
        target.unlink()  # tampered or partial; refetch.

    body = urllib.parse.urlencode({
        "query": "get_file",
        "sha256_hash": sha256,
    }).encode("utf-8")
    req = urllib.request.Request(
        MB_ENDPOINT,
        data=body,
        headers={
            "Auth-Key": api_key,
            "User-Agent": "cis490-fetcher/0",
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=timeout_s) as r:
        payload = r.read()

    # Anything that is not a zip (zip files start with b"PK") is surfaced
    # verbatim so the operator can see what the service answered.
    if not payload.startswith(b"PK"):
        raise RuntimeError(
            f"MalwareBazaar returned non-zip response (first 200 bytes): "
            f"{payload[:200]!r}"
        )

    # The payload is already in memory, so open it from there instead of
    # round-tripping the archive through a file in out_dir.
    try:
        with zipfile.ZipFile(io.BytesIO(payload)) as zf:
            # Skip directory entries; the sample is the first real file.
            members = [info for info in zf.infolist() if not info.is_dir()]
            if not members:
                raise RuntimeError(f"{sha256}: empty zip")
            data = zf.read(members[0], pwd=MB_ZIP_PASSWORD)
    except NotImplementedError as err:
        # zipfile raises NotImplementedError for compression/encryption
        # methods it does not support.
        # NOTE(review): MalwareBazaar archives are reportedly AES-encrypted,
        # which the stdlib cannot decrypt - confirm against a real download.
        raise RuntimeError(
            f"{sha256}: archive uses a compression or encryption method the "
            f"standard-library zipfile module does not support: {err}"
        ) from err

    # Verify before anything is written, so a bad download never reaches disk.
    actual = hashlib.sha256(data).hexdigest()
    if actual != sha256:
        raise RuntimeError(f"sha256 mismatch: expected {sha256}, got {actual}")

    # Publish atomically: write next to the target, then rename over it.
    tmp_path = out_dir / f"{sha256}.{os.getpid()}.part"
    try:
        tmp_path.write_bytes(data)
        os.replace(tmp_path, target)
    finally:
        # No-op after a successful rename; cleans up if the write failed.
        tmp_path.unlink(missing_ok=True)
    return target
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: stage one sample and print its path.

    Args:
        argv: Arguments to parse; ``None`` lets argparse use its default.

    Returns:
        ``0`` when the sample was staged (its path is printed on stdout),
        ``1`` when ``fetch_sample`` raised (the error is printed on stderr),
        ``2`` when no API key could be found.
    """
    parser = argparse.ArgumentParser(prog="fetch_sample")
    parser.add_argument("sha256")
    parser.add_argument(
        "--out-dir",
        type=Path,
        default=None,
        help="Where to drop <sha256> (default: samples/store/ relative to repo)",
    )
    opts = parser.parse_args(argv)

    # This file sits one directory below the repository root.
    repo_root = Path(__file__).resolve().parent.parent
    destination = opts.out_dir if opts.out_dir else repo_root / "samples" / "store"

    key = _read_api_key(repo_root)
    if not key:
        missing_key_msg = (
            "no MalwareBazaar API key — set MALWAREBAZAAR_API_KEY or write "
            "samples/.bazaar.token (mode 0600). Register at "
            "https://bazaar.abuse.ch."
        )
        print(missing_key_msg, file=sys.stderr)
        return 2

    try:
        staged = fetch_sample(opts.sha256, destination, key)
    except Exception as exc:
        # Top-level boundary: report any failure and map it to exit code 1.
        print(f"fetch failed: {exc}", file=sys.stderr)
        return 1

    print(staged)
    return 0
|
|
|
|
|
|
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|