User caught it: I shipped the theZoo path without running it
end-to-end. A real fetch on the Pi exposed two bugs:
1. The family-name matcher required the whole family name to appear
as a substring of the directory name. "Cryptolocker-class" wouldn't
match the dir "CryptoLocker_22Jan2014" because "-class" isn't in the
dir name. The name is now expanded into a sequence of tokens (full,
head-of-dash, head-of-dot, head-of-underscore) and each is tried in
turn. First match wins.
2. The extraction picker chose the "largest non-text" file — a bad
heuristic for theZoo, where each Linux.* zip often contains MULTIPLE
binaries for different platforms (Linux i386, x86-64, ARM, FreeBSD,
sometimes even Windows PE). The largest is rarely the i386 Linux ELF
that would actually run on Metasploitable2. The picker now sniffs ELF
magic bytes using only the stdlib and ranks candidates in tiers:
  a. Linux i386 ELF (largest first)
  b. any other ELF (best-effort, may not execute)
  c. largest non-text (Wine fallback)
Verified end-to-end on the Pi against a real theZoo clone (~500 MB,
263 family dirs, 2026-05-01 fresh pull):
linux-encoder-ransomware → ELF 32-bit Intel i386 SYSV (278 KB)
linux-wirenet-rat → ELF 32-bit Intel i386 SYSV (64 KB)
linux-rex-ransomware → ELF 32-bit Intel i386 SYSV Go (7.6 MB)
linux-neurevt-bot → ELF 32-bit Intel i386 SYSV (3.0 MB)
linux-earthkrahang-apt → ELF 32-bit Intel i386 GNU/Linux (5.8 MB)
5/5 picks are runnable Linux i386 ELFs. The in-place manifest rewrite
adds source/sha256/url; meta.sample.kind goes to "real" automatically.
Manifest rewritten:
- Old families (XMRig, Mirai, Cryptolocker-class, Dridex, Kovter,
Reverse-Shell) → mostly absent from theZoo's Linux catalog or
matched the wrong arch.
- New families chosen against a verified theZoo presence list:
Linux.Encoder, Linux.Wirenet, Ransomware.Rex, Neurevt,
EarthKrahang.
- XMRig + Kovter remain as mimic-only fallbacks (theZoo lacks a
runnable Linux i386 binary for these; orchestrator falls back
to the mimic profile).
Tests added (tests/test_auto_fetch_samples.py): 13 cases covering
ELF magic detection (i386 accepted, FreeBSD/x86-64/ARM/PE32/text
all rejected), family-token expansion (the "-class" suffix bug),
extraction picker (prefers Linux i386 over larger non-Linux ELFs),
manifest in-place rewrite preserves mode + skips entries that
already have sha256.
What's still NOT verified end-to-end (requires a lab host with
KVM x86):
- Metasploitable2 boot under QEMU
- vsftpd_234_backdoor exploit fire via msfrpcd
- chunked binary upload through a real shell session
- real binary executing inside a Metasploitable2 guest
The Pi is ARM64 — can't run Metasploitable2. install-tier-3-4.sh's
verify step (run_tier3_demo.py) covers all four on a real lab host;
deploy verifies on first run there.
171/171 tests pass.
200 lines
7 KiB
Python
200 lines
7 KiB
Python
"""Tests for tools/auto_fetch_samples.py.

Exercises the parts that can be tested without a real theZoo clone:

- ELF magic-byte sniffing for Linux i386 detection
- family-name → directory matching (substring + token fallback)
- extraction picker (prefers a Linux i386 ELF, falls back to other ELFs)
- manifest in-place rewrite (atomic, stat-preserving)
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
# Load tools/auto_fetch_samples.py directly from its file path and expose it
# to the tests as ``afs``. The module is registered in ``sys.modules`` under
# the name "auto_fetch_samples" before its body is executed.
REPO_ROOT = Path(__file__).resolve().parents[1]
spec = importlib.util.spec_from_file_location(
    "auto_fetch_samples",
    REPO_ROOT.joinpath("tools", "auto_fetch_samples.py"),
)
afs = importlib.util.module_from_spec(spec)
sys.modules[spec.name] = afs
spec.loader.exec_module(afs)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ELF magic detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _write(p: Path, data: bytes) -> Path:
    """Write *data* to *p*, creating missing parent directories first.

    Returns *p* so the call can be used inline when building fixtures.
    """
    parent_dir = p.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    p.write_bytes(data)
    return p
|
|
|
|
|
|
def _elf_header(*, ei_class: int = 1, ei_data: int = 1, ei_osabi: int = 0,
                e_machine: int = 0x03) -> bytes:
    """Synthesise a minimal 20-byte ELF header. Default = Linux i386.

    Only the magic, the first ``e_ident`` bytes and ``e_machine`` are
    populated; every other byte is zero.

    Args:
        ei_class: ``e_ident[EI_CLASS]`` — 1=32-bit, 2=64-bit.
        ei_data: ``e_ident[EI_DATA]`` — 1=little-endian, 2=big-endian.
        ei_osabi: ``e_ident[EI_OSABI]`` — 0=SYSV, 3=Linux, 9=FreeBSD.
        e_machine: target architecture code stored at offset 18.

    Returns:
        The 20 header bytes.
    """
    header = bytearray(20)
    header[:4] = b"\x7fELF"
    header[4] = ei_class   # 1=32, 2=64
    header[5] = ei_data    # 1=little, 2=big
    header[6] = 1          # ei_version
    header[7] = ei_osabi   # 0=SYSV, 3=Linux, 9=FreeBSD
    # Multi-byte ELF fields use the byte order declared in EI_DATA, so a
    # big-endian header must carry a big-endian e_machine.
    byte_order = "big" if ei_data == 2 else "little"
    header[18:20] = e_machine.to_bytes(2, byte_order)
    return bytes(header)
|
|
|
|
|
|
def test_is_linux_i386_elf_accepts_sysv(tmp_path: Path) -> None:
    """The default synthetic header (SYSV OS/ABI, i386) is accepted."""
    sample = tmp_path / "x"
    _write(sample, _elf_header())
    assert afs._is_linux_i386_elf(sample)
|
|
|
|
|
|
def test_is_linux_i386_elf_accepts_linux_osabi(tmp_path: Path) -> None:
    """An i386 header whose OS/ABI byte is 3 (Linux) is accepted."""
    linux_header = _elf_header(ei_osabi=3)
    sample = tmp_path / "x"
    _write(sample, linux_header)
    assert afs._is_linux_i386_elf(sample)
|
|
|
|
|
|
def test_is_linux_i386_elf_rejects_freebsd(tmp_path: Path) -> None:
    """A FreeBSD/i386 header (OS/ABI byte 9) is rejected.

    Snoopy.A in theZoo is FreeBSD/i386 — it looks similar to a Linux
    binary but won't run on Metasploitable2.
    """
    freebsd_header = _elf_header(ei_osabi=9)
    sample = tmp_path / "x"
    _write(sample, freebsd_header)
    assert not afs._is_linux_i386_elf(sample)
|
|
|
|
|
|
def test_is_linux_i386_elf_rejects_x86_64(tmp_path: Path) -> None:
    """A 64-bit header with e_machine 0x3E is rejected."""
    x86_64_header = _elf_header(ei_class=2, e_machine=0x3E)
    sample = tmp_path / "x"
    _write(sample, x86_64_header)
    assert not afs._is_linux_i386_elf(sample)
|
|
|
|
|
|
def test_is_linux_i386_elf_rejects_arm(tmp_path: Path) -> None:
    """A header with e_machine 0x28 is rejected.

    Mirai.B in theZoo is ARM — it won't run on x86 Metasploitable2.
    """
    arm_header = _elf_header(e_machine=0x28)
    sample = tmp_path / "x"
    _write(sample, arm_header)
    assert not afs._is_linux_i386_elf(sample)
|
|
|
|
|
|
def test_is_linux_i386_elf_rejects_pe32(tmp_path: Path) -> None:
    """A file starting with the Windows "MZ" signature is rejected.

    Windows PE32 starts with MZ rather than the ELF magic.
    """
    pe_stub = b"MZ".ljust(20, b"\x00")  # "MZ" followed by 18 NUL bytes
    sample = tmp_path / "x"
    _write(sample, pe_stub)
    assert not afs._is_linux_i386_elf(sample)
|
|
|
|
|
|
def test_is_linux_i386_elf_rejects_text(tmp_path: Path) -> None:
    """A short plain-text file is rejected."""
    sample = tmp_path / "x"
    _write(sample, b"hello\n")
    assert not afs._is_linux_i386_elf(sample)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Family-token expansion (the bug that broke v1: "Cryptolocker-class"
|
|
# wouldn't match "CryptoLocker_22Jan2014" because the suffix "-class"
|
|
# isn't in the dir name)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_family_tokens_strips_suffix() -> None:
    """"Cryptolocker-class" yields the full lowercased name, then its
    head-of-dash token."""
    expected = ["cryptolocker-class", "cryptolocker"]
    assert afs._family_tokens("Cryptolocker-class") == expected
|
|
|
|
|
|
def test_family_tokens_dot_namespaces_kept() -> None:
    """Linux.Mirai keeps `linux.mirai` as its first token so it lands on
    the right dir rather than matching every Linux.* entry by the head
    token."""
    tokens = afs._family_tokens("Linux.Mirai")
    first = tokens[0]
    assert first == "linux.mirai"
    # The head token "linux" must still be present as a fallback.
    assert "linux" in tokens
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Extraction picker prefers Linux i386 ELF
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_extract_largest_binary_prefers_linux_i386(tmp_path: Path) -> None:
    """The picker returns the Linux i386 member even when it is the smallest.

    Mimics theZoo's Linux.Encoder.1 layout: several binaries in the same
    zip, only one of which is Linux i386.
    """
    import zipfile

    # Insertion order is the order the members are written to the archive.
    members = {
        "big-x86-64": _elf_header(ei_class=2, e_machine=0x3E) + b"\x00" * 5000,
        "small-i386": _elf_header() + b"\x00" * 100,
        "freebsd-i386": _elf_header(ei_osabi=9) + b"\x00" * 8000,
    }
    archive = tmp_path / "test.zip"
    with zipfile.ZipFile(archive, "w") as zf:
        for member_name, payload in members.items():
            zf.writestr(member_name, payload)

    chosen = afs._extract_largest_binary(archive, tmp_path / "extract")
    assert chosen is not None
    assert chosen.name == "small-i386", (
        "picker should prefer Linux i386 over larger non-Linux ELFs, "
        f"got {chosen.name}"
    )
|
|
|
|
|
|
def test_extract_largest_binary_falls_back_to_other_elf(tmp_path: Path) -> None:
    """With no Linux i386 member, the picker still returns the ELF.

    Mimics theZoo's Linux.Mirai.B (ARM ELF only): something should be
    returned even though it won't run on Metasploitable2.
    """
    import zipfile

    archive = tmp_path / "test.zip"
    with zipfile.ZipFile(archive, "w") as zf:
        zf.writestr("arm-binary", _elf_header(e_machine=0x28) + b"\x00" * 200)
        zf.writestr("readme.txt", b"placeholder text\n")

    picked = afs._extract_largest_binary(archive, tmp_path / "extract")
    assert picked is not None
    assert picked.name == "arm-binary"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Manifest rewrite preserves stat
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_update_manifest_entry_preserves_mode(tmp_path: Path) -> None:
    """Rewriting a manifest entry keeps the file's permission bits.

    Also checks that the new ``sha256`` and ``source`` values appear in
    the rewritten text.
    """
    import stat as _st

    m = tmp_path / "manifest.toml"
    m.write_text(
        '[[sample]]\n'
        'name = "x"\n'
        'family = "F"\n'
        'category = "rat"\n'
        'profile = "shell-resident"\n'
        'description = "d"\n'
    )
    # Use a mode that a freshly created file would NOT get by default
    # (0o644 under umask 022). With 0o644 a rewrite that recreated the
    # file with default permissions would still pass this test.
    m.chmod(0o640)
    before = _st.S_IMODE(m.stat().st_mode)

    afs.update_manifest_entry(m, "x", source="theZoo",
                              sha256="a" * 64,
                              url="https://example.invalid/")

    after = _st.S_IMODE(m.stat().st_mode)
    assert before == after

    text = m.read_text()
    assert 'sha256 = "' + ("a" * 64) + '"' in text
    assert 'source = "theZoo"' in text
|
|
|
|
|
|
def test_update_manifest_entry_skips_when_sha256_already_set(tmp_path: Path) -> None:
    """Re-running auto_fetch on an already-staged sample is a no-op:
    an entry that already carries a sha256 is left untouched."""
    existing_digest = "a" * 64
    manifest_lines = [
        '[[sample]]\n',
        'name = "x"\n',
        'family = "F"\n',
        'category = "rat"\n',
        'profile = "shell-resident"\n',
        f'sha256 = "{existing_digest}"\n',
        'description = "d"\n',
    ]
    manifest = tmp_path / "manifest.toml"
    manifest.write_text("".join(manifest_lines))
    original_text = manifest.read_text()

    afs.update_manifest_entry(
        manifest,
        "x",
        source="theZoo",
        sha256="b" * 64,
        url="https://example.invalid/",
    )

    rewritten_text = manifest.read_text()
    assert original_text == rewritten_text, "should not overwrite an existing sha256"
|