This is the chunk that makes "real data" actually flow on multiple
hosts in parallel. End-to-end pipe was up at 613c6fa / 2579683; now
the lab-host side has the diversity + concurrency it needs.
Collectors landed:
collectors/qmp.py — source 2 (oracle). Tiny synchronous QMP
client + row builder + run loop. Tolerates
older qemu without query-stats.
collectors/guest_agent.py — source 5 (deployable). Reads the
virtio-serial host-side socket, parses
agent JSON-lines, re-stamps to the host
monotonic clock, persists.
collectors/pcap.py — source 4 (deployable). tcpdump capture
+ pure-Python pcap reader + 100 ms
netflow.jsonl bucketizer. Decodes
Ethernet/IPv4/TCP/UDP enough for the
schema in docs/data-model.md.
In-guest agent:
vm/guest-agent/cis490_agent.py — stdlib-only Python agent. Reads
/proc/{stat,meminfo,loadavg,net/dev,net/tcp*}, top-N RSS procs,
thermal. Writes JSON-lines to /dev/virtio-ports/cis490.guest.agent.
tools/build_cidata.py — embeds the agent + an OpenRC service into
user-data so first boot of the Alpine cidata image auto-starts it.
Launchers:
vm/launch_demo.sh / launch_target.sh — second virtio-serial port for
the agent socket; SLOT env support so multiple VMs run without
socket / port collisions; PORT_BASE on launch_target so multiple
target VMs hostfwd different host ports.
vm/setup_bridge.sh — creates host-only br-malware (10.200.0.1/24,
no NAT). Idempotent.
Fleet:
orchestrator/fleet.py — capacity detector (cores / RAM / load
headroom) + concurrent-slot runner. Per-slot ENV selects the
sample. FleetCapacity dataclass round-trips into meta.json so
"this episode ran with 6 concurrent VMs" is auditable post-hoc.
tools/run_fleet.py — CLI: --capacity report; --waves N runs N
waves of (max_concurrent) episodes each, every slot with a
different sample.
etc/cis490-orchestrator.service — now drives the fleet runner with
Restart=always so each invocation runs one wave and respawns,
giving a continuous stream.
Samples:
samples/manifest.toml — six profiles spanning the five major
behaviour shapes. Each entry is real OR mimic (sha256 distinguishes).
samples/manifest.py — strict TOML loader (rejects dups, unknown
categories) + deterministic select(host_id, slot, episode_index)
so different hosts on the network walk the catalog in different
orders without any coordinator.
EpisodeRunner:
orchestrator/episode.py — optional qmp_socket + guest_agent_socket
fields on EpisodeConfig; when set, additional collector threads
run alongside proc_qemu. EpisodeResult now carries rows_qmp +
rows_guest counters.
Tier-3 setup automation:
scripts/install-msfrpcd.sh — installs metasploit-framework where
the package manager has it, generates a strong password into
/etc/cis490/msfrpc.env, drops a hardened systemd unit bound to
127.0.0.1:55553. After this, run_tier3_demo.py works zero-touch
once MSFRPC_PASSWORD is sourced.
scripts/fetch-metasploitable2.sh — accepts IMAGE_URL + IMAGE_SHA256
from the operator (Rapid7 download is registration-walled), pulls,
verifies, converts vmdk → qcow2, lands at vm/images/.
Tests: 82 pass (was 51). New suites:
tests/test_qmp.py — fake QMP server, capability handshake,
blockstats, async-event interleaving,
5-failure backoff
tests/test_guest_agent.py — fake virtio socket, JSON-lines read +
re-stamp, malformed-line tolerance
tests/test_pcap.py — synthetic pcap with TCP/UDP/ARP frames,
bucketize correctness across windows
tests/test_fleet.py — capacity math (8-core idle / low-RAM /
high-load / Pi5 / 1-core box), manifest
selection determinism + diversity
What's queued for the next commit (already discussed in convo):
- MSFExploitDriver v2: map sample.profile → distinct in-session
workload so Tier-3 episodes don't all produce the same yes-loop
envelope. Critical for ML to learn varied malware shapes.
- Real-sample fetch from MalwareBazaar by sha256.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
187 lines
5.4 KiB
Python
187 lines
5.4 KiB
Python
"""Build a NoCloud cidata ISO for cloud-init.
|
|
|
|
Cirros 0.6.x — and most cloud images — look for a NoCloud datasource at
|
|
boot: an ISO9660 volume labeled ``cidata`` containing two files,
|
|
``user-data`` and ``meta-data``. We attach it as a second drive so
|
|
cloud-init proceeds without spending ~17 minutes timing out trying to
|
|
reach a non-existent metadata service.
|
|
|
|
This script is intentionally self-contained and uses only pycdlib (pure
|
|
Python) — no system mkisofs/xorriso/cloud-localds dependency.
|
|
|
|
Usage:
|
|
|
|
uv run python tools/build_cidata.py vm/images/cidata.iso
|
|
|
|
The defaults bake in a ``cis490`` user with password ``cis490``, enable
SSH password auth (so future Metasploit-class images work without
changes), and set a hostname. Override via flags if needed.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import io
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pycdlib
|
|
|
|
|
|
# Leading part of the generated cloud-config document; build_user_data()
# appends write_files / runcmd sections after it.
# NOTE(review): credentials are hard-coded and root login is left enabled --
# presumably acceptable only for isolated lab VMs; confirm before reuse.
DEFAULT_USER_DATA_HEAD = """\
#cloud-config
hostname: cis490
manage_etc_hosts: true
users:
  - name: cis490
    plain_text_passwd: cis490
    lock_passwd: false
    sudo: ALL=(ALL) NOPASSWD:ALL
    shell: /bin/sh
ssh_pwauth: true
disable_root: false
chpasswd:
  expire: false
  list: |
    root:cis490
    cis490:cis490
"""
|
|
|
|
# OpenRC service file shipped inside the guest. Alpine uses OpenRC;
# the runcmd at the bottom of user-data wires it up on first boot.
OPENRC_SERVICE = """\
#!/sbin/openrc-run

description="CIS490 in-guest telemetry agent"
command="/usr/local/bin/cis490-agent"
command_args="--port /dev/virtio-ports/cis490.guest.agent"
command_background=true
pidfile="/run/cis490-agent.pid"
output_log="/var/log/cis490-agent.log"
error_log="/var/log/cis490-agent.log"

depend() {
    need localmount
}
"""
|
|
|
|
# Default NoCloud meta-data document, used when --meta-data is not given.
DEFAULT_META_DATA = """\
instance-id: cis490-vm-001
local-hostname: cis490
"""
|
|
|
|
|
|
def _indent(text: str, n: int) -> str:
|
|
pad = " " * n
|
|
return "\n".join(pad + line if line else line for line in text.splitlines())
|
|
|
|
|
|
def build_user_data(*, embed_agent: bool, agent_path: Path | None) -> bytes:
    """Build the cloud-init ``user-data`` document as UTF-8 bytes.

    The document always starts with ``DEFAULT_USER_DATA_HEAD`` and always
    carries a ``runcmd`` entry that writes ``CIS490_BOOT_OK`` to
    ``/tmp/.cis490-boot``.

    Args:
        embed_agent: When true, add ``write_files`` entries for the
            in-guest agent and its OpenRC service, plus ``runcmd`` entries
            that enable and start the service when ``rc-update`` /
            ``rc-service`` are available in the guest.
        agent_path: Agent script to embed. ``None`` selects
            ``vm/guest-agent/cis490_agent.py`` under the parent of this
            file's directory. Not consulted when ``embed_agent`` is false.

    Returns:
        The encoded cloud-config document.

    Raises:
        FileNotFoundError: ``embed_agent`` is true and the agent script
            does not exist.
    """
    # Shared by both variants so the boot marker cannot drift between them.
    boot_marker = '  - [ sh, -c, "echo CIS490_BOOT_OK > /tmp/.cis490-boot" ]\n'

    if not embed_agent:
        return (DEFAULT_USER_DATA_HEAD + "runcmd:\n" + boot_marker).encode("utf-8")

    if agent_path is None:
        repo_root = Path(__file__).resolve().parent.parent
        agent_path = repo_root / "vm" / "guest-agent" / "cis490_agent.py"
    if not agent_path.exists():
        raise FileNotFoundError(f"agent script not found: {agent_path}")
    # Decode explicitly as UTF-8: the result is re-encoded as UTF-8 below, so
    # relying on the locale default could corrupt non-ASCII source text.
    agent_src = agent_path.read_text(encoding="utf-8")

    # Keys of each write_files entry must be nested under the list dash, and
    # the block-scalar content must be indented deeper than ``content:``.
    body = DEFAULT_USER_DATA_HEAD + (
        "write_files:\n"
        "  - path: /usr/local/bin/cis490-agent\n"
        "    permissions: '0755'\n"
        "    owner: root:root\n"
        "    content: |\n"
        f"{_indent(agent_src, 6)}\n"
        "  - path: /etc/init.d/cis490-agent\n"
        "    permissions: '0755'\n"
        "    owner: root:root\n"
        "    content: |\n"
        f"{_indent(OPENRC_SERVICE, 6)}\n"
        "runcmd:\n"
        f"{boot_marker}"
        # ``|| true`` keeps first boot going on guests without OpenRC tools.
        '  - [ sh, -c, "command -v rc-update >/dev/null && rc-update add cis490-agent default || true" ]\n'
        '  - [ sh, -c, "command -v rc-service >/dev/null && rc-service cis490-agent start || true" ]\n'
    )
    return body.encode("utf-8")
|
|
|
|
# NOTE(review): this repeats the DEFAULT_META_DATA assignment made earlier in
# this module with the same value; being the later one, it is the binding that
# takes effect. One of the two can be dropped.
DEFAULT_META_DATA = """\
instance-id: cis490-vm-001
local-hostname: cis490
"""
|
|
|
|
|
|
def build_cidata(out_path: Path, user_data: bytes, meta_data: bytes) -> None:
    """Write an ISO volume labeled ``cidata`` holding the two NoCloud files.

    Args:
        out_path: Destination path of the ISO image.
        user_data: Bytes stored as ``user-data``.
        meta_data: Bytes stored as ``meta-data``.
    """
    iso = pycdlib.PyCdlib()
    # Joliet=3 + Rock Ridge so cloud-init reads filenames correctly on Linux.
    iso.new(joliet=3, vol_ident="cidata", interchange_level=3, rock_ridge="1.09")
    try:
        # (payload, ISO9660 name, Rock Ridge / Joliet name) per file.
        entries = (
            (user_data, "/USERDATA.;1", "user-data"),
            (meta_data, "/METADATA.;1", "meta-data"),
        )
        for payload, iso_path, name in entries:
            iso.add_fp(
                io.BytesIO(payload),
                len(payload),
                iso_path=iso_path,
                rr_name=name,
                joliet_path="/" + name,
            )
        iso.write(str(out_path))
    finally:
        # Release the pycdlib object even when adding or writing fails.
        iso.close()
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: build the cidata ISO at ``out_path``.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which matches a no-argument call.

    Returns:
        ``0`` after the image has been written.
    """
    parser = argparse.ArgumentParser(prog="build_cidata")
    parser.add_argument("out_path", type=Path)
    parser.add_argument(
        "--user-data",
        type=Path,
        default=None,
        help="path to a custom cloud-config user-data file",
    )
    parser.add_argument(
        "--meta-data",
        type=Path,
        default=None,
        help="path to a custom meta-data file",
    )
    parser.add_argument(
        "--no-embed-agent",
        action="store_true",
        help="don't bake the in-guest agent into user-data",
    )
    parser.add_argument(
        "--agent-path",
        type=Path,
        default=None,
        help="path to the in-guest agent (default: vm/guest-agent/cis490_agent.py)",
    )
    args = parser.parse_args(argv)

    if args.user_data:
        # A custom user-data file is used verbatim, so nothing is embedded;
        # say so instead of silently dropping --agent-path.
        if args.agent_path is not None:
            print(
                "warning: --agent-path is ignored when --user-data is given",
                file=sys.stderr,
            )
        user_data = args.user_data.read_bytes()
    else:
        user_data = build_user_data(
            embed_agent=not args.no_embed_agent,
            agent_path=args.agent_path,
        )

    if args.meta_data:
        meta_data = args.meta_data.read_bytes()
    else:
        meta_data = DEFAULT_META_DATA.encode()

    # Create the destination directory so a fresh checkout works.
    args.out_path.parent.mkdir(parents=True, exist_ok=True)
    build_cidata(args.out_path, user_data, meta_data)
    print(f"wrote {args.out_path} ({args.out_path.stat().st_size} bytes)")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|