Wraps up the gaps surfaced in the "what is not implemented" audit so the
fleet really is shippable end-to-end. Verified live on the Pi:
- cis490-shipper --ping → HTTP 200 through Caddy + mTLS via the
new wg-pki client CA leaf
- real episode dir → tar+zstd → PUT → HTTP 201 stored
- re-ship same bytes → 200 (idempotent)
- re-ship different bytes under same id → 409 (conflict)
Changes:
orchestrator/episode.py
- EpisodeConfig.revert_at_start / revert_at_end (Tier 0+ snapshot/
revert per docs/architecture.md). When set + qmp_socket present,
EpisodeRunner issues loadvm <snapshot_name> and emits
snapshot_revert / snapshot_revert_failed events on the same
monotonic clock as everything else.
collectors/qmp.py
- savevm() / loadvm() helpers using human-monitor-command, plus a
test against the fake QMP server.
exploits/workloads.py
- chunked_real_binary_upload() returns a ChunkedUpload plan: 8 KiB
base64 chunks (~6 KiB binary each) so msfrpc never sees a buffer-
busting payload. Includes a finalize step that sha256-verifies on
the guest before exec.
- real_binary_workload() now wraps the chunked plan for backwards
compat with single-shot callers.
exploits/driver.py
- Tier-4 dispatch walks the chunked plan in MSFExploitDriver:
each chunk is a separate session_shell_write; finalize verifies;
exec only runs on sha-ok. New events: real_binary_upload_begin,
real_binary_verify, real_binary_aborted.
etc/cis490-orchestrator.service
- Reads /etc/cis490/lab-host.env (FLEET_HOST_ID + optional BRIDGE).
- Grants AmbientCapabilities CAP_NET_RAW (tcpdump for source 4) +
CAP_SYS_ADMIN + CAP_PERFMON (perf for source 3) so collectors
work under hardening.
scripts/install-lab-host.sh
- Writes /etc/cis490/lab-host.env on first install with FLEET_HOST_ID
defaulting to `hostname -s`.
- Best-effort: fetches the Alpine baseline qcow2 (sha512-pinned) and
builds cidata.iso with the in-guest agent embedded; symlinks both
into /opt/cis490/vm/images/ so launchers find them.
scripts/fetch-alpine-baseline.sh
- Idempotent fetcher for the Alpine 3.21 cloud-init nocloud qcow2
matching the sha512 in docs/sources.md.
tools/plot_envelope.py
- Rebuilt to render whatever telemetry the episode dir contains:
proc → QMP block ops → perf IPC/miss-rate → bridge pkts/SYNs →
guest agent load/mem. Missing sources are silently skipped.
tools/index_reader.py
- cis490-index CLI: filter receiver's index.jsonl by host / sample
/ time range, sort, count-by group. Closest thing to a query
interface until we stand up Postgres/Timescale.
samples/README.md
- Rewritten to match the new manifest schema, the kind=real vs mimic
split, the per-(host, slot, ep) selection mechanic, and the
chunked-upload safety story.
Tests: 106 pass (was 102). New cases:
- test_qmp.py — savevm + loadvm (HMP wrapper + error path)
- test_tier4.py — chunked plan splitting, sha-pinned finalize,
end-to-end driver walks all chunks + verify + exec via the fake
msfrpc client
Closes the "what is not implemented" punch list.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
262 lines
9.3 KiB
Python
262 lines
9.3 KiB
Python
"""Source 2 (oracle): QEMU QMP sampler.

Connects to the QEMU monitor protocol socket exposed by the launcher
($RUN_DIR/qmp.sock) and periodically queries the hypervisor for
per-VM stats that don't show up in /proc/<qemu_pid>:

- per-disk block I/O (rd_bytes, wr_bytes, rd_ops, wr_ops)
- VM run state (running / paused / shutdown)
- per-netdev tx/rx counters (when available)
- KVM stat counters (when available; introspection differs by qemu
  version, so anything we can't read is skipped silently)

This source is **oracle-only** — it does not exist on a deployed
device. Every row carries ``available_in_deployment: false``.

Wire format: QMP is line-delimited JSON. The handshake is fixed:

    server → {"QMP": {capabilities: [...], version: ...}}
    client → {"execute": "qmp_capabilities"}
    server → {"return": {}}
    (client may now issue commands)

We use a dedicated synchronous client because QMP is request/response
and we don't need pipelining; one query batch per tick keeps the
on-disk schema simple.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import socket
|
|
import threading
|
|
import time
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
log = logging.getLogger("cis490.collectors.qmp")
|
|
|
|
SOURCE = "host_qmp"
|
|
AVAILABLE_IN_DEPLOYMENT = False
|
|
|
|
|
|
class QMPError(RuntimeError):
    """Raised for QMP-level failures surfaced by this module.

    Used for read timeouts, a peer that closed the connection, an
    unexpected greeting, commands issued on a client that is not
    connected, and ``{"error": ...}`` replies to a command.
    """
|
|
|
|
|
|
@dataclass
|
|
class _SockReader:
|
|
sock: socket.socket
|
|
buf: bytes = b""
|
|
|
|
def read_line(self, timeout_s: float = 5.0) -> str:
|
|
deadline = time.monotonic() + timeout_s
|
|
while b"\n" not in self.buf:
|
|
self.sock.settimeout(max(0.1, deadline - time.monotonic()))
|
|
try:
|
|
chunk = self.sock.recv(8192)
|
|
except socket.timeout as e:
|
|
raise QMPError(f"QMP read timed out: {e}") from e
|
|
if not chunk:
|
|
raise QMPError("QMP connection closed by peer")
|
|
self.buf += chunk
|
|
line, _, rest = self.buf.partition(b"\n")
|
|
self.buf = rest
|
|
return line.decode("utf-8", errors="replace")
|
|
|
|
|
|
class QMPClient:
    """Tiny synchronous QMP client over a unix socket.

    One request is sent at a time and the client blocks until the
    matching ``{"return": ...}`` or ``{"error": ...}`` line arrives.
    Every protocol-level problem (timeout, closed peer, malformed or
    unexpected reply) is reported as ``QMPError``; socket-level failures
    while connecting or sending surface as ``OSError``.
    """

    def __init__(self, socket_path: str | Path) -> None:
        self.path = str(socket_path)
        self._sock: socket.socket | None = None
        self._reader: _SockReader | None = None

    @staticmethod
    def _decode(line: str, what: str) -> dict[str, Any]:
        """Parse one wire line into a JSON object.

        Malformed JSON or a non-object payload is turned into ``QMPError``
        so callers that only catch ``QMPError``/``OSError`` are not taken
        down by a ``ValueError`` from a misbehaving peer.
        """
        try:
            obj = json.loads(line)
        except ValueError as e:
            raise QMPError(
                f"{what}: malformed JSON from QMP peer: {line[:200]!r}"
            ) from e
        if not isinstance(obj, dict):
            raise QMPError(f"{what}: expected a JSON object, got {obj!r}")
        return obj

    def connect(self, timeout_s: float = 5.0) -> dict[str, Any]:
        """Connect, read the greeting and negotiate capabilities.

        Returns:
            The ``"QMP"`` member of the server greeting.

        Raises:
            OSError: if the socket cannot be connected.
            QMPError: if the greeting or capability handshake fails.

        On any failure the socket is closed and the client is left in the
        "not connected" state.
        """
        s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            s.settimeout(timeout_s)
            s.connect(self.path)
            self._sock = s
            self._reader = _SockReader(s)
            # Read greeting.
            greeting = self._decode(
                self._reader.read_line(timeout_s=timeout_s), "greeting"
            )
            if "QMP" not in greeting:
                raise QMPError(f"unexpected QMP greeting: {greeting!r}")
            # Negotiate capabilities (no flags requested).
            self.execute("qmp_capabilities")
        except BaseException:
            # Don't leak the fd (or leave a half-initialised client behind)
            # when the connect or handshake fails.
            self._sock = None
            self._reader = None
            try:
                s.close()
            except OSError:
                pass
            raise
        return greeting["QMP"]

    def execute(self, command: str, **arguments: Any) -> Any:
        """Send ``command`` and return the value of its ``"return"`` reply.

        Keyword arguments are sent as the QMP ``"arguments"`` object.

        Raises:
            QMPError: if not connected, the server replies with an error,
                the reply is malformed, or no reply is seen within 64
                received lines.
        """
        if self._sock is None or self._reader is None:
            raise QMPError("not connected")
        msg: dict[str, Any] = {"execute": command}
        if arguments:
            msg["arguments"] = arguments
        body = (json.dumps(msg) + "\n").encode("utf-8")
        self._sock.sendall(body)
        # QMP can interleave async events with the response — drain
        # until we see the matching {"return": ...} or {"error": ...}.
        for _ in range(64):  # bounded to avoid an infinite loop on bugs
            line = self._reader.read_line()
            if not line.strip():
                continue
            resp = self._decode(line, command)
            if "return" in resp:
                return resp["return"]
            if "error" in resp:
                raise QMPError(f"{command}: {resp['error']}")
            # Otherwise it's an async event; ignore and keep reading.
        raise QMPError(f"{command}: too many async events without a response")

    # ---- snapshot / revert (via human-monitor-command) -----------------

    def savevm(self, name: str) -> str:
        """``savevm <name>`` — capture a live VM snapshot inside the
        qcow2. Returns the monitor's reply (empty string on success).
        Requires the disk to be qcow2 (our launchers always are)."""
        return self._hmp(f"savevm {name}")

    def loadvm(self, name: str) -> str:
        """``loadvm <name>`` — restore the named snapshot. The guest
        is paused, restored, and resumed; collectors continue
        sampling and just see a sharp transition."""
        return self._hmp(f"loadvm {name}")

    def _hmp(self, cmd: str) -> str:
        """Run ``cmd`` through ``human-monitor-command`` and return the
        monitor's textual output ("" if the reply is not a string)."""
        out = self.execute("human-monitor-command", **{"command-line": cmd})
        return out if isinstance(out, str) else ""

    def close(self) -> None:
        """Close the socket if open; safe to call more than once."""
        if self._sock is not None:
            try:
                self._sock.close()
            except OSError:
                pass
        self._sock = None
        self._reader = None
|
|
|
|
|
|
# ---- row builders ----------------------------------------------------------
|
|
|
|
|
|
def _flatten_blockstats(blockstats: list[dict] | None) -> dict[str, dict[str, int]]:
|
|
"""Compact ``query-blockstats`` to ``{device: {rd_ops, wr_ops, ...}}``."""
|
|
out: dict[str, dict[str, int]] = {}
|
|
for entry in blockstats or []:
|
|
name = entry.get("device") or entry.get("qdev") or "unknown"
|
|
s = entry.get("stats") or {}
|
|
out[name] = {
|
|
"rd_ops": int(s.get("rd_operations", 0)),
|
|
"wr_ops": int(s.get("wr_operations", 0)),
|
|
"rd_bytes": int(s.get("rd_bytes", 0)),
|
|
"wr_bytes": int(s.get("wr_bytes", 0)),
|
|
"flush_ops": int(s.get("flush_operations", 0)),
|
|
}
|
|
return out
|
|
|
|
|
|
def collect_once(client: QMPClient, t_mono_origin_ns: int) -> dict[str, Any]:
    """Run one batch of QMP queries and return a single telemetry row.

    The row always carries ``t_mono_ns`` (monotonic time relative to
    ``t_mono_origin_ns``), ``t_wall_ns``, ``source`` and
    ``available_in_deployment``. Each query that succeeds adds its own
    fields (``vm_status``/``vm_running``, ``blockstats``, ``kvm_stats``);
    a query that fails with ``QMPError`` is logged at debug level and its
    fields are simply omitted.

    Raises:
        QMPError: if *none* of the queries succeeded. A row with only
            timestamps carries no information, and swallowing the error
            would hide a dead or hung hypervisor from the caller's
            failure accounting.
    """
    row: dict[str, Any] = {
        "t_mono_ns": time.monotonic_ns() - t_mono_origin_ns,
        "t_wall_ns": time.time_ns(),
        "source": SOURCE,
        "available_in_deployment": AVAILABLE_IN_DEPLOYMENT,
    }
    succeeded = 0
    last_error: QMPError | None = None

    # query-status is dirt cheap and tells us whether the guest is
    # paused (rare) or running.
    try:
        status = client.execute("query-status")
        row["vm_status"] = status.get("status")
        row["vm_running"] = bool(status.get("running"))
        succeeded += 1
    except QMPError as e:
        last_error = e
        log.debug("query-status failed: %s", e)

    try:
        bs = client.execute("query-blockstats")
        row["blockstats"] = _flatten_blockstats(bs)
        succeeded += 1
    except QMPError as e:
        last_error = e
        log.debug("query-blockstats failed: %s", e)

    # query-stats is QEMU 7.1+ and the schema varies across versions.
    # We only ask for KVM stats and tolerate any subset of fields.
    try:
        stats = client.execute("query-stats", target="vm")
        row["kvm_stats"] = _summarize_query_stats(stats)
        succeeded += 1
    except QMPError as e:
        last_error = e
        log.debug("query-stats not supported: %s", e)

    if succeeded == 0:
        # Partial rows are fine (e.g. query-stats unsupported), but when
        # every query failed the transport itself is suspect.
        raise QMPError(
            f"all QMP queries failed; last error: {last_error}"
        ) from last_error

    return row
|
|
|
|
|
|
def _summarize_query_stats(stats_resp: list[dict] | dict) -> dict[str, int]:
|
|
"""Reduce ``query-stats`` to a flat name→value map of integer
|
|
counters. The full payload is verbose and version-specific; we only
|
|
ever want individual scalar counters downstream."""
|
|
flat: dict[str, int] = {}
|
|
items = stats_resp if isinstance(stats_resp, list) else [stats_resp]
|
|
for entry in items:
|
|
for s in entry.get("stats", []) or []:
|
|
name = s.get("name")
|
|
value = s.get("value")
|
|
if isinstance(name, str) and isinstance(value, int):
|
|
flat[name] = value
|
|
return flat
|
|
|
|
|
|
# ---- run loop --------------------------------------------------------------
|
|
|
|
|
|
def run_loop(
    socket_path: str | Path,
    output_path: Path,
    t_mono_origin_ns: int,
    interval_ms: int,
    stop_event: threading.Event,
) -> int:
    """Connect to ``socket_path`` and sample at ``interval_ms`` until
    ``stop_event``. Returns the number of rows written.

    Rows are appended to ``output_path`` as one JSON object per line
    (the parent directory is created if needed). If the initial connect
    fails the collector logs a warning and returns 0.

    A single missed sample (transient QMP error) is logged and skipped;
    repeated failures terminate the loop so the episode finishes cleanly
    rather than hanging on a dead hypervisor.
    """
    interval_ns = interval_ms * 1_000_000
    client = QMPClient(socket_path)
    try:
        client.connect(timeout_s=5.0)
    except (OSError, QMPError, ValueError) as e:
        # ValueError covers a peer that answers with malformed JSON.
        # Close explicitly: a failed handshake may leave the socket open.
        client.close()
        log.warning("QMP connect to %s failed: %s — collector exits cleanly", socket_path, e)
        return 0

    rows = 0
    consecutive_failures = 0
    next_tick = time.monotonic_ns()
    try:
        # Inside the try so the socket is released even if the output
        # directory or file cannot be created.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open("a", buffering=1) as f:
            while not stop_event.is_set():
                try:
                    row = collect_once(client, t_mono_origin_ns)
                    f.write(json.dumps(row) + "\n")
                    rows += 1
                    consecutive_failures = 0
                except (QMPError, OSError, ValueError) as e:
                    # ValueError: malformed JSON from the peer counts as a
                    # failed sample rather than killing the collector.
                    consecutive_failures += 1
                    log.warning("QMP sample %d failed: %s", rows, e)
                    if consecutive_failures >= 5:
                        log.warning("5 consecutive QMP failures; bailing")
                        break

                # Fixed-rate schedule: advance the target tick and sleep
                # until it; if we are already late, resynchronise to "now"
                # instead of bursting to catch up.
                next_tick += interval_ns
                sleep_ns = next_tick - time.monotonic_ns()
                if sleep_ns > 0:
                    # wait() (not sleep) so a stop request ends the pause early.
                    stop_event.wait(sleep_ns / 1_000_000_000)
                else:
                    next_tick = time.monotonic_ns()
    finally:
        client.close()
    return rows
|