Merge remote-tracking branch 'origin/main' into Dev_REL1_043026

This commit is contained in:
Elliott Kolden 2026-04-30 16:34:01 -06:00
commit 3d4936a227
6 changed files with 104 additions and 17 deletions

View file

@ -201,6 +201,31 @@ older clone:
**If you hit any of these on a fresh install, pull main first** before
filing an issue — the issue is probably already closed.
### Don't trust the in-guest probe alone — cross-check host CPU
The `pre_kill_probe.yes` / `pre_kill_probe.sh` fields in
`workload_killed` events are produced by `pgrep` running inside an
Alpine guest. busybox's pgrep does NOT support the `-c` flag. Older
versions of `VMLoadController._probe()` used `pgrep -c yes`, which
exits 1 with a usage banner on busybox; the `|| echo 0` fallback then
always reported `yes=0` regardless of whether the workload was
running. This caused 244 episodes from `elliott-thinkpad` and
`k-gamingcom` to be incorrectly labelled `workload-silent`.
The fix landed in main (probe now uses `pgrep yes | wc -l`); episodes
shipped after that commit have correct probe values. For older
episodes still on disk, the prune classifier now requires `flat-cpu`
(host-side CPU envelope confirms no signal) AND the probe to flag
workload-silent — a probe-only zero is no longer trusted. So you can
safely run `cis490-prune --archive` against the existing data without
losing valid episodes.
If you write any new in-guest diagnostic that runs commands via
SerialClient, assume busybox/ash semantics: no `disown` builtin, no
GNU `pgrep -c`, no bash `/dev/tcp`, no `[[ ]]`. Always pair an
in-guest signal with the host-side `/proc` measurement before you
declare an episode bad.
### One traceback at a time
When the doctor lights up multiple red rows, fix the topmost one and

View file

@ -17,10 +17,12 @@ qcow_image = "/var/lib/cis490/vm/images/metasploitable2.qcow2"
[receiver]
# The receiver lives behind Caddy on the WG-side collector host. The
# hostname must resolve over WG (collector.wg in the canonical
# spectral lab). The wg-pki CA must be on every lab-host so the
# Caddy-issued internal cert validates.
# spectral lab). ca_bundle pins the Caddy root CA (bundled in the
# repo) so the shipper can verify the server's TLS cert. The wg-pki
# client CA (wg-ca.pem from the bootstrap tarball) is the RECEIVER's
# trust anchor for our client cert — we don't configure it here.
url = "https://collector.wg"
ca_bundle = "/etc/cis490/certs/wg-ca.pem"
ca_bundle = "/opt/cis490/etc/caddy-root.crt"
# mTLS: leaf cert + private key issued by wg-pki for THIS host_id.
# Comment these out to fall back to bearer-token auth during early

View file

@ -68,6 +68,11 @@ def _wrap_loop(name: str, body: str) -> Workload:
script_path = f"/tmp/.cis490-workload-{name}.sh"
# Triple-quote the body into a heredoc so single-quotes inside the
# body don't conflict with our outer single-quoting.
# No `disown` here: it isn't a builtin in busybox sh / ash, so on
# Alpine guests the `disown` line printed `sh: disown: not found`
# into the captured output of every infected_running phase. nohup
# already gives SIGHUP immunity, which is the only thing disown
# was for. See spectral/CIS490#15.
start = (
f"cat > {script_path} <<'CIS490_EOF'\n"
f"#!/bin/sh\n"
@ -79,7 +84,6 @@ def _wrap_loop(name: str, body: str) -> Workload:
f"chmod +x {script_path}; "
f"nohup sh {script_path} </dev/null >/dev/null 2>&1 &\n"
f"echo $! > {pid_path}\n"
f"disown\n"
)
stop = (
f"if [ -f {pid_path} ]; then "

View file

@ -180,21 +180,55 @@ def test_workload_failed_flag(tmp_path: Path) -> None:
def test_workload_silent_flag(tmp_path: Path) -> None:
"""The elliott-lab fingerprint: dormant probe shows yes=0,
meaning the workload never actually fired."""
"""The elliott-lab fingerprint: dormant probe AND host-side CPU
both confirm the workload never fired. Both signals must agree
before we flag workload-silent (see CIS490#15 — the in-guest probe
alone was unreliable on busybox)."""
tar = _make_episode(
tmp_path,
**{"01TEST/events.jsonl": _events([
{"event": "workload_setup"},
{"event": "workload_started", "phase": "infected_running"},
{"event": "workload_killed", "phase": "dormant",
"pre_kill_probe": {"yes": "0", "loadavg": "0.18"}},
])},
**{
"01TEST/events.jsonl": _events([
{"event": "workload_setup"},
{"event": "workload_started", "phase": "infected_running"},
{"event": "workload_killed", "phase": "dormant",
"pre_kill_probe": {"yes": "0", "loadavg": "0.18"}},
]),
# Flat host CPU corroborates the probe — both agree no
# signal → workload-silent legitimately flags.
"01TEST/telemetry-proc.jsonl": _proc_rows(flat=True, n=60),
},
)
q = pe.classify_episode(tar, host_id="lab1", episode_id="01TEST")
assert "workload-silent" in q.reasons
def test_workload_silent_suppressed_when_host_cpu_real(tmp_path: Path) -> None:
"""CIS490#15 regression: busybox pgrep -c is unsupported, so the
in-guest probe always reports yes=0 on Alpine guests even when the
workload is saturating the vCPU. If host-side /proc telemetry shows
a real inter-phase CPU envelope, trust the host and DROP the
probe-based workload-silent reason otherwise we false-positive
every Alpine episode."""
tar = _make_episode(
tmp_path,
**{
"01TEST/events.jsonl": _events([
{"event": "workload_setup"},
{"event": "workload_started", "phase": "infected_running"},
{"event": "workload_killed", "phase": "dormant",
"pre_kill_probe": {"yes": "0", "loadavg": "0.18"}},
]),
# Sharp host CPU envelope — workload IS running. Default
# _make_episode already supplies _proc_rows(flat=False).
},
)
q = pe.classify_episode(tar, host_id="lab1", episode_id="01TEST")
assert "workload-silent" not in q.reasons, (
f"probe-only signal must not flag silent when host CPU is real; "
f"got reasons={q.reasons}"
)
def test_flat_cpu_flag(tmp_path: Path) -> None:
"""When the proc CPU% spread between phases is < 5pp, the episode
has no signal for the trainer to learn from."""

View file

@ -158,7 +158,13 @@ def classify_episode(tar_zst: Path, host_id: str, episode_id: str) -> EpisodeQua
if any(e.get("event") == "workload_failed" for e in events):
q.reasons.append("workload-failed")
# workload-silent: dormant transition's probe shows no `yes` proc.
# workload-silent (provisional): dormant transition's probe shows
# no `yes` proc. This is a weak signal on its own — see CIS490#15:
# busybox pgrep -c is unsupported, so pre-fix episodes always
# report yes=0 even when the workload is saturating the vCPU. We
# only confirm workload-silent when host-side /proc telemetry
# (computed below) AGREES that no signal is present (flat-cpu).
probe_says_silent = False
for e in events:
if e.get("event") != "workload_killed":
continue
@ -166,7 +172,7 @@ def classify_episode(tar_zst: Path, host_id: str, episode_id: str) -> EpisodeQua
continue
probe = e.get("pre_kill_probe")
if isinstance(probe, dict) and probe.get("yes") == "0":
q.reasons.append("workload-silent")
probe_says_silent = True
break
# flat-cpu: bucket /proc CPU% by phase, check inter-phase spread.
@ -198,6 +204,13 @@ def classify_episode(tar_zst: Path, host_id: str, episode_id: str) -> EpisodeQua
if medians and (max(medians) - min(medians)) < 5.0:
q.reasons.append("flat-cpu")
# Confirm workload-silent only when host-side telemetry agrees.
# If the probe said silent but /proc CPU% shows a real inter-phase
# delta (i.e. NOT flat-cpu), trust the host-side ground truth and
# discard the probe result — the probe is busybox-pgrep-broken.
if probe_says_silent and "flat-cpu" in q.reasons:
q.reasons.append("workload-silent")
return q

View file

@ -143,11 +143,20 @@ class VMLoadController:
def _probe(self) -> dict:
"""Ask the guest what's actually running. Returns a small dict
the caller stamps into the event so trainers can detect the
"workload didn't fire" case from meta alone."""
"workload didn't fire" case from meta alone.
Counts processes via ``pgrep <name> | wc -l`` rather than
``pgrep -c <name>``: the latter is a procps-ng/util-linux flag
and is NOT supported by busybox's pgrep (Alpine guests). On
busybox, ``pgrep -c`` exits 1 with a usage banner, the
``|| echo 0`` fallback always fires, and the probe reports
false zeros. See spectral/CIS490#15 — this caused 244 episodes
from elliott-thinkpad and k-gamingcom to be incorrectly
labelled workload-silent even when the workload was running."""
try:
out = self.s.run(
"echo yes=$(pgrep -c yes 2>/dev/null || echo 0); "
"echo sh=$(pgrep -c sh 2>/dev/null || echo 0); "
"echo yes=$(pgrep yes 2>/dev/null | wc -l); "
"echo sh=$(pgrep sh 2>/dev/null | wc -l); "
"echo loadavg=$(awk '{print $1}' /proc/loadavg)"
)
stats: dict = {}