diff --git a/AGENTS.md b/AGENTS.md index 9fbe907..865653a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -201,6 +201,31 @@ older clone: **If you hit any of these on a fresh install, pull main first** before filing an issue — the issue is probably already closed. +### Don't trust the in-guest probe alone — cross-check host CPU + +The `pre_kill_probe.yes` / `pre_kill_probe.sh` fields in +`workload_killed` events are produced by `pgrep` running inside an +Alpine guest. busybox's pgrep does NOT support the `-c` flag. Older +versions of `VMLoadController._probe()` used `pgrep -c yes`, which +exits 1 with a usage banner on busybox; the `|| echo 0` fallback then +always reported `yes=0` regardless of whether the workload was +running. This caused 244 episodes from `elliott-thinkpad` and +`k-gamingcom` to be incorrectly labelled `workload-silent`. + +The fix landed in main (probe now uses `pgrep yes | wc -l`); episodes +shipped after that commit have correct probe values. For older +episodes still on disk, the prune classifier now requires `flat-cpu` +(host-side CPU envelope confirms no signal) AND the probe to flag +workload-silent — a probe-only zero is no longer trusted. So you can +safely run `cis490-prune --archive` against the existing data without +losing valid episodes. + +If you write any new in-guest diagnostic that runs commands via +SerialClient, assume busybox/ash semantics: no `disown` builtin, no +GNU `pgrep -c`, no bash `/dev/tcp`, no `[[ ]]`. Always pair an +in-guest signal with the host-side `/proc` measurement before you +declare an episode bad. + ### One traceback at a time When the doctor lights up multiple red rows, fix the topmost one and diff --git a/exploits/workloads.py b/exploits/workloads.py index 9854483..26865a0 100644 --- a/exploits/workloads.py +++ b/exploits/workloads.py @@ -68,6 +68,11 @@ def _wrap_loop(name: str, body: str) -> Workload: script_path = f"/tmp/.cis490-workload-{name}.sh" # Triple-quote the body into a heredoc so single-quotes inside the # body don't conflict with our outer single-quoting. + # No `disown` here: it isn't a builtin in busybox sh / ash, so on + # Alpine guests the `disown` line printed `sh: disown: not found` + # into the captured output of every infected_running phase. nohup + # already gives SIGHUP immunity, which is the only thing disown + # was for. See spectral/CIS490#15. start = ( f"cat > {script_path} <<'CIS490_EOF'\n" f"#!/bin/sh\n" @@ -79,7 +84,6 @@ def _wrap_loop(name: str, body: str) -> Workload: f"chmod +x {script_path}; " f"nohup sh {script_path} /dev/null 2>&1 &\n" f"echo $! > {pid_path}\n" - f"disown\n" ) stop = ( f"if [ -f {pid_path} ]; then " diff --git a/tests/test_prune.py b/tests/test_prune.py index 21f9b26..d6cd72c 100644 --- a/tests/test_prune.py +++ b/tests/test_prune.py @@ -180,21 +180,55 @@ def test_workload_failed_flag(tmp_path: Path) -> None: def test_workload_silent_flag(tmp_path: Path) -> None: - """The elliott-lab fingerprint: dormant probe shows yes=0, - meaning the workload never actually fired.""" + """The elliott-lab fingerprint: dormant probe AND host-side CPU + both confirm the workload never fired. Both signals must agree + before we flag workload-silent (see CIS490#15 — the in-guest probe + alone was unreliable on busybox).""" tar = _make_episode( tmp_path, - **{"01TEST/events.jsonl": _events([ - {"event": "workload_setup"}, - {"event": "workload_started", "phase": "infected_running"}, - {"event": "workload_killed", "phase": "dormant", - "pre_kill_probe": {"yes": "0", "loadavg": "0.18"}}, - ])}, + **{ + "01TEST/events.jsonl": _events([ + {"event": "workload_setup"}, + {"event": "workload_started", "phase": "infected_running"}, + {"event": "workload_killed", "phase": "dormant", + "pre_kill_probe": {"yes": "0", "loadavg": "0.18"}}, + ]), + # Flat host CPU corroborates the probe — both agree no + # signal → workload-silent legitimately flags. + "01TEST/telemetry-proc.jsonl": _proc_rows(flat=True, n=60), + }, ) q = pe.classify_episode(tar, host_id="lab1", episode_id="01TEST") assert "workload-silent" in q.reasons +def test_workload_silent_suppressed_when_host_cpu_real(tmp_path: Path) -> None: + """CIS490#15 regression: busybox pgrep -c is unsupported, so the + in-guest probe always reports yes=0 on Alpine guests even when the + workload is saturating the vCPU. If host-side /proc telemetry shows + a real inter-phase CPU envelope, trust the host and DROP the + probe-based workload-silent reason — otherwise we false-positive + every Alpine episode.""" + tar = _make_episode( + tmp_path, + **{ + "01TEST/events.jsonl": _events([ + {"event": "workload_setup"}, + {"event": "workload_started", "phase": "infected_running"}, + {"event": "workload_killed", "phase": "dormant", + "pre_kill_probe": {"yes": "0", "loadavg": "0.18"}}, + ]), + # Sharp host CPU envelope — workload IS running. Default + # _make_episode already supplies _proc_rows(flat=False). + }, + ) + q = pe.classify_episode(tar, host_id="lab1", episode_id="01TEST") + assert "workload-silent" not in q.reasons, ( + f"probe-only signal must not flag silent when host CPU is real; " + f"got reasons={q.reasons}" + ) + + def test_flat_cpu_flag(tmp_path: Path) -> None: """When the proc CPU% spread between phases is < 5pp, the episode has no signal for the trainer to learn from.""" diff --git a/tools/prune_episodes.py b/tools/prune_episodes.py index ae523c1..2c9afad 100644 --- a/tools/prune_episodes.py +++ b/tools/prune_episodes.py @@ -158,7 +158,13 @@ def classify_episode(tar_zst: Path, host_id: str, episode_id: str) -> EpisodeQua if any(e.get("event") == "workload_failed" for e in events): q.reasons.append("workload-failed") - # workload-silent: dormant transition's probe shows no `yes` proc. + # workload-silent (provisional): dormant transition's probe shows + # no `yes` proc. This is a weak signal on its own — see CIS490#15: + # busybox pgrep -c is unsupported, so pre-fix episodes always + # report yes=0 even when the workload is saturating the vCPU. We + # only confirm workload-silent when host-side /proc telemetry + # (computed below) AGREES that no signal is present (flat-cpu). + probe_says_silent = False for e in events: if e.get("event") != "workload_killed": continue @@ -166,7 +172,7 @@ def classify_episode(tar_zst: Path, host_id: str, episode_id: str) -> EpisodeQua continue probe = e.get("pre_kill_probe") if isinstance(probe, dict) and probe.get("yes") == "0": - q.reasons.append("workload-silent") + probe_says_silent = True break # flat-cpu: bucket /proc CPU% by phase, check inter-phase spread. @@ -198,6 +204,13 @@ def classify_episode(tar_zst: Path, host_id: str, episode_id: str) -> EpisodeQua if medians and (max(medians) - min(medians)) < 5.0: q.reasons.append("flat-cpu") + # Confirm workload-silent only when host-side telemetry agrees. + # If the probe said silent but /proc CPU% shows a real inter-phase + # delta (i.e. NOT flat-cpu), trust the host-side ground truth and + # discard the probe result — the probe is busybox-pgrep-broken. + if probe_says_silent and "flat-cpu" in q.reasons: + q.reasons.append("workload-silent") + return q diff --git a/tools/vm_load_controller.py b/tools/vm_load_controller.py index a8861d2..2f9d94b 100644 --- a/tools/vm_load_controller.py +++ b/tools/vm_load_controller.py @@ -143,11 +143,20 @@ class VMLoadController: def _probe(self) -> dict: """Ask the guest what's actually running. Returns a small dict the caller stamps into the event so trainers can detect the - "workload didn't fire" case from meta alone.""" + "workload didn't fire" case from meta alone. + + Counts processes via ``pgrep | wc -l`` rather than + ``pgrep -c ``: the latter is a procps-ng/util-linux flag + and is NOT supported by busybox's pgrep (Alpine guests). On + busybox, ``pgrep -c`` exits 1 with a usage banner, the + ``|| echo 0`` fallback always fires, and the probe reports + false zeros. See spectral/CIS490#15 — this caused 244 episodes + from elliott-thinkpad and k-gamingcom to be incorrectly + labelled workload-silent even when the workload was running.""" try: out = self.s.run( - "echo yes=$(pgrep -c yes 2>/dev/null || echo 0); " - "echo sh=$(pgrep -c sh 2>/dev/null || echo 0); " + "echo yes=$(pgrep yes 2>/dev/null | wc -l); " + "echo sh=$(pgrep sh 2>/dev/null | wc -l); " "echo loadavg=$(awk '{print $1}' /proc/loadavg)" ) stats: dict = {}