From 05bf785f0a950bb9a33502800153b332aed65d13 Mon Sep 17 00:00:00 2001
From: max
Date: Sat, 2 May 2026 13:51:20 -0500
Subject: [PATCH] fleet-health: exit 0 when alerts found (don't mark unit failed)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The detector previously returned 1 on alerts, which made systemd mark
cis490-fleet-health.service as 'failed' every tick that found a sick
host. That's the wrong UX — a detector finding a fault is working
correctly, not crashing. The alert is the signal (via WARNING log +
alerts.jsonl); the unit's success state should mean "the detector
itself ran cleanly." Test added.

Caught while live-deploying on the Pi: the first run found
elliott-thinkpad fatal-only at 943×4xx + 1425×5xx and correctly emitted
the alert — but systemd showed the unit red, which would have caused
operators to chase the wrong tail.

Side note: the same first run also caught a real bug — pycache for
receiver.store on /opt/cis490 was stale after I deployed the new
app.py + store.py from main, causing 1464 × 500 responses. Cleared the
pycache and the index immediately resumed growing (4465 → 4515 in 30
seconds). The detector earned its keep on the very first cycle.

Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_fleet_health.py | 36 ++++++++++++++++++++++++++++++++++++ tools/check_fleet_health.py | 11 ++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/tests/test_fleet_health.py b/tests/test_fleet_health.py index 0f90e48..43b5be7 100644 --- a/tests/test_fleet_health.py +++ b/tests/test_fleet_health.py @@ -300,3 +300,39 @@ def test_main_returns_zero_when_no_alerts(tmp_path: Path, rc = fh.main(["--index-path", str(idx), "--alerts-path", str(alerts)]) assert rc == 0 + + +def test_main_returns_zero_even_when_alerts_emitted(tmp_path: Path, + monkeypatch: pytest.MonkeyPatch) -> None: + """The detector exits 0 even when alerts are present — the alert + IS the signal (via alerts.jsonl + WARNING log), not the unit's + success state. systemd showing the unit as 'failed' on every + detection would be misleading: the detector itself is working + correctly.""" + idx = tmp_path / "index.jsonl" + now = time.time() + # Active host but silent for >30 min → triggers 'silent' alert. + _write_index(idx, [ + {"host_id": "lab1", "received_at_wall": _iso(now - 60 * 60)}, + ]) + alerts = tmp_path / "alerts.jsonl" + + real_run = fh.subprocess.run + + def fake_run(cmd, *args, **kwargs): + if cmd and cmd[0] == "journalctl": + class R: + returncode = 0 + stdout = "" + stderr = "" + return R() + return real_run(cmd, *args, **kwargs) + + monkeypatch.setattr(fh.subprocess, "run", fake_run) + rc = fh.main(["--index-path", str(idx), + "--alerts-path", str(alerts)]) + assert rc == 0 + # The alert WAS emitted, just not via exit code. 
+ assert alerts.exists() + rows = alerts.read_text().splitlines() + assert len(rows) >= 1 diff --git a/tools/check_fleet_health.py b/tools/check_fleet_health.py index a23b432..9567cd7 100644 --- a/tools/check_fleet_health.py +++ b/tools/check_fleet_health.py @@ -355,7 +355,16 @@ def main(argv: list[str] | None = None) -> int: if new_count == 0: log.info("fleet healthy — %d hosts checked, no new alerts", len(hosts)) - return 1 if new_count else 0 + else: + log.info("emitted %d new alert(s); see %s", + new_count, args.alerts_path) + # Exit 0 even when alerts are emitted: the alert IS the signal, + # not the unit's success/failure state. systemd treating "alerts + # found" as unit-failed is a UX wart — it makes `systemctl status` + # always red on a healthy detector that's just watching a fault. + # Operators consume alerts via journalctl + alerts.jsonl; the + # unit's failure state should mean "the detector itself broke." + return 0 if __name__ == "__main__":