From 05bf785f0a950bb9a33502800153b332aed65d13 Mon Sep 17 00:00:00 2001
From: max <mgorog@gmail.com>
Date: Sat, 2 May 2026 13:51:20 -0500
Subject: [PATCH] fleet-health: exit 0 when alerts found (don't mark unit
 failed)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The detector previously returned 1 on alerts, which made systemd
mark cis490-fleet-health.service as 'failed' every tick that found
a sick host. That's the wrong UX — a detector finding a fault is
working correctly, not crashing. The alert is the signal (via
WARNING log + alerts.jsonl); the unit's success state should mean
"the detector itself ran cleanly." Test added.

Caught while live-deploying on the Pi: the first run found
elliott-thinkpad fatal-only at 943×4xx + 1425×5xx and correctly
emitted the alert — but systemd showed the unit red, which would
have caused operators to chase the wrong trail.

Side note: the same first run also caught a real bug — pycache for
receiver.store on /opt/cis490 was stale after I deployed the new
app.py + store.py from main, causing 1464 × 500 responses. Cleared
the pycache and the index immediately resumed growing (4465 →
4515 in 30 seconds). The detector earned its keep on the very
first cycle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_fleet_health.py  | 36 ++++++++++++++++++++++++++++++++++++
 tools/check_fleet_health.py | 11 ++++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/tests/test_fleet_health.py b/tests/test_fleet_health.py
index 0f90e48..43b5be7 100644
--- a/tests/test_fleet_health.py
+++ b/tests/test_fleet_health.py
@@ -300,3 +300,39 @@ def test_main_returns_zero_when_no_alerts(tmp_path: Path,
     rc = fh.main(["--index-path", str(idx),
                   "--alerts-path", str(alerts)])
     assert rc == 0
+
+
+def test_main_returns_zero_even_when_alerts_emitted(tmp_path: Path,
+                                                    monkeypatch: pytest.MonkeyPatch) -> None:
+    """The detector exits 0 even when alerts are present — the alert
+    IS the signal (via alerts.jsonl + WARNING log), not the unit's
+    success state. systemd showing the unit as 'failed' on every
+    detection would be misleading: the detector itself is working
+    correctly."""
+    idx = tmp_path / "index.jsonl"
+    now = time.time()
+    # Active host but silent for >30 min → triggers 'silent' alert.
+    _write_index(idx, [
+        {"host_id": "lab1", "received_at_wall": _iso(now - 60 * 60)},  # stamped one hour ago
+    ])
+    alerts = tmp_path / "alerts.jsonl"  # not created here; main() is expected to write it
+
+    real_run = fh.subprocess.run  # keep the real runner for non-journalctl commands
+
+    def fake_run(cmd, *args, **kwargs):
+        if cmd and cmd[0] == "journalctl":  # stub out journalctl only
+            class R:  # stand-in result: exit status 0, empty output
+                returncode = 0
+                stdout = ""
+                stderr = ""
+            return R()
+        return real_run(cmd, *args, **kwargs)  # everything else runs for real
+
+    monkeypatch.setattr(fh.subprocess, "run", fake_run)  # undone by pytest after the test
+    rc = fh.main(["--index-path", str(idx),
+                  "--alerts-path", str(alerts)])
+    assert rc == 0  # exit status stays 0 despite the alert
+    # The alert WAS emitted, just not via exit code.
+    assert alerts.exists()
+    rows = alerts.read_text().splitlines()
+    assert len(rows) >= 1  # at least one line was written to alerts.jsonl
diff --git a/tools/check_fleet_health.py b/tools/check_fleet_health.py
index a23b432..9567cd7 100644
--- a/tools/check_fleet_health.py
+++ b/tools/check_fleet_health.py
@@ -355,7 +355,16 @@ def main(argv: list[str] | None = None) -> int:
     if new_count == 0:
         log.info("fleet healthy — %d hosts checked, no new alerts",
                  len(hosts))
-    return 1 if new_count else 0
+    else:
+        log.info("emitted %d new alert(s); see %s",
+                 new_count, args.alerts_path)
+    # Exit 0 even when alerts are emitted: the alert IS the signal,
+    # not the unit's success/failure state. systemd treating "alerts
+    # found" as unit-failed is a UX wart — it makes `systemctl status`
+    # always red on a healthy detector that's just watching a fault.
+    # Operators consume alerts via journalctl + alerts.jsonl; the
+    # unit's failure state should mean "the detector itself broke."
+    return 0
 
 
 if __name__ == "__main__":