fleet-health: exit 0 when alerts found (don't mark unit failed)
The detector previously returned 1 on alerts, which made systemd mark cis490-fleet-health.service as 'failed' every tick that found a sick host. That's the wrong UX — a detector finding a fault is working correctly, not crashing. The alert is the signal (via WARNING log + alerts.jsonl); the unit's success state should mean "the detector itself ran cleanly." Test added.

Caught while live-deploying on the Pi: the first run found elliott-thinkpad fatal-only at 943×4xx + 1425×5xx and correctly emitted the alert — but systemd showed the unit red, which would have caused operators to chase the wrong tail.

Side note: the same first run also caught a real bug — pycache for receiver.store on /opt/cis490 was stale after I deployed the new app.py + store.py from main, causing 1464 × 500 responses. Cleared the pycache and the index immediately resumed growing (4465 → 4515 in 30 seconds). The detector earned its keep on the very first cycle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
49eba2fd60
commit
05bf785f0a
2 changed files with 46 additions and 1 deletions
|
|
@ -300,3 +300,39 @@ def test_main_returns_zero_when_no_alerts(tmp_path: Path,
|
|||
rc = fh.main(["--index-path", str(idx),
|
||||
"--alerts-path", str(alerts)])
|
||||
assert rc == 0
|
||||
|
||||
|
||||
def test_main_returns_zero_even_when_alerts_emitted(tmp_path: Path,
                                                    monkeypatch: pytest.MonkeyPatch) -> None:
    """``fh.main`` must return 0 even on a run that writes alerts.

    Alerts are delivered through alerts.jsonl and the WARNING log; the
    process exit status is reserved for "the detector itself ran
    cleanly". A non-zero status on every detection would make systemd
    show the unit as failed while the detector is doing its job.
    """
    index_file = tmp_path / "index.jsonl"
    alerts_file = tmp_path / "alerts.jsonl"

    # The host's only record is an hour old, i.e. it has been silent for
    # more than 30 minutes, which is expected to raise a 'silent' alert.
    an_hour_ago = time.time() - 60 * 60
    _write_index(index_file, [
        {"host_id": "lab1", "received_at_wall": _iso(an_hour_ago)},
    ])

    original_run = fh.subprocess.run

    class _QuietJournal:
        # Stand-in for a successful journalctl call with no output.
        returncode = 0
        stdout = ""
        stderr = ""

    def fake_run(cmd, *args, **kwargs):
        # Only journalctl is stubbed; anything else goes to the real runner.
        if not cmd or cmd[0] != "journalctl":
            return original_run(cmd, *args, **kwargs)
        return _QuietJournal()

    monkeypatch.setattr(fh.subprocess, "run", fake_run)

    exit_code = fh.main(["--index-path", str(index_file),
                         "--alerts-path", str(alerts_file)])

    assert exit_code == 0
    # The alert WAS emitted, just not via exit code.
    assert alerts_file.exists()
    alert_lines = alerts_file.read_text().splitlines()
    assert len(alert_lines) >= 1
|
||||
|
|
|
|||
|
|
@ -355,7 +355,16 @@ def main(argv: list[str] | None = None) -> int:
|
|||
if new_count == 0:
|
||||
log.info("fleet healthy — %d hosts checked, no new alerts",
|
||||
len(hosts))
|
||||
return 1 if new_count else 0
|
||||
else:
|
||||
log.info("emitted %d new alert(s); see %s",
|
||||
new_count, args.alerts_path)
|
||||
# Exit 0 even when alerts are emitted: the alert IS the signal,
|
||||
# not the unit's success/failure state. systemd treating "alerts
|
||||
# found" as unit-failed is a UX wart — it makes `systemctl status`
|
||||
# always red on a healthy detector that's just watching a fault.
|
||||
# Operators consume alerts via journalctl + alerts.jsonl; the
|
||||
# unit's failure state should mean "the detector itself broke."
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue