diff --git a/tests/test_fleet_health.py b/tests/test_fleet_health.py
index 0f90e48..43b5be7 100644
--- a/tests/test_fleet_health.py
+++ b/tests/test_fleet_health.py
@@ -300,3 +300,39 @@ def test_main_returns_zero_when_no_alerts(tmp_path: Path,
     rc = fh.main(["--index-path", str(idx),
                   "--alerts-path", str(alerts)])
     assert rc == 0
+
+
+def test_main_returns_zero_even_when_alerts_emitted(tmp_path: Path,
+        monkeypatch: pytest.MonkeyPatch) -> None:
+    """The detector exits 0 even when alerts are present — the alert
+    IS the signal (via alerts.jsonl + WARNING log), not the unit's
+    success state. systemd showing the unit as 'failed' on every
+    detection would be misleading: the detector itself is working
+    correctly."""
+    idx = tmp_path / "index.jsonl"
+    now = time.time()
+    # Active host but silent for >30 min → triggers 'silent' alert.
+    _write_index(idx, [
+        {"host_id": "lab1", "received_at_wall": _iso(now - 60 * 60)},
+    ])
+    alerts = tmp_path / "alerts.jsonl"
+
+    real_run = fh.subprocess.run
+
+    def fake_run(cmd, *args, **kwargs):
+        if cmd and cmd[0] == "journalctl":
+            class R:
+                returncode = 0
+                stdout = ""
+                stderr = ""
+            return R()
+        return real_run(cmd, *args, **kwargs)
+
+    monkeypatch.setattr(fh.subprocess, "run", fake_run)
+    rc = fh.main(["--index-path", str(idx),
+                  "--alerts-path", str(alerts)])
+    assert rc == 0
+    # The alert WAS emitted, just not via exit code.
+    assert alerts.exists()
+    rows = alerts.read_text().splitlines()
+    assert len(rows) >= 1
diff --git a/tools/check_fleet_health.py b/tools/check_fleet_health.py
index a23b432..9567cd7 100644
--- a/tools/check_fleet_health.py
+++ b/tools/check_fleet_health.py
@@ -355,7 +355,16 @@ def main(argv: list[str] | None = None) -> int:
     if new_count == 0:
         log.info("fleet healthy — %d hosts checked, no new alerts",
                  len(hosts))
-    return 1 if new_count else 0
+    else:
+        log.info("emitted %d new alert(s); see %s",
+                 new_count, args.alerts_path)
+    # Exit 0 even when alerts are emitted: the alert IS the signal,
+    # not the unit's success/failure state. systemd treating "alerts
+    # found" as unit-failed is a UX wart — it makes `systemctl status`
+    # always red on a healthy detector that's just watching a fault.
+    # Operators consume alerts via journalctl + alerts.jsonl; the
+    # unit's failure state should mean "the detector itself broke."
+    return 0
 
 
 if __name__ == "__main__":