CIS490/tests/test_doctor_shipping.py
max f9b2e5c4e6 shipper: systemd watchdog, quarantine cleanup; doctor surfaces ship errors
Three robustness items off the future-work list:

1. Shipper sd_notify watchdog. Type=notify + WatchdogSec=180. The
   daemon sends READY=1 after queue construction and WATCHDOG=1 once
   per scan pass via a heartbeat callback wired into run_forever.
   Restart=on-failure only catches process death — silent stalls
   (deadlock, hung tar subprocess, blocked I/O past timeout) used to
   leave a zombie running with the data backlog growing. Now systemd
   kills + restarts the daemon if no WATCHDOG=1 arrives within 180s.

   Verified end-to-end against systemd via `systemd-run --transient
   --property=Type=notify --property=WatchdogSec=10`: unit transitions
   to active on READY=1; SIGSTOP'ing the process triggers
   `Watchdog timeout (limit 10s)! Killing process N with SIGABRT` at
   exactly t+10s, then unit goes failed → restart cycle.

2. Quarantine cleanup. Without an upper bound, data/quarantine/ grew
   forever as fatal episodes piled up. New ShipperConfig fields:
     quarantine_keep_days = 30           # opt-out: 0 disables
     quarantine_cleanup_interval_s = 3600 # gate so 5s tick doesn't
                                          # statx() the whole tree
   Cleanup runs at the start of run_once() but is gated to once per
   hour. Removed entries logged.

3. Doctor surfaces shipping errors. Tails 10 minutes of cis490-shipper
   journal and surfaces 412/400/transient patterns as red/yellow rows
   with the canonical fix command. An on-device agent running
   cis490_doctor.py now sees one line ("12 ship(s) rejected as
   out-of-window") instead of needing to grep the journal.

Tests: 200/200 (was 188). New coverage: heartbeat callback fires +
survives exceptions; quarantine cleanup respects keep_days, gate, and
opt-out; doctor parser correctly classifies 412/400/transient/clean/
empty/journalctl-denied; both error classes prioritise 412 (more
actionable) when present together.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 12:02:59 -05:00

142 lines
5 KiB
Python

"""Unit tests for cis490_doctor's recent-shipping-errors parser.
The parser scans `journalctl -u cis490-shipper` output for the same
strings the receiver/shipper actually emit. We monkeypatch `_run` so
the test doesn't need a real systemd journal.
"""
from __future__ import annotations
import importlib.util
import sys
from pathlib import Path
import pytest
REPO_ROOT = Path(__file__).resolve().parent.parent
spec = importlib.util.spec_from_file_location(
"cis490_doctor", REPO_ROOT / "tools" / "cis490_doctor.py"
)
doctor = importlib.util.module_from_spec(spec)
sys.modules["cis490_doctor"] = doctor
spec.loader.exec_module(doctor)
def _run_journal(monkeypatch, output: str, rc: int = 0) -> None:
"""Stub `cis490_doctor._run` to return ``output`` for journalctl
calls, real subprocess for everything else."""
real = doctor._run
def fake(cmd):
if cmd and cmd[0] == "journalctl":
return rc, output, ""
return real(cmd)
monkeypatch.setattr(doctor, "_run", fake)
def _check_for(report: doctor.Report, name_substr: str) -> doctor.Check | None:
for c in report.checks:
if name_substr in c.name:
return c
return None
def test_412_pattern_surfaces_as_fail(monkeypatch: pytest.MonkeyPatch) -> None:
"""5 minutes of 412 commit-rejected logs is the exact symptom we
saw on k-gamingcom for 5+ hours pre-fix. Doctor must point the
operator at the canonical pull+reinstall path."""
log = "\n".join(
f"ship 01EP{i:02d} -> 412 commit-rejected. your_commit=abc..."
for i in range(20)
)
_run_journal(monkeypatch, log)
report = doctor.Report(role="lab-host")
doctor.check_recent_shipping_errors(report)
c = _check_for(report, "recent ship results")
assert c is not None and c.status == "fail"
assert "out-of-window" in c.detail
assert "git pull origin main" in c.fix
assert "install-lab-host.sh" in c.fix
def test_400_missing_commit_pattern(monkeypatch: pytest.MonkeyPatch) -> None:
log = "\n".join(
f'episode 01EP{i:02d} -> fatal (400) "missing X-Cis490-Code-Commit header"'
for i in range(5)
)
_run_journal(monkeypatch, log)
report = doctor.Report(role="lab-host")
doctor.check_recent_shipping_errors(report)
c = _check_for(report, "recent ship results")
assert c is not None and c.status == "fail"
assert "missing-commit-header" in c.detail
assert "install-lab-host.sh" in c.fix
def test_clean_journal_is_ok(monkeypatch: pytest.MonkeyPatch) -> None:
log = "\n".join(
f"ship 01EP{i:02d} -> stored (201) " for i in range(10)
)
_run_journal(monkeypatch, log)
report = doctor.Report(role="lab-host")
doctor.check_recent_shipping_errors(report)
c = _check_for(report, "recent ship results")
assert c is not None and c.status == "ok"
def test_empty_journal_is_ok_idle(monkeypatch: pytest.MonkeyPatch) -> None:
"""No log output ≠ broken — could be a daemon that finished
everything and is idle."""
_run_journal(monkeypatch, "")
report = doctor.Report(role="lab-host")
doctor.check_recent_shipping_errors(report)
c = _check_for(report, "recent ship results")
assert c is not None and c.status == "ok"
assert "idle" in c.detail
def test_journalctl_unavailable_skips(monkeypatch: pytest.MonkeyPatch) -> None:
"""Doctor often runs as the unprivileged user and reading the
system journal can need systemd-journal group membership.
journalctl returning non-zero (perm denied / not installed)
should produce a `skip` row, not noise."""
_run_journal(monkeypatch, "", rc=1)
report = doctor.Report(role="lab-host")
doctor.check_recent_shipping_errors(report)
c = _check_for(report, "recent log scan")
assert c is not None and c.status == "skip"
def test_transient_failures_warn_at_threshold(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""A pile of transient (5xx/network) failures isn't fatal but
it's worth flagging — the receiver might be down or WG might be
flaking. The threshold (>5) is tuned to ignore an occasional
blip."""
log = "\n".join(
f"ship 01EP{i:02d} -> transient: timeout" for i in range(10)
)
_run_journal(monkeypatch, log)
report = doctor.Report(role="lab-host")
doctor.check_recent_shipping_errors(report)
c = _check_for(report, "recent ship results")
assert c is not None and c.status == "warn"
assert "transient" in c.detail
def test_412_takes_priority_over_400(monkeypatch: pytest.MonkeyPatch) -> None:
"""If both error classes are present, 412 (out-of-date code)
is the more actionable diagnosis — fixing it fixes both, since
a re-install also writes VERSION."""
log = (
"ship 01OLD -> 412 commit-rejected\n"
'ship 01OLD2 -> fatal (400) "missing X-Cis490-Code-Commit header"\n'
)
_run_journal(monkeypatch, log)
report = doctor.Report(role="lab-host")
doctor.check_recent_shipping_errors(report)
c = _check_for(report, "recent ship results")
assert c is not None and c.status == "fail"
assert "out-of-window" in c.detail