Three robustness items off the future-work list:
1. Shipper sd_notify watchdog. Type=notify + WatchdogSec=180. The
daemon sends READY=1 after queue construction and WATCHDOG=1 once
per scan pass via a heartbeat callback wired into run_forever.
Restart=on-failure only catches process death — silent stalls
(deadlock, hung tar subprocess, blocked I/O past timeout) used to
leave a zombie running with the data backlog growing. Now systemd
kills + restarts the daemon if no WATCHDOG=1 arrives within 180s.
Verified end-to-end against systemd via `systemd-run --transient
--property=Type=notify --property=WatchdogSec=10`: unit transitions
to active on READY=1; SIGSTOP'ing the process triggers
`Watchdog timeout (limit 10s)! Killing process N with SIGABRT` at
exactly t+10s, then unit goes failed → restart cycle.
2. Quarantine cleanup. Without an upper bound, data/quarantine/ grew
forever as fatal episodes piled up. New ShipperConfig fields:
quarantine_keep_days = 30 # opt-out: 0 disables
quarantine_cleanup_interval_s = 3600 # gate so 5s tick doesn't
# statx() the whole tree
Cleanup runs at the start of run_once() but is gated to once per
hour. Removed entries logged.
3. Doctor surfaces shipping errors. Tails 10 minutes of cis490-shipper
journal and surfaces 412/400/transient patterns as red/yellow rows
with the canonical fix command. An on-device agent running
cis490_doctor.py now sees one line ("12 ship(s) rejected as
out-of-window") instead of needing to grep the journal.
Tests: 200/200 (was 188). New coverage: heartbeat callback fires +
survives exceptions; quarantine cleanup respects keep_days, gate, and
opt-out; doctor parser correctly classifies 412/400/transient/clean/
empty/journalctl-denied; both error classes prioritise 412 (more
actionable) when present together.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
142 lines
5 KiB
Python
142 lines
5 KiB
Python
"""Unit tests for cis490_doctor's recent-shipping-errors parser.
|
|
|
|
The parser scans `journalctl -u cis490-shipper` output for the same
|
|
strings the receiver/shipper actually emit. We monkeypatch `_run` so
|
|
the test doesn't need a real systemd journal.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
spec = importlib.util.spec_from_file_location(
|
|
"cis490_doctor", REPO_ROOT / "tools" / "cis490_doctor.py"
|
|
)
|
|
doctor = importlib.util.module_from_spec(spec)
|
|
sys.modules["cis490_doctor"] = doctor
|
|
spec.loader.exec_module(doctor)
|
|
|
|
|
|
def _run_journal(monkeypatch, output: str, rc: int = 0) -> None:
|
|
"""Stub `cis490_doctor._run` to return ``output`` for journalctl
|
|
calls, real subprocess for everything else."""
|
|
real = doctor._run
|
|
|
|
def fake(cmd):
|
|
if cmd and cmd[0] == "journalctl":
|
|
return rc, output, ""
|
|
return real(cmd)
|
|
monkeypatch.setattr(doctor, "_run", fake)
|
|
|
|
|
|
def _check_for(report: doctor.Report, name_substr: str) -> doctor.Check | None:
|
|
for c in report.checks:
|
|
if name_substr in c.name:
|
|
return c
|
|
return None
|
|
|
|
|
|
def test_412_pattern_surfaces_as_fail(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
"""5 minutes of 412 commit-rejected logs is the exact symptom we
|
|
saw on k-gamingcom for 5+ hours pre-fix. Doctor must point the
|
|
operator at the canonical pull+reinstall path."""
|
|
log = "\n".join(
|
|
f"ship 01EP{i:02d} -> 412 commit-rejected. your_commit=abc..."
|
|
for i in range(20)
|
|
)
|
|
_run_journal(monkeypatch, log)
|
|
report = doctor.Report(role="lab-host")
|
|
doctor.check_recent_shipping_errors(report)
|
|
c = _check_for(report, "recent ship results")
|
|
assert c is not None and c.status == "fail"
|
|
assert "out-of-window" in c.detail
|
|
assert "git pull origin main" in c.fix
|
|
assert "install-lab-host.sh" in c.fix
|
|
|
|
|
|
def test_400_missing_commit_pattern(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
log = "\n".join(
|
|
f'episode 01EP{i:02d} -> fatal (400) "missing X-Cis490-Code-Commit header"'
|
|
for i in range(5)
|
|
)
|
|
_run_journal(monkeypatch, log)
|
|
report = doctor.Report(role="lab-host")
|
|
doctor.check_recent_shipping_errors(report)
|
|
c = _check_for(report, "recent ship results")
|
|
assert c is not None and c.status == "fail"
|
|
assert "missing-commit-header" in c.detail
|
|
assert "install-lab-host.sh" in c.fix
|
|
|
|
|
|
def test_clean_journal_is_ok(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
log = "\n".join(
|
|
f"ship 01EP{i:02d} -> stored (201) " for i in range(10)
|
|
)
|
|
_run_journal(monkeypatch, log)
|
|
report = doctor.Report(role="lab-host")
|
|
doctor.check_recent_shipping_errors(report)
|
|
c = _check_for(report, "recent ship results")
|
|
assert c is not None and c.status == "ok"
|
|
|
|
|
|
def test_empty_journal_is_ok_idle(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
"""No log output ≠ broken — could be a daemon that finished
|
|
everything and is idle."""
|
|
_run_journal(monkeypatch, "")
|
|
report = doctor.Report(role="lab-host")
|
|
doctor.check_recent_shipping_errors(report)
|
|
c = _check_for(report, "recent ship results")
|
|
assert c is not None and c.status == "ok"
|
|
assert "idle" in c.detail
|
|
|
|
|
|
def test_journalctl_unavailable_skips(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
"""Doctor often runs as the unprivileged user and reading the
|
|
system journal can need systemd-journal group membership.
|
|
journalctl returning non-zero (perm denied / not installed)
|
|
should produce a `skip` row, not noise."""
|
|
_run_journal(monkeypatch, "", rc=1)
|
|
report = doctor.Report(role="lab-host")
|
|
doctor.check_recent_shipping_errors(report)
|
|
c = _check_for(report, "recent log scan")
|
|
assert c is not None and c.status == "skip"
|
|
|
|
|
|
def test_transient_failures_warn_at_threshold(
|
|
monkeypatch: pytest.MonkeyPatch,
|
|
) -> None:
|
|
"""A pile of transient (5xx/network) failures isn't fatal but
|
|
it's worth flagging — the receiver might be down or WG might be
|
|
flaking. The threshold (>5) is tuned to ignore an occasional
|
|
blip."""
|
|
log = "\n".join(
|
|
f"ship 01EP{i:02d} -> transient: timeout" for i in range(10)
|
|
)
|
|
_run_journal(monkeypatch, log)
|
|
report = doctor.Report(role="lab-host")
|
|
doctor.check_recent_shipping_errors(report)
|
|
c = _check_for(report, "recent ship results")
|
|
assert c is not None and c.status == "warn"
|
|
assert "transient" in c.detail
|
|
|
|
|
|
def test_412_takes_priority_over_400(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
"""If both error classes are present, 412 (out-of-date code)
|
|
is the more actionable diagnosis — fixing it fixes both, since
|
|
a re-install also writes VERSION."""
|
|
log = (
|
|
"ship 01OLD -> 412 commit-rejected\n"
|
|
'ship 01OLD2 -> fatal (400) "missing X-Cis490-Code-Commit header"\n'
|
|
)
|
|
_run_journal(monkeypatch, log)
|
|
report = doctor.Report(role="lab-host")
|
|
doctor.check_recent_shipping_errors(report)
|
|
c = _check_for(report, "recent ship results")
|
|
assert c is not None and c.status == "fail"
|
|
assert "out-of-window" in c.detail
|