shipper: systemd watchdog, quarantine cleanup; doctor surfaces ship errors
Three robustness items off the future-work list:
1. Shipper sd_notify watchdog. Type=notify + WatchdogSec=180. The
daemon sends READY=1 after queue construction and WATCHDOG=1 once
per scan pass via a heartbeat callback wired into run_forever.
Restart=on-failure only catches process death — silent stalls
(deadlock, hung tar subprocess, blocked I/O past timeout) used to
leave a zombie running with the data backlog growing. Now systemd
kills + restarts the daemon if no WATCHDOG=1 arrives within 180s.
Verified end-to-end against systemd via `systemd-run --transient
--property=Type=notify --property=WatchdogSec=10`: unit transitions
to active on READY=1; SIGSTOP'ing the process triggers
`Watchdog timeout (limit 10s)! Killing process N with SIGABRT` at
exactly t+10s, then unit goes failed → restart cycle.
2. Quarantine cleanup. Without an upper bound, data/quarantine/ grew
forever as fatal episodes piled up. New ShipperConfig fields:
quarantine_keep_days = 30 # opt-out: 0 disables
quarantine_cleanup_interval_s = 3600 # gate so 5s tick doesn't
# statx() the whole tree
Cleanup runs at the start of run_once() but is gated to once per
hour. Removed entries logged.
3. Doctor surfaces shipping errors. Tails 10 minutes of cis490-shipper
journal and surfaces 412/400/transient patterns as red/yellow rows
with the canonical fix command. An on-device agent running
cis490_doctor.py now sees one line ("12 ship(s) rejected as
out-of-window") instead of needing to grep the journal.
Tests: 200/200 (was 188). New coverage: heartbeat callback fires +
survives exceptions; quarantine cleanup respects keep_days, gate, and
opt-out; doctor parser correctly classifies 412/400/transient/clean/
empty/journalctl-denied; both error classes prioritise 412 (more
actionable) when present together.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ed5e6b0581
commit
f9b2e5c4e6
7 changed files with 527 additions and 3 deletions
|
|
@ -7,7 +7,14 @@ Wants=network-online.target
|
||||||
Requires=wg-quick@wg0.service
|
Requires=wg-quick@wg0.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
# Type=notify so systemd waits for sd_notify("READY=1") before
|
||||||
|
# considering the unit started, and so WatchdogSec= can kick in.
|
||||||
|
# Without this, Restart=on-failure only catches process crashes —
|
||||||
|
# silent stalls (deadlock, blocked I/O past timeout, hung tar
|
||||||
|
# subprocess) leave a zombie running with the data backlog growing.
|
||||||
|
Type=notify
|
||||||
|
NotifyAccess=main
|
||||||
|
WatchdogSec=180
|
||||||
User=cis490
|
User=cis490
|
||||||
Group=cis490
|
Group=cis490
|
||||||
WorkingDirectory=/opt/cis490
|
WorkingDirectory=/opt/cis490
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,9 @@ from __future__ import annotations
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import signal
|
import signal
|
||||||
|
import socket
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
@ -22,6 +24,30 @@ from .queue import ShipperQueue
|
||||||
from .transport import ShipperTransport
|
from .transport import ShipperTransport
|
||||||
|
|
||||||
|
|
||||||
|
def _sd_notify(msg: str) -> None:
|
||||||
|
"""Send ``msg`` to systemd's notify socket. No-op when running
|
||||||
|
outside systemd (NOTIFY_SOCKET unset) so the same binary works
|
||||||
|
fine under `--once`, manual invocation, or tests.
|
||||||
|
|
||||||
|
See sd_notify(3). The protocol is one-line key=value messages
|
||||||
|
over an AF_UNIX SOCK_DGRAM socket. We don't need the libsystemd
|
||||||
|
dep — talking to the socket directly is stdlib."""
|
||||||
|
sock_path = os.environ.get("NOTIFY_SOCKET")
|
||||||
|
if not sock_path:
|
||||||
|
return
|
||||||
|
if sock_path.startswith("@"):
|
||||||
|
# Abstract socket: prepend NUL and strip the leading '@'.
|
||||||
|
sock_path = "\0" + sock_path[1:]
|
||||||
|
try:
|
||||||
|
with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as s:
|
||||||
|
s.sendto(msg.encode("ascii"), sock_path)
|
||||||
|
except OSError:
|
||||||
|
# Failing to notify isn't fatal — at worst systemd's
|
||||||
|
# WatchdogSec fires and we get restarted, which is the
|
||||||
|
# behaviour the watchdog exists to provide.
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _setup_logging(level: str) -> None:
|
def _setup_logging(level: str) -> None:
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=getattr(logging, level.upper(), logging.INFO),
|
level=getattr(logging, level.upper(), logging.INFO),
|
||||||
|
|
@ -98,7 +124,16 @@ def main(argv: list[str] | None = None) -> int:
|
||||||
"shipper starting: host_id=%s data_root=%s receiver=%s",
|
"shipper starting: host_id=%s data_root=%s receiver=%s",
|
||||||
cfg.host_id, cfg.data_root, cfg.receiver.url,
|
cfg.host_id, cfg.data_root, cfg.receiver.url,
|
||||||
)
|
)
|
||||||
queue.run_forever(stop_check=lambda: stopping)
|
# Tell systemd we're ready to take work — gates Type=notify in
|
||||||
|
# the unit file. The systemd unit's WatchdogSec= will then expect
|
||||||
|
# WATCHDOG=1 messages at least every <WatchdogSec> seconds; a
|
||||||
|
# missed one means stalled-mid-loop and triggers a kill+restart.
|
||||||
|
_sd_notify("READY=1")
|
||||||
|
queue.run_forever(
|
||||||
|
stop_check=lambda: stopping,
|
||||||
|
heartbeat=lambda: _sd_notify("WATCHDOG=1"),
|
||||||
|
)
|
||||||
|
_sd_notify("STOPPING=1")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -33,6 +33,15 @@ class ShipperConfig:
|
||||||
backoff_seconds: tuple[float, ...] = (1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 60.0, 120.0, 300.0)
|
backoff_seconds: tuple[float, ...] = (1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 60.0, 120.0, 300.0)
|
||||||
# Local retention before pruning data/shipped/.
|
# Local retention before pruning data/shipped/.
|
||||||
keep_local_for_days: int = 7
|
keep_local_for_days: int = 7
|
||||||
|
# Quarantine retention. Episodes that the receiver permanently
|
||||||
|
# rejected (400/412) sit here as evidence; without an upper bound
|
||||||
|
# they grow forever. Set to 0 to disable cleanup (operator
|
||||||
|
# responsibility).
|
||||||
|
quarantine_keep_days: int = 30
|
||||||
|
# How often the quarantine cleanup pass actually runs. Gated
|
||||||
|
# because a 5-second scan tick checking mtimes against a
|
||||||
|
# 30-day-old cutoff is wasteful — once an hour is plenty.
|
||||||
|
quarantine_cleanup_interval_s: float = 3600.0
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def episodes_dir(self) -> Path:
|
def episodes_dir(self) -> Path:
|
||||||
|
|
@ -88,6 +97,7 @@ class ShipperConfig:
|
||||||
scan_interval_s=float(data.get("shipper", {}).get("scan_interval_s", 5.0)),
|
scan_interval_s=float(data.get("shipper", {}).get("scan_interval_s", 5.0)),
|
||||||
request_timeout_s=float(data.get("shipper", {}).get("request_timeout_s", 60.0)),
|
request_timeout_s=float(data.get("shipper", {}).get("request_timeout_s", 60.0)),
|
||||||
keep_local_for_days=int(retention.get("keep_local_for_days", 7)),
|
keep_local_for_days=int(retention.get("keep_local_for_days", 7)),
|
||||||
|
quarantine_keep_days=int(retention.get("quarantine_keep_days", 30)),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -65,6 +65,10 @@ class ShipperQueue:
|
||||||
cfg.outbox_dir.mkdir(parents=True, exist_ok=True)
|
cfg.outbox_dir.mkdir(parents=True, exist_ok=True)
|
||||||
cfg.shipped_dir.mkdir(parents=True, exist_ok=True)
|
cfg.shipped_dir.mkdir(parents=True, exist_ok=True)
|
||||||
cfg.quarantine_dir.mkdir(parents=True, exist_ok=True)
|
cfg.quarantine_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
# Last wall-clock time we walked quarantine/ for cleanup. Set
|
||||||
|
# to 0.0 so the first pass always sweeps (covers daemon
|
||||||
|
# startup after a long downtime).
|
||||||
|
self._last_quarantine_cleanup_at: float = 0.0
|
||||||
|
|
||||||
# ---- main entry point ---------------------------------------------
|
# ---- main entry point ---------------------------------------------
|
||||||
|
|
||||||
|
|
@ -77,6 +81,10 @@ class ShipperQueue:
|
||||||
# Bounded by the number of files in outbox/, so cheap to do
|
# Bounded by the number of files in outbox/, so cheap to do
|
||||||
# every pass.
|
# every pass.
|
||||||
self._sweep_outbox()
|
self._sweep_outbox()
|
||||||
|
# Drop quarantine entries older than keep_days — gated on a
|
||||||
|
# once-per-hour check so the 5-second scan tick doesn't
|
||||||
|
# statx() the whole quarantine tree on every pass.
|
||||||
|
self._maybe_cleanup_quarantine()
|
||||||
ready = self._ready_episodes()
|
ready = self._ready_episodes()
|
||||||
scanned = len(ready)
|
scanned = len(ready)
|
||||||
shipped = 0
|
shipped = 0
|
||||||
|
|
@ -138,12 +146,36 @@ class ShipperQueue:
|
||||||
fatal=fatal,
|
fatal=fatal,
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_forever(self, *, stop_check=lambda: False) -> None:
|
def run_forever(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
stop_check=lambda: False,
|
||||||
|
heartbeat=lambda: None,
|
||||||
|
) -> None:
|
||||||
|
"""Long-running scan loop.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
stop_check: returns True when the daemon should exit (SIGTERM
|
||||||
|
handler). Checked between passes and inside the inter-pass
|
||||||
|
sleep so SIGTERM isn't blocked for up to scan_interval_s.
|
||||||
|
heartbeat: invoked once per completed pass. Wired to
|
||||||
|
sd_notify("WATCHDOG=1") in production so systemd can
|
||||||
|
kill+restart the daemon if a pass hangs longer than the
|
||||||
|
unit's WatchdogSec — catches silent stalls (deadlock,
|
||||||
|
blocked I/O past timeout) that Restart=on-failure misses.
|
||||||
|
"""
|
||||||
while not stop_check():
|
while not stop_check():
|
||||||
try:
|
try:
|
||||||
self.run_once()
|
self.run_once()
|
||||||
except Exception:
|
except Exception:
|
||||||
log.exception("scan pass crashed; sleeping anyway")
|
log.exception("scan pass crashed; sleeping anyway")
|
||||||
|
try:
|
||||||
|
heartbeat()
|
||||||
|
except Exception:
|
||||||
|
# A heartbeat failure mustn't take down the daemon —
|
||||||
|
# if the watchdog wire is broken, we want at least the
|
||||||
|
# ship loop to keep running.
|
||||||
|
log.exception("heartbeat callback failed")
|
||||||
# Coarse sleep: we don't need precise scheduling and we
|
# Coarse sleep: we don't need precise scheduling and we
|
||||||
# don't want a tight loop on errors.
|
# don't want a tight loop on errors.
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
|
|
@ -163,6 +195,42 @@ class ShipperQueue:
|
||||||
out.append(ep)
|
out.append(ep)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
def _maybe_cleanup_quarantine(self) -> None:
|
||||||
|
"""Walk quarantine/ and remove episodes older than keep_days.
|
||||||
|
|
||||||
|
Cheap on a daemon that's been running a while because the
|
||||||
|
once-per-hour gate prevents the scan tick from statx()-ing
|
||||||
|
the whole tree every 5s. On the first pass after startup, the
|
||||||
|
gate's 0.0 sentinel means we always sweep — that catches a
|
||||||
|
daemon that was offline through a backlog accumulation."""
|
||||||
|
keep_days = self.cfg.quarantine_keep_days
|
||||||
|
if keep_days <= 0:
|
||||||
|
return
|
||||||
|
now = time.time()
|
||||||
|
if (now - self._last_quarantine_cleanup_at
|
||||||
|
< self.cfg.quarantine_cleanup_interval_s):
|
||||||
|
return
|
||||||
|
self._last_quarantine_cleanup_at = now
|
||||||
|
cutoff = now - (keep_days * 86400)
|
||||||
|
removed = 0
|
||||||
|
if not self.cfg.quarantine_dir.exists():
|
||||||
|
return
|
||||||
|
for ep in self.cfg.quarantine_dir.iterdir():
|
||||||
|
if not ep.is_dir():
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
# Use mtime — quarantine dirs are written once
|
||||||
|
# (the rename + the reason file), so mtime tracks
|
||||||
|
# quarantine age, not the original episode age.
|
||||||
|
if ep.stat().st_mtime < cutoff:
|
||||||
|
shutil.rmtree(ep, ignore_errors=True)
|
||||||
|
removed += 1
|
||||||
|
except OSError:
|
||||||
|
log.exception("quarantine cleanup failed for %s", ep.name)
|
||||||
|
if removed:
|
||||||
|
log.info("quarantine cleanup: removed %d episode(s) older than %d days",
|
||||||
|
removed, keep_days)
|
||||||
|
|
||||||
def _sweep_outbox(self) -> None:
|
def _sweep_outbox(self) -> None:
|
||||||
"""Delete tarballs in outbox/ that have no matching episode dir.
|
"""Delete tarballs in outbox/ that have no matching episode dir.
|
||||||
|
|
||||||
|
|
|
||||||
142
tests/test_doctor_shipping.py
Normal file
142
tests/test_doctor_shipping.py
Normal file
|
|
@ -0,0 +1,142 @@
|
||||||
|
"""Unit tests for cis490_doctor's recent-shipping-errors parser.
|
||||||
|
|
||||||
|
The parser scans `journalctl -u cis490-shipper` output for the same
|
||||||
|
strings the receiver/shipper actually emit. We monkeypatch `_run` so
|
||||||
|
the test doesn't need a real systemd journal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
spec = importlib.util.spec_from_file_location(
|
||||||
|
"cis490_doctor", REPO_ROOT / "tools" / "cis490_doctor.py"
|
||||||
|
)
|
||||||
|
doctor = importlib.util.module_from_spec(spec)
|
||||||
|
sys.modules["cis490_doctor"] = doctor
|
||||||
|
spec.loader.exec_module(doctor)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_journal(monkeypatch, output: str, rc: int = 0) -> None:
|
||||||
|
"""Stub `cis490_doctor._run` to return ``output`` for journalctl
|
||||||
|
calls, real subprocess for everything else."""
|
||||||
|
real = doctor._run
|
||||||
|
|
||||||
|
def fake(cmd):
|
||||||
|
if cmd and cmd[0] == "journalctl":
|
||||||
|
return rc, output, ""
|
||||||
|
return real(cmd)
|
||||||
|
monkeypatch.setattr(doctor, "_run", fake)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_for(report: doctor.Report, name_substr: str) -> doctor.Check | None:
|
||||||
|
for c in report.checks:
|
||||||
|
if name_substr in c.name:
|
||||||
|
return c
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def test_412_pattern_surfaces_as_fail(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
"""5 minutes of 412 commit-rejected logs is the exact symptom we
|
||||||
|
saw on k-gamingcom for 5+ hours pre-fix. Doctor must point the
|
||||||
|
operator at the canonical pull+reinstall path."""
|
||||||
|
log = "\n".join(
|
||||||
|
f"ship 01EP{i:02d} -> 412 commit-rejected. your_commit=abc..."
|
||||||
|
for i in range(20)
|
||||||
|
)
|
||||||
|
_run_journal(monkeypatch, log)
|
||||||
|
report = doctor.Report(role="lab-host")
|
||||||
|
doctor.check_recent_shipping_errors(report)
|
||||||
|
c = _check_for(report, "recent ship results")
|
||||||
|
assert c is not None and c.status == "fail"
|
||||||
|
assert "out-of-window" in c.detail
|
||||||
|
assert "git pull origin main" in c.fix
|
||||||
|
assert "install-lab-host.sh" in c.fix
|
||||||
|
|
||||||
|
|
||||||
|
def test_400_missing_commit_pattern(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
log = "\n".join(
|
||||||
|
f'episode 01EP{i:02d} -> fatal (400) "missing X-Cis490-Code-Commit header"'
|
||||||
|
for i in range(5)
|
||||||
|
)
|
||||||
|
_run_journal(monkeypatch, log)
|
||||||
|
report = doctor.Report(role="lab-host")
|
||||||
|
doctor.check_recent_shipping_errors(report)
|
||||||
|
c = _check_for(report, "recent ship results")
|
||||||
|
assert c is not None and c.status == "fail"
|
||||||
|
assert "missing-commit-header" in c.detail
|
||||||
|
assert "install-lab-host.sh" in c.fix
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_journal_is_ok(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
log = "\n".join(
|
||||||
|
f"ship 01EP{i:02d} -> stored (201) " for i in range(10)
|
||||||
|
)
|
||||||
|
_run_journal(monkeypatch, log)
|
||||||
|
report = doctor.Report(role="lab-host")
|
||||||
|
doctor.check_recent_shipping_errors(report)
|
||||||
|
c = _check_for(report, "recent ship results")
|
||||||
|
assert c is not None and c.status == "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_journal_is_ok_idle(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
"""No log output ≠ broken — could be a daemon that finished
|
||||||
|
everything and is idle."""
|
||||||
|
_run_journal(monkeypatch, "")
|
||||||
|
report = doctor.Report(role="lab-host")
|
||||||
|
doctor.check_recent_shipping_errors(report)
|
||||||
|
c = _check_for(report, "recent ship results")
|
||||||
|
assert c is not None and c.status == "ok"
|
||||||
|
assert "idle" in c.detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_journalctl_unavailable_skips(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
"""Doctor often runs as the unprivileged user and reading the
|
||||||
|
system journal can need systemd-journal group membership.
|
||||||
|
journalctl returning non-zero (perm denied / not installed)
|
||||||
|
should produce a `skip` row, not noise."""
|
||||||
|
_run_journal(monkeypatch, "", rc=1)
|
||||||
|
report = doctor.Report(role="lab-host")
|
||||||
|
doctor.check_recent_shipping_errors(report)
|
||||||
|
c = _check_for(report, "recent log scan")
|
||||||
|
assert c is not None and c.status == "skip"
|
||||||
|
|
||||||
|
|
||||||
|
def test_transient_failures_warn_at_threshold(
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
) -> None:
|
||||||
|
"""A pile of transient (5xx/network) failures isn't fatal but
|
||||||
|
it's worth flagging — the receiver might be down or WG might be
|
||||||
|
flaking. The threshold (>5) is tuned to ignore an occasional
|
||||||
|
blip."""
|
||||||
|
log = "\n".join(
|
||||||
|
f"ship 01EP{i:02d} -> transient: timeout" for i in range(10)
|
||||||
|
)
|
||||||
|
_run_journal(monkeypatch, log)
|
||||||
|
report = doctor.Report(role="lab-host")
|
||||||
|
doctor.check_recent_shipping_errors(report)
|
||||||
|
c = _check_for(report, "recent ship results")
|
||||||
|
assert c is not None and c.status == "warn"
|
||||||
|
assert "transient" in c.detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_412_takes_priority_over_400(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
"""If both error classes are present, 412 (out-of-date code)
|
||||||
|
is the more actionable diagnosis — fixing it fixes both, since
|
||||||
|
a re-install also writes VERSION."""
|
||||||
|
log = (
|
||||||
|
"ship 01OLD -> 412 commit-rejected\n"
|
||||||
|
'ship 01OLD2 -> fatal (400) "missing X-Cis490-Code-Commit header"\n'
|
||||||
|
)
|
||||||
|
_run_journal(monkeypatch, log)
|
||||||
|
report = doctor.Report(role="lab-host")
|
||||||
|
doctor.check_recent_shipping_errors(report)
|
||||||
|
c = _check_for(report, "recent ship results")
|
||||||
|
assert c is not None and c.status == "fail"
|
||||||
|
assert "out-of-window" in c.detail
|
||||||
|
|
@ -320,6 +320,173 @@ def test_run_once_handles_transient_when_receiver_is_down(tmp_path: Path) -> Non
|
||||||
assert (cfg.outbox_dir / "01DOWN.tar.zst").exists()
|
assert (cfg.outbox_dir / "01DOWN.tar.zst").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_cleanup_removes_old_entries(tmp_path: Path) -> None:
|
||||||
|
"""Without an upper bound, quarantine/ grows forever. The cleanup
|
||||||
|
pass walks it once per cleanup_interval and drops anything past
|
||||||
|
keep_days — bounded by directory size since it just statx()s
|
||||||
|
each entry's mtime.
|
||||||
|
|
||||||
|
We run with cleanup_interval_s=0 so the gate fires on every pass,
|
||||||
|
and overload `os.utime` to age a fixture entry past the cutoff
|
||||||
|
without sleeping for real time."""
|
||||||
|
import os as _os
|
||||||
|
import time as _time
|
||||||
|
|
||||||
|
cfg = ShipperConfig(
|
||||||
|
host_id="lab1",
|
||||||
|
data_root=tmp_path / "lab-data",
|
||||||
|
receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
|
||||||
|
scan_interval_s=0.05,
|
||||||
|
quarantine_keep_days=7,
|
||||||
|
quarantine_cleanup_interval_s=0.0, # always run on every pass
|
||||||
|
)
|
||||||
|
|
||||||
|
class _NoTransport:
|
||||||
|
def ship_tarball(self, *a, **kw): raise AssertionError("not used")
|
||||||
|
|
||||||
|
queue = ShipperQueue(cfg, _NoTransport())
|
||||||
|
|
||||||
|
old = cfg.quarantine_dir / "01OLD"
|
||||||
|
old.mkdir()
|
||||||
|
(old / "meta.json").write_text("{}")
|
||||||
|
new = cfg.quarantine_dir / "01NEW"
|
||||||
|
new.mkdir()
|
||||||
|
(new / "meta.json").write_text("{}")
|
||||||
|
|
||||||
|
# Backdate the OLD entry by 8 days. The directory's own mtime
|
||||||
|
# is what cleanup checks.
|
||||||
|
eight_days_ago = _time.time() - (8 * 86400)
|
||||||
|
_os.utime(old, (eight_days_ago, eight_days_ago))
|
||||||
|
|
||||||
|
queue._maybe_cleanup_quarantine()
|
||||||
|
|
||||||
|
assert not old.exists(), "8-day-old entry should be cleaned up"
|
||||||
|
assert new.exists(), "fresh entry should survive"
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_cleanup_disabled_when_keep_days_zero(tmp_path: Path) -> None:
|
||||||
|
import os as _os
|
||||||
|
import time as _time
|
||||||
|
|
||||||
|
cfg = ShipperConfig(
|
||||||
|
host_id="lab1",
|
||||||
|
data_root=tmp_path / "lab-data",
|
||||||
|
receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
|
||||||
|
scan_interval_s=0.05,
|
||||||
|
quarantine_keep_days=0, # disabled
|
||||||
|
quarantine_cleanup_interval_s=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
class _NoTransport:
|
||||||
|
def ship_tarball(self, *a, **kw): raise AssertionError("not used")
|
||||||
|
|
||||||
|
queue = ShipperQueue(cfg, _NoTransport())
|
||||||
|
|
||||||
|
old = cfg.quarantine_dir / "01OLD"
|
||||||
|
old.mkdir()
|
||||||
|
eight_days_ago = _time.time() - (8 * 86400)
|
||||||
|
_os.utime(old, (eight_days_ago, eight_days_ago))
|
||||||
|
|
||||||
|
queue._maybe_cleanup_quarantine()
|
||||||
|
assert old.exists(), "cleanup must be a no-op when keep_days=0"
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_cleanup_respects_interval_gate(tmp_path: Path) -> None:
|
||||||
|
"""The interval gate prevents the 5s scan tick from statx()-ing
|
||||||
|
the whole quarantine tree on every pass."""
|
||||||
|
import os as _os
|
||||||
|
import time as _time
|
||||||
|
|
||||||
|
cfg = ShipperConfig(
|
||||||
|
host_id="lab1",
|
||||||
|
data_root=tmp_path / "lab-data",
|
||||||
|
receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
|
||||||
|
scan_interval_s=0.05,
|
||||||
|
quarantine_keep_days=7,
|
||||||
|
quarantine_cleanup_interval_s=3600.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
class _NoTransport:
|
||||||
|
def ship_tarball(self, *a, **kw): raise AssertionError("not used")
|
||||||
|
|
||||||
|
queue = ShipperQueue(cfg, _NoTransport())
|
||||||
|
|
||||||
|
# First pass: gate's 0.0 sentinel means we sweep.
|
||||||
|
queue._maybe_cleanup_quarantine()
|
||||||
|
first_at = queue._last_quarantine_cleanup_at
|
||||||
|
assert first_at > 0
|
||||||
|
|
||||||
|
# Stage an old entry AFTER the first sweep. The gate should
|
||||||
|
# block the next sweep until cleanup_interval_s has elapsed.
|
||||||
|
old = cfg.quarantine_dir / "01OLD"
|
||||||
|
old.mkdir()
|
||||||
|
_os.utime(old, (_time.time() - 8 * 86400,) * 2)
|
||||||
|
|
||||||
|
queue._maybe_cleanup_quarantine()
|
||||||
|
assert old.exists(), "gate should defer the next sweep"
|
||||||
|
assert queue._last_quarantine_cleanup_at == first_at
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_forever_calls_heartbeat(tmp_path: Path) -> None:
|
||||||
|
"""The heartbeat callback fires once per completed pass. In
|
||||||
|
production this is wired to sd_notify(WATCHDOG=1) so systemd's
|
||||||
|
WatchdogSec catches a hung scan loop."""
|
||||||
|
import threading
|
||||||
|
|
||||||
|
cfg = ShipperConfig(
|
||||||
|
host_id="lab1",
|
||||||
|
data_root=tmp_path / "lab-data",
|
||||||
|
receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
|
||||||
|
scan_interval_s=0.05,
|
||||||
|
)
|
||||||
|
|
||||||
|
class _NoTransport:
|
||||||
|
def ship_tarball(self, *a, **kw): raise AssertionError("not used")
|
||||||
|
|
||||||
|
queue = ShipperQueue(cfg, _NoTransport())
|
||||||
|
|
||||||
|
beats = []
|
||||||
|
stop = threading.Event()
|
||||||
|
|
||||||
|
def _heartbeat() -> None:
|
||||||
|
beats.append(time.monotonic())
|
||||||
|
if len(beats) >= 3:
|
||||||
|
stop.set()
|
||||||
|
|
||||||
|
queue.run_forever(stop_check=stop.is_set, heartbeat=_heartbeat)
|
||||||
|
assert len(beats) >= 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_forever_survives_heartbeat_exception(tmp_path: Path) -> None:
|
||||||
|
"""A broken heartbeat (e.g. NOTIFY_SOCKET vanished) must not take
|
||||||
|
down the daemon — the loss of watchdog is tolerable; the loss
|
||||||
|
of the ship loop is not."""
|
||||||
|
cfg = ShipperConfig(
|
||||||
|
host_id="lab1",
|
||||||
|
data_root=tmp_path / "lab-data",
|
||||||
|
receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
|
||||||
|
scan_interval_s=0.05,
|
||||||
|
)
|
||||||
|
|
||||||
|
class _NoTransport:
|
||||||
|
def ship_tarball(self, *a, **kw): raise AssertionError("not used")
|
||||||
|
|
||||||
|
queue = ShipperQueue(cfg, _NoTransport())
|
||||||
|
|
||||||
|
pass_count = [0]
|
||||||
|
|
||||||
|
def _stop() -> bool:
|
||||||
|
return pass_count[0] >= 3
|
||||||
|
|
||||||
|
def _broken_heartbeat() -> None:
|
||||||
|
pass_count[0] += 1
|
||||||
|
raise RuntimeError("simulated NOTIFY_SOCKET failure")
|
||||||
|
|
||||||
|
# Should NOT raise.
|
||||||
|
queue.run_forever(stop_check=_stop, heartbeat=_broken_heartbeat)
|
||||||
|
assert pass_count[0] >= 3
|
||||||
|
|
||||||
|
|
||||||
def test_run_once_sweeps_orphaned_outbox_tarball(tmp_path: Path, receiver) -> None:
|
def test_run_once_sweeps_orphaned_outbox_tarball(tmp_path: Path, receiver) -> None:
|
||||||
"""A tarball in outbox/ with no matching episode dir should get
|
"""A tarball in outbox/ with no matching episode dir should get
|
||||||
cleaned up at the start of the next scan. The lifecycle invariant
|
cleaned up at the start of the next scan. The lifecycle invariant
|
||||||
|
|
|
||||||
|
|
@ -354,6 +354,100 @@ def check_services(report: Report, role: str) -> None:
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# checks — recent shipping errors (lab-host)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def check_recent_shipping_errors(report: Report) -> None:
|
||||||
|
"""Tail the last 10 minutes of cis490-shipper logs and surface
|
||||||
|
any 400/412 patterns. The shipper logs every PUT outcome, so a
|
||||||
|
fresh stream of fatals means the lab host's code is older than
|
||||||
|
the receiver's allow-list — exactly the loop our gate-cutover
|
||||||
|
fixes were meant to prevent. Surfacing here gives the operator
|
||||||
|
a one-line "what's broken" instead of having to grep the journal.
|
||||||
|
|
||||||
|
Skipped silently if journalctl isn't accessible (doctor often
|
||||||
|
runs as the unprivileged user and reading the system journal
|
||||||
|
needs the systemd-journal group)."""
|
||||||
|
rc, out, err = _run([
|
||||||
|
"journalctl", "-u", "cis490-shipper",
|
||||||
|
"--since", "10 minutes ago", "--no-pager", "--output=cat",
|
||||||
|
])
|
||||||
|
if rc != 0:
|
||||||
|
# Permission denied / journalctl not available / unit not
|
||||||
|
# installed yet — none of these merit a red row.
|
||||||
|
report.add(Check(
|
||||||
|
"shipper: recent log scan",
|
||||||
|
"skip",
|
||||||
|
detail=(err.strip().splitlines()[-1] if err else "no output")[:80],
|
||||||
|
))
|
||||||
|
return
|
||||||
|
|
||||||
|
lines = out.splitlines()
|
||||||
|
if not lines:
|
||||||
|
report.add(Check(
|
||||||
|
"shipper: recent ship results",
|
||||||
|
"ok",
|
||||||
|
detail="no output in last 10 minutes (daemon may be idle)",
|
||||||
|
))
|
||||||
|
return
|
||||||
|
|
||||||
|
# Match what queue.py / app.py actually log. We're conservative:
|
||||||
|
# only count lines that explicitly identify a ship outcome so
|
||||||
|
# we don't false-positive on unrelated 400s the receiver might
|
||||||
|
# log (e.g. health-check probes).
|
||||||
|
fatal_400 = sum(1 for ln in lines if "missing X-Cis490-Code-Commit" in ln)
|
||||||
|
fatal_412 = sum(1 for ln in lines if "412 commit-rejected" in ln
|
||||||
|
or "code commit rejected" in ln)
|
||||||
|
other_fatal = sum(1 for ln in lines
|
||||||
|
if "ship " in ln and "fatal" in ln
|
||||||
|
and "missing X-Cis490-Code-Commit" not in ln
|
||||||
|
and "commit rejected" not in ln)
|
||||||
|
transient = sum(1 for ln in lines
|
||||||
|
if "ship " in ln and "transient" in ln)
|
||||||
|
|
||||||
|
if fatal_412 > 0:
|
||||||
|
report.add(Check(
|
||||||
|
"shipper: recent ship results",
|
||||||
|
"fail",
|
||||||
|
detail=f"{fatal_412} ship(s) rejected as out-of-window in last 10 min",
|
||||||
|
fix=("cd /opt/cis490 && sudo -u cis490 git pull origin main && "
|
||||||
|
"sudo /opt/cis490/scripts/install-lab-host.sh "
|
||||||
|
"# pulls new code + drains stale queue + restarts daemon"),
|
||||||
|
))
|
||||||
|
elif fatal_400 > 0:
|
||||||
|
report.add(Check(
|
||||||
|
"shipper: recent ship results",
|
||||||
|
"fail",
|
||||||
|
detail=(
|
||||||
|
f"{fatal_400} ship(s) rejected as missing-commit-header — "
|
||||||
|
"orchestrator is emitting episodes without code_version"
|
||||||
|
),
|
||||||
|
fix=("sudo /opt/cis490/scripts/install-lab-host.sh "
|
||||||
|
"# rewrites VERSION + restarts orchestrator"),
|
||||||
|
))
|
||||||
|
elif other_fatal > 0:
|
||||||
|
report.add(Check(
|
||||||
|
"shipper: recent ship results",
|
||||||
|
"warn",
|
||||||
|
detail=f"{other_fatal} fatal ship(s) in last 10 min (other 4xx)",
|
||||||
|
fix="sudo journalctl -u cis490-shipper --since '10 minutes ago' "
|
||||||
|
"| grep -E 'ship .*fatal'",
|
||||||
|
))
|
||||||
|
elif transient > 5:
|
||||||
|
report.add(Check(
|
||||||
|
"shipper: recent ship results",
|
||||||
|
"warn",
|
||||||
|
detail=f"{transient} transient failures in last 10 min — receiver reachable?",
|
||||||
|
fix="sudo /opt/cis490/.venv/bin/python -m shipper "
|
||||||
|
"--config /etc/cis490/lab-host.toml --ping",
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
# At least one line of output, but no error patterns matched.
|
||||||
|
report.add(Check("shipper: recent ship results", "ok"))
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# checks — network (lab-host)
|
# checks — network (lab-host)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -647,6 +741,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||||
check_bridge(report)
|
check_bridge(report)
|
||||||
if not args.no_tier3:
|
if not args.no_tier3:
|
||||||
check_tier3(report)
|
check_tier3(report)
|
||||||
|
check_recent_shipping_errors(report)
|
||||||
check_end_to_end(report)
|
check_end_to_end(report)
|
||||||
|
|
||||||
summary = report.summary()
|
summary = report.summary()
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue