Two pieces of self-monitoring so the maintainer isn't the alarm:
(2) Receiver-side fleet health monitor
cis490-fleet-health.timer runs check_fleet_health.py every 5 min.
Detects three symptoms and writes them to
/var/lib/cis490/alerts.jsonl + a syslog WARNING (greppable / easy
to forward to a notifier):
silent — host shipped in last 24h but has been quiet >30 min
fatal-only — actively shipping but every PUT returns 4xx
unstamped — shipping without X-Cis490-Code-Commit header
Dedup is keyed on (host, symptom, hour-bucket) so a sustained fault
fires once per hour, not every 5 min. 15 unit tests cover the index
parser, three detectors, and dedup.
(3) Per-host doctor snapshots
Lab hosts run cis490-doctor-check.timer once a day (10 min after
boot, then daily with 30-min jitter). The timer runs
cis490_doctor.py --json and PUTs the result to a new endpoint:
PUT /v1/host-health/<host> → /var/lib/cis490/host-health/<host>.json
GET /v1/host-health → aggregate across all hosts
Endpoint is NOT gated by version_gate — sick hosts running stale
code MUST still be able to report sickness. 11 unit tests cover
PUT/GET, atomic-write semantics, bearer auth, and the
not-gated-by-version-gate property.
ship_health_check.py reuses the existing shipper transport (mTLS +
bearer + receiver URL from lab-host.toml) so we don't reimplement
auth.
Both timers wired into install-lab-host.sh — the loop also enables
the previously-added autoupdate + cert-fetch timers, so a single
install run gives a host all four self-healing mechanisms.
Tests: 293 pass (26 new — 15 fleet-health, 11 host-health). 2
pre-existing test_fleet.py failures from the elliott-ThinkPad
merge (667f042) are unrelated to this change.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
114 lines
4 KiB
Python
114 lines
4 KiB
Python
"""Run cis490_doctor.py and PUT the JSON output to the receiver.
|
|
|
|
Triggered by cis490-doctor-check.timer (once a day) or invoked by
|
|
hand. Best-effort: a doctor that exits with red rows still ships its
|
|
output — that's the most useful case.
|
|
|
|
Reuses the shipper's transport (mTLS + bearer + receiver URL from
|
|
lab-host.toml) so we don't reimplement auth.
|
|
|
|
Failure modes:
  - config missing/invalid     → exit 2, log error
  - doctor crashes / times out → exit 2, log error
  - PUT fails (transport error or non-2xx)
                               → exit 1, logged (timer fires next day)
  - PUT succeeds               → exit 0
  - mTLS not yet on disk       → exit 0 (deferred, logged at INFO —
                                 first-boot path)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
|
|
from shipper.config import ShipperConfig
|
|
from shipper.transport import ShipperTransport, _build_ssl_context, _CertNotReadyError
|
|
|
|
|
|
# Module-level logger for this script; handlers/format are configured by
# logging.basicConfig() inside main().
log = logging.getLogger("cis490.shipper.health-check")
|
|
|
|
|
|
def run_doctor(doctor_path: Path, role: str = "lab-host") -> dict:
    """Run the doctor script in JSON mode and return its parsed report.

    Invokes ``<python> <doctor_path> --role <role> --json`` using the
    /opt/cis490 venv interpreter when it exists on disk, otherwise the
    interpreter running this process.

    The doctor's exit code is deliberately ignored: it exits non-zero
    when red rows are present, which is exactly when the snapshot is
    most worth shipping. Success is judged solely on whether stdout
    holds a JSON object.

    Args:
        doctor_path: Path to the doctor script to execute.
        role: Value passed to the doctor's ``--role`` flag.

    Returns:
        The decoded JSON object printed by the doctor.

    Raises:
        RuntimeError: stdout was not parseable JSON, or it parsed to
            something other than a JSON object (list, null, number...).
        subprocess.TimeoutExpired: the doctor ran longer than 120 s.
        OSError: the interpreter could not be launched.
    """
    venv_py = Path("/opt/cis490/.venv/bin/python")
    interpreter = str(venv_py) if venv_py.exists() else sys.executable
    proc = subprocess.run(
        [interpreter, str(doctor_path), "--role", role, "--json"],
        capture_output=True,
        text=True,
        timeout=120,
    )
    # Don't gate on the exit code; gate on whether parseable JSON came out.
    try:
        report = json.loads(proc.stdout)
    except json.JSONDecodeError as e:
        raise RuntimeError(
            f"doctor produced no JSON (exit={proc.returncode}, "
            f"stderr={proc.stderr[:500]!r})"
        ) from e
    # json.loads accepts any JSON value; only an object is a valid report,
    # so refuse to hand anything else to the shipper.
    if not isinstance(report, dict):
        raise RuntimeError(
            f"doctor produced non-object JSON ({type(report).__name__}; "
            f"exit={proc.returncode}, stderr={proc.stderr[:500]!r})"
        )
    return report
|
|
|
|
|
|
def ship_health(cfg: ShipperConfig, snapshot: dict) -> tuple[int, str]:
    """Upload *snapshot* with a PUT to ``/v1/host-health/<host_id>``.

    The TLS verification object comes from the shipper's own
    ``_build_ssl_context``; when that helper signals the certificate
    material is not ready, the upload is skipped and reported as a
    success so the caller exits cleanly.

    Returns:
        ``(exit_code, message)`` — ``(0, "deferred")`` when skipped,
        ``(0, "ok (<status>)")`` on a 2xx response, and ``(1, <reason>)``
        for an httpx error or a non-2xx response.
    """
    try:
        ssl_verify = _build_ssl_context(cfg.receiver)
    except _CertNotReadyError as exc:
        log.info("mTLS material not on disk yet; skipping health ship: %s", exc)
        return 0, "deferred"

    target = f"{cfg.receiver.url}/v1/host-health/{cfg.host_id}"
    request_headers = {
        "X-Lab-Host": cfg.host_id,
        "Content-Type": "application/json",
    }
    # The bearer header is only attached when a token is configured.
    if cfg.receiver.bearer_token:
        request_headers["Authorization"] = f"Bearer {cfg.receiver.bearer_token}"

    try:
        with httpx.Client(verify=ssl_verify, timeout=cfg.request_timeout_s) as client:
            response = client.put(
                target,
                headers=request_headers,
                content=json.dumps(snapshot),
            )
    except httpx.HTTPError as exc:
        return 1, f"HTTP error: {exc}"

    status = response.status_code
    if not (200 <= status < 300):
        # Include a bounded slice of the body to aid diagnosis.
        return 1, f"non-2xx: {status} {response.text[:200]}"
    return 0, f"ok ({status})"
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: load config, run the doctor, ship the snapshot.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process command line.

    Returns:
        Process exit code: 2 when the config cannot be loaded or the
        doctor fails to produce a report; otherwise the code returned
        by ``ship_health`` (0 on success/deferral, 1 on a failed PUT).
    """
    import argparse

    p = argparse.ArgumentParser(prog="cis490-ship-health-check")
    p.add_argument("--config", default="/etc/cis490/lab-host.toml")
    p.add_argument("--doctor",
                   default="/opt/cis490/tools/cis490_doctor.py", type=Path)
    p.add_argument("--log-level", default="INFO")
    args = p.parse_args(argv)
    logging.basicConfig(
        # Unknown level names fall back to INFO rather than aborting.
        level=getattr(logging, args.log_level.upper(), logging.INFO),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    try:
        cfg = ShipperConfig.load(args.config)
    except (FileNotFoundError, ValueError) as e:
        log.error("config error: %s", e)
        return 2

    try:
        snapshot = run_doctor(args.doctor)
    except (RuntimeError, subprocess.TimeoutExpired, OSError) as e:
        # OSError (a superset of FileNotFoundError) covers every way the
        # interpreter can fail to launch, e.g. PermissionError, so all
        # doctor failures map to exit 2 instead of an uncaught traceback.
        log.error("doctor failed: %s", e)
        return 2

    rc, msg = ship_health(cfg, snapshot)
    # A failed ship must be visible at ERROR level; logging it at INFO
    # would hide it whenever the log level is raised above INFO.
    log.log(logging.INFO if rc == 0 else logging.ERROR, "health ship: %s", msg)
    return rc
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|