CIS490/tools/cis490_doctor.py
max f9b2e5c4e6 shipper: systemd watchdog, quarantine cleanup; doctor surfaces ship errors
Three robustness items off the future-work list:

1. Shipper sd_notify watchdog. Type=notify + WatchdogSec=180. The
   daemon sends READY=1 after queue construction and WATCHDOG=1 once
   per scan pass via a heartbeat callback wired into run_forever.
   Restart=on-failure only catches process death — silent stalls
   (deadlock, hung tar subprocess, blocked I/O past timeout) used to
   leave a zombie running with the data backlog growing. Now systemd
   kills + restarts the daemon if no WATCHDOG=1 arrives within 180s.

   Verified end-to-end against systemd via `systemd-run --transient
   --property=Type=notify --property=WatchdogSec=10`: unit transitions
   to active on READY=1; SIGSTOP'ing the process triggers
   `Watchdog timeout (limit 10s)! Killing process N with SIGABRT` at
   exactly t+10s, then unit goes failed → restart cycle.

2. Quarantine cleanup. Without an upper bound, data/quarantine/ grew
   forever as fatal episodes piled up. New ShipperConfig fields:
     quarantine_keep_days = 30           # opt-out: 0 disables
     quarantine_cleanup_interval_s = 3600 # gate so 5s tick doesn't
                                          # statx() the whole tree
   Cleanup runs at the start of run_once() but is gated to once per
   hour. Removed entries logged.

3. Doctor surfaces shipping errors. Tails 10 minutes of cis490-shipper
   journal and surfaces 412/400/transient patterns as red/yellow rows
   with the canonical fix command. An on-device agent running
   cis490_doctor.py now sees one line ("12 ship(s) rejected as
   out-of-window") instead of needing to grep the journal.

Tests: 200/200 (was 188). New coverage: heartbeat callback fires +
survives exceptions; quarantine cleanup respects keep_days, gate, and
opt-out; doctor parser correctly classifies 412/400/transient/clean/
empty/journalctl-denied; both error classes prioritise 412 (more
actionable) when present together.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 12:02:59 -05:00

777 lines
28 KiB
Python

"""``cis490-doctor`` — single-command diagnostic for a lab host or receiver.

Walks the full bring-up stack from the bottom up and prints a
green/yellow/red checklist with the exact command that fixes each
red row. Run this whenever:

- you just cloned the repo and aren't sure what's missing
- you ran install-lab-host.sh but `index.jsonl` on the Pi is empty
- somebody filed an issue saying "shipping isn't working"

Usage:

    uv run python tools/cis490_doctor.py                  # human output
    uv run python tools/cis490_doctor.py --json           # machine-readable
    uv run python tools/cis490_doctor.py --role lab-host  # default
    uv run python tools/cis490_doctor.py --role receiver
    uv run python tools/cis490_doctor.py --no-tier3       # skip optional Tier-3 checks

Exits non-zero if any RED check fails.
"""
from __future__ import annotations
import argparse
import dataclasses
import json
import os
import shutil
import socket
import ssl
import subprocess
import sys
import tomllib
from dataclasses import dataclass, field
from pathlib import Path
# ANSI color codes; auto-disable on non-tty.
def _supports_color() -> bool:
    """Decide whether ANSI colour escapes should be emitted.

    Colour is enabled only when stdout is a terminal and the ``NO_COLOR``
    environment variable is absent (any value, even empty, disables it).
    """
    if not sys.stdout.isatty():
        return False
    return "NO_COLOR" not in os.environ
# Escape sequences used by the renderer; every one collapses to "" when
# colour is unsupported so callers can interpolate them unconditionally.
(
    _ANSI_GREEN,
    _ANSI_YELLOW,
    _ANSI_RED,
    _ANSI_BOLD,
    _ANSI_DIM,
    _ANSI_RESET,
) = (
    ("\033[32m", "\033[33m", "\033[31m", "\033[1m", "\033[2m", "\033[0m")
    if _supports_color()
    else ("",) * 6
)
@dataclass
class Check:
    """One row of the doctor checklist."""

    name: str
    status: str  # "ok" | "warn" | "fail" | "skip"
    detail: str = ""
    fix: str = ""

    def render(self) -> str:
        """Format this row for the terminal.

        The fix command is appended on its own line only for ``fail`` rows.
        An unrecognised ``status`` raises ``KeyError``.
        """
        palette = {
            "ok": (_ANSI_GREEN, "[✓]"),
            "warn": (_ANSI_YELLOW, "[!]"),
            "fail": (_ANSI_RED, "[✗]"),
            "skip": (_ANSI_DIM, "[-]"),
        }
        colour, mark = palette[self.status]
        pieces = [f"{colour}{mark}{_ANSI_RESET}", f" {self.name}"]
        if self.detail:
            pieces.append(f" {_ANSI_DIM}{self.detail}{_ANSI_RESET}")
        if self.fix and self.status == "fail":
            pieces.append(f"\n {_ANSI_BOLD}fix:{_ANSI_RESET} {self.fix}")
        return "".join(pieces)
@dataclass
class Report:
    """Ordered collection of checks for one doctor run."""

    role: str
    checks: list[Check] = field(default_factory=list)

    def add(self, c: Check) -> None:
        """Record a check and, outside JSON mode, print it straight away."""
        self.checks.append(c)
        if _JSON_MODE:
            return
        # Progressive output: if a later check hangs, the operator still
        # sees every row produced so far.
        print(c.render(), flush=True)

    def to_dict(self) -> dict:
        """Return the report as plain dicts/lists suitable for ``json.dump``."""
        rows = [dataclasses.asdict(entry) for entry in self.checks]
        return {"role": self.role, "checks": rows, "summary": self.summary()}

    def summary(self) -> dict:
        """Count checks per status; the four known statuses are always present."""
        tally = dict.fromkeys(("ok", "warn", "fail", "skip"), 0)
        for entry in self.checks:
            tally[entry.status] = tally.get(entry.status, 0) + 1
        return tally
# Module-wide output switch: main() sets this from --json, and Report.add()
# skips its progressive per-check print while it is True.
_JSON_MODE = False
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def _run(cmd: list[str], *, timeout: float = 5.0, cwd: str | None = None) -> tuple[int, str, str]:
try:
p = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, cwd=cwd)
return p.returncode, p.stdout.strip(), p.stderr.strip()
except (FileNotFoundError, subprocess.TimeoutExpired) as e:
return -1, "", str(e)
def _path_exists(p: Path) -> bool:
    """Return whether *p* exists, counting permission-denied as present."""
    try:
        found = p.exists()
    except PermissionError:
        # We were refused a look, which means something is there.
        found = True
    return found
def _size_str(p: Path) -> str:
    """Return the size of *p* in whole MiB, or a hint when ``stat`` fails."""
    try:
        n_bytes = p.stat().st_size
    except OSError:
        return "(stat denied — re-run with sudo for size)"
    mib = n_bytes // (1024 * 1024)
    return f"{mib} MiB"
# ---------------------------------------------------------------------------
# checks — repo
# ---------------------------------------------------------------------------
def check_repo(report: Report, repo_root: Path) -> None:
    """Report on the git checkout at *repo_root*.

    Emits rows for: being a git checkout at all, being on ``main``, having
    a clean tree, and being up to date with the upstream branch. All rows
    are at most ``warn``. If git itself cannot answer (not installed,
    refuses the repository, no commits yet), a single warn row is emitted
    instead of rows built from empty output.
    """
    if not (repo_root / ".git").exists():
        report.add(Check(
            "repo: .git directory present",
            "warn",
            detail=f"running from {repo_root} which isn't a git checkout — fine for /opt/cis490 (cp -aT'd) but not the source clone",
        ))
        return

    def git(*args: str) -> tuple[int, str, str]:
        # Every query targets repo_root regardless of the caller's cwd.
        return _run(["git", "-C", str(repo_root), *args])

    rc_head, head, err_head = git("rev-parse", "--short=8", "HEAD")
    rc_branch, branch, err_branch = git("rev-parse", "--abbrev-ref", "HEAD")
    if rc_head != 0 or rc_branch != 0:
        # Without HEAD/branch the remaining rows would be built from empty
        # strings and read as "@:" — say what actually went wrong instead.
        reason = err_head or err_branch or "git rev-parse failed"
        report.add(Check(
            "repo: git metadata readable",
            "warn",
            detail=reason.splitlines()[0][:200],
        ))
        return

    _, subject, _ = git("log", "-1", "--format=%s")
    detail = f"{branch}@{head}: {subject[:60]}"
    if branch == "main":
        report.add(Check("repo: on main", "ok", detail=detail))
    else:
        report.add(Check(
            "repo: on main",
            "warn",
            detail=detail,
            fix=f"cd {repo_root} && git fetch && git checkout main && git pull",
        ))

    rc_status, dirty, err_status = git("status", "--porcelain")
    if rc_status != 0:
        # Empty output from a failed status must not be read as "clean".
        reason = err_status or "no output"
        report.add(Check(
            "repo: tree clean",
            "warn",
            detail=f"git status failed: {reason.splitlines()[0][:160]}",
        ))
    elif dirty:
        report.add(Check(
            "repo: tree clean",
            "warn",
            detail=f"{len(dirty.splitlines())} modified files",
        ))
    else:
        report.add(Check("repo: tree clean", "ok"))

    # A non-zero exit here usually means no upstream is configured; in that
    # case no row is emitted at all.
    rc_behind, behind, _ = git("rev-list", "--count", "HEAD..@{u}")
    if rc_behind == 0:
        if behind.isdigit() and int(behind) > 0:
            report.add(Check(
                "repo: up to date with origin",
                "warn",
                detail=f"{behind} commits behind",
                fix=f"cd {repo_root} && git pull",
            ))
        else:
            report.add(Check("repo: up to date with origin", "ok"))
# ---------------------------------------------------------------------------
# checks — install
# ---------------------------------------------------------------------------
def check_install(report: Report, role: str) -> None:
    """Check the installed tree under /opt/cis490 and the config under /etc/cis490.

    Rows cover the install root, the venv interpreter, the VERSION stamp
    (lab-host only), the role's TOML config and, for lab hosts, the env
    file. If the install root is missing, the check stops after that one
    fail row because nothing below it can be meaningful.
    """
    install_root = Path("/opt/cis490")
    reinstall = f"sudo /opt/cis490/scripts/install-{role}.sh"

    if not _path_exists(install_root):
        report.add(Check(
            "install: /opt/cis490 exists",
            "fail",
            fix=f"sudo $(pwd)/scripts/install-{role}.sh",
        ))
        return
    report.add(Check("install: /opt/cis490 exists", "ok"))

    venv_python = install_root / ".venv" / "bin" / "python"
    if _path_exists(venv_python):
        rc, ver, _ = _run([str(venv_python), "--version"])
        report.add(Check("install: venv python", "ok",
                         detail=ver if rc == 0 else "(unreadable)"))
    else:
        report.add(Check("install: venv python", "fail", fix=reinstall))

    # VERSION file — written by install-lab-host.sh on every successful
    # run. Its absence means the install never finished step 3, so the
    # orchestrator falls back to git rev-parse (or "unknown" if no .git/
    # is here either). Stamping "unknown" gets every episode rejected
    # by the receiver gate as bad-format → drained to quarantine/. The
    # fix is the same git-pull-and-reinstall as for stale code.
    if role == "lab-host":
        version_file = install_root / "VERSION"
        if not _path_exists(version_file):
            report.add(Check(
                "install: VERSION stamp",
                "fail",
                detail="missing — orchestrator will stamp 'unknown' and the "
                       "receiver gate will reject every PUT",
                fix=reinstall,
            ))
        else:
            try:
                stamp = json.loads(version_file.read_text())
            except (OSError, ValueError) as e:
                # ValueError covers both json.JSONDecodeError and a
                # UnicodeDecodeError raised while reading the file.
                report.add(Check(
                    "install: VERSION stamp",
                    "fail",
                    detail=f"unreadable: {e}",
                    fix=reinstall,
                ))
            else:
                if not isinstance(stamp, dict):
                    # Valid JSON that is not an object has no fields to read.
                    report.add(Check(
                        "install: VERSION stamp",
                        "fail",
                        detail=f"not a JSON object: {type(stamp).__name__}",
                        fix=reinstall,
                    ))
                else:
                    commit = stamp.get("commit", "")
                    branch = stamp.get("branch", "?")
                    dirty = " [dirty]" if stamp.get("dirty") else ""
                    if isinstance(commit, str) and len(commit) == 40:
                        report.add(Check(
                            "install: VERSION stamp",
                            "ok",
                            detail=f"{branch}@{commit[:8]}{dirty}",
                        ))
                    else:
                        report.add(Check(
                            "install: VERSION stamp",
                            "fail",
                            detail=f"commit field malformed: {commit!r}",
                            fix=reinstall,
                        ))

    cfg_name = "lab-host.toml" if role == "lab-host" else "receiver.toml"
    cfg = Path("/etc/cis490") / cfg_name
    if _path_exists(cfg):
        try:
            with open(cfg, "rb") as f:
                tomllib.load(f)
        except PermissionError:
            # Mode 0640 root:cis490 is the install default. Doctor often
            # runs as the unprivileged user — file is fine, we just
            # can't read it from here.
            report.add(Check(
                f"config: {cfg}",
                "warn",
                detail="exists, can't read (mode 0640 root:cis490 — re-run with sudo for full audit)",
            ))
        except (tomllib.TOMLDecodeError, UnicodeDecodeError, OSError) as e:
            # tomllib decodes the bytes itself, so invalid UTF-8 surfaces
            # as UnicodeDecodeError rather than TOMLDecodeError.
            report.add(Check(
                f"config: {cfg}",
                "fail",
                detail=str(e),
                fix=f"sudo $EDITOR {cfg}",
            ))
        else:
            report.add(Check(f"config: {cfg}", "ok", detail="parses"))
    else:
        report.add(Check(
            f"config: {cfg}",
            "fail",
            fix=f"sudo cp /opt/cis490/etc/{cfg_name}.example {cfg}",
        ))

    if role == "lab-host":
        env = Path("/etc/cis490/lab-host.env")
        if _path_exists(env):
            report.add(Check("config: lab-host.env", "ok"))
        else:
            report.add(Check(
                "config: lab-host.env",
                "fail",
                fix="sudo /opt/cis490/scripts/install-lab-host.sh "
                    "# regenerates the env file",
            ))
# ---------------------------------------------------------------------------
# checks — certs (lab-host)
# ---------------------------------------------------------------------------
def check_certs_lab_host(report: Report) -> None:
    """Check that the lab host's mTLS material exists and chains to the CA.

    Fails if any of ``wg-ca.pem``, ``lab-host.pem`` or ``lab-host.key`` is
    missing under /etc/cis490/certs. Otherwise runs ``openssl verify`` on
    the leaf against the CA. When the ``openssl`` binary is unavailable
    the chain row is a skip, because that says nothing about the certs.
    """
    base = Path("/etc/cis490/certs")
    expected = ["wg-ca.pem", "lab-host.pem", "lab-host.key"]
    missing = [n for n in expected if not _path_exists(base / n)]
    if missing:
        report.add(Check(
            f"mTLS: certs at {base}",
            "fail",
            detail=f"missing: {missing}",
            fix="On the Pi: sudo /home/max/.env/wg-pki/scripts/"
                "deploy-cis490-cert.sh <host_id> <this-machine-wg-ip>",
        ))
        return

    if shutil.which("openssl") is None:
        # Without the tool we cannot tell a good chain from a bad one;
        # telling the operator to re-issue the leaf would be misleading.
        report.add(Check(
            "mTLS: cert chain validates",
            "skip",
            detail="openssl not on PATH — cannot verify chain",
        ))
        return

    # Verify the chain.
    rc, out, err = _run([
        "openssl", "verify",
        "-CAfile", str(base / "wg-ca.pem"),
        str(base / "lab-host.pem"),
    ])
    if rc == 0 and "OK" in out:
        report.add(Check("mTLS: cert chain validates", "ok",
                         detail=out.splitlines()[0]))
    else:
        report.add(Check(
            "mTLS: cert chain validates",
            "fail",
            detail=err or out,
            fix="re-issue the leaf via wg-pki/scripts/deploy-cis490-cert.sh",
        ))
# ---------------------------------------------------------------------------
# checks — services
# ---------------------------------------------------------------------------
def check_services(report: Report, role: str) -> None:
    """Check that the systemd units for *role* report ``active``.

    ``inactive`` units get an enable-and-start fix; any other state
    (including no output at all) points the operator at the unit's journal.
    """
    if role == "receiver":
        units = ["cis490-receiver"]
    else:
        units = ["cis490-shipper", "cis490-orchestrator"]

    for unit in units:
        label = f"systemd: {unit} active"
        _, state, _ = _run(["systemctl", "is-active", unit])
        if state == "active":
            report.add(Check(label, "ok"))
            continue
        if state == "inactive":
            detail = "inactive"
            fix = f"sudo systemctl enable --now {unit}"
        else:
            detail = state or "unknown"
            fix = f"sudo journalctl -u {unit} --no-pager -n 30"
        report.add(Check(label, "fail", detail=detail, fix=fix))
# ---------------------------------------------------------------------------
# checks — recent shipping errors (lab-host)
# ---------------------------------------------------------------------------
def check_recent_shipping_errors(report: Report) -> None:
    """Scan the last 10 minutes of cis490-shipper journal for ship failures.

    Exactly one row is emitted. Priority, highest first: commit-rejected
    (412) → missing commit header (400) → any other fatal ship → more than
    five transient failures → ok. A journalctl failure (permission denied,
    tool or unit missing) produces a skip row rather than a red one, since
    the doctor often runs unprivileged.
    """
    label = "shipper: recent ship results"
    rc, out, err = _run([
        "journalctl", "-u", "cis490-shipper",
        "--since", "10 minutes ago", "--no-pager", "--output=cat",
    ])
    if rc != 0:
        # None of the possible causes here merits a red row.
        last_err = err.strip().splitlines()[-1] if err else "no output"
        report.add(Check("shipper: recent log scan", "skip", detail=last_err[:80]))
        return

    lines = out.splitlines()
    if not lines:
        report.add(Check(
            label,
            "ok",
            detail="no output in last 10 minutes (daemon may be idle)",
        ))
        return

    # Classify conservatively: only lines that explicitly name a ship
    # outcome are counted, so unrelated 400s in the log are ignored.
    fatal_400 = fatal_412 = other_fatal = transient = 0
    for ln in lines:
        missing_header = "missing X-Cis490-Code-Commit" in ln
        if missing_header:
            fatal_400 += 1
        if "412 commit-rejected" in ln or "code commit rejected" in ln:
            fatal_412 += 1
        if "ship " not in ln:
            continue
        if "fatal" in ln and not missing_header and "commit rejected" not in ln:
            other_fatal += 1
        if "transient" in ln:
            transient += 1

    if fatal_412 > 0:
        report.add(Check(
            label,
            "fail",
            detail=f"{fatal_412} ship(s) rejected as out-of-window in last 10 min",
            fix=("cd /opt/cis490 && sudo -u cis490 git pull origin main && "
                 "sudo /opt/cis490/scripts/install-lab-host.sh "
                 "# pulls new code + drains stale queue + restarts daemon"),
        ))
        return
    if fatal_400 > 0:
        report.add(Check(
            label,
            "fail",
            detail=(
                f"{fatal_400} ship(s) rejected as missing-commit-header — "
                "orchestrator is emitting episodes without code_version"
            ),
            fix=("sudo /opt/cis490/scripts/install-lab-host.sh "
                 "# rewrites VERSION + restarts orchestrator"),
        ))
        return
    if other_fatal > 0:
        report.add(Check(
            label,
            "warn",
            detail=f"{other_fatal} fatal ship(s) in last 10 min (other 4xx)",
            fix="sudo journalctl -u cis490-shipper --since '10 minutes ago' "
                "| grep -E 'ship .*fatal'",
        ))
        return
    if transient > 5:
        report.add(Check(
            label,
            "warn",
            detail=f"{transient} transient failures in last 10 min — receiver reachable?",
            fix="sudo /opt/cis490/.venv/bin/python -m shipper "
                "--config /etc/cis490/lab-host.toml --ping",
        ))
        return
    # There was output, but nothing in it matched an error pattern.
    report.add(Check(label, "ok"))
# ---------------------------------------------------------------------------
# checks — network (lab-host)
# ---------------------------------------------------------------------------
def check_network_lab_host(report: Report, cfg_path: Path) -> None:
    """Probe the path from this lab host to the receiver named in *cfg_path*.

    Steps, each stopping the chain on failure: config readable →
    ``receiver.url`` is a usable https URL → hostname resolves → TCP
    connect → TLS handshake presenting the configured client certificate.
    The handshake step is skipped when the cert paths are absent from
    the config.
    """
    # Local import: only this check needs URL parsing.
    from urllib.parse import urlsplit

    try:
        with open(cfg_path, "rb") as f:
            cfg = tomllib.load(f)
    except (OSError, tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
        report.add(Check("net: lab-host.toml readable", "fail", detail=str(e)))
        return

    receiver = cfg.get("receiver", {})
    if not isinstance(receiver, dict):
        receiver = {}
    receiver_url = receiver.get("url", "")
    url_fix = f"edit {cfg_path}: receiver.url = 'https://collector.wg'"
    if not isinstance(receiver_url, str) or not receiver_url.startswith("https://"):
        report.add(Check(
            "net: receiver.url present",
            "fail",
            detail=str(receiver_url),
            fix=url_fix,
        ))
        return

    # urlsplit copes with userinfo, IPv6 literals and an empty port;
    # a malformed authority or port raises ValueError instead of crashing
    # the doctor with a bare int() failure.
    try:
        parts = urlsplit(receiver_url)
        host = parts.hostname
        port = parts.port or 443
    except ValueError as e:
        report.add(Check(
            "net: receiver.url present",
            "fail",
            detail=f"{receiver_url}: {e}",
            fix=url_fix,
        ))
        return
    if not host:
        report.add(Check(
            "net: receiver.url present",
            "fail",
            detail=f"{receiver_url}: no hostname",
            fix=url_fix,
        ))
        return

    try:
        ip = socket.gethostbyname(host)
        report.add(Check(f"net: DNS resolve {host}", "ok",
                         detail=f"-> {ip}"))
    except socket.gaierror as e:
        report.add(Check(
            f"net: DNS resolve {host}",
            "fail",
            detail=str(e),
            fix=f"echo '10.100.0.1 {host}' | sudo tee -a /etc/hosts "
                "# wg-enroll provisions this on real lab hosts",
        ))
        return

    try:
        with socket.create_connection((host, port), timeout=5):
            report.add(Check(f"net: TCP {host}:{port} reachable", "ok"))
    except OSError as e:
        report.add(Check(
            f"net: TCP {host}:{port} reachable",
            "fail",
            detail=str(e),
            fix="check iptmonads is allowing the WG-side 443 + Caddy is up",
        ))
        return

    # mTLS handshake — pull the receiver cert paths from cfg.
    ca = receiver.get("ca_bundle")
    cert = receiver.get("client_cert")
    key = receiver.get("client_key")
    if not (ca and cert and key):
        report.add(Check("net: mTLS handshake to collector.wg",
                         "skip", detail="cert paths not in config"))
        return
    try:
        caddy_root = "/home/max/wg-pki/certs/caddy-root.crt"
        ctx = ssl.create_default_context(
            cafile=caddy_root if Path(caddy_root).exists() else None
        )
        ctx.load_cert_chain(certfile=cert, keyfile=key)
        # NOTE(review): server verification is disabled below, so this row
        # only shows that a TLS session can be set up with our client cert
        # loaded; it does not validate the receiver's certificate, and the
        # configured ca_bundle is not consulted. Confirm that is intended.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        with socket.create_connection((host, port), timeout=5) as sock:
            with ctx.wrap_socket(sock, server_hostname=host) as ssock:
                report.add(Check("net: mTLS handshake to collector.wg",
                                 "ok",
                                 detail=f"cipher={ssock.cipher()[0]}"))
    except OSError as e:
        # ssl.SSLError and FileNotFoundError are both OSError subclasses.
        report.add(Check(
            "net: mTLS handshake to collector.wg",
            "fail",
            detail=str(e),
            fix="sudo /home/max/wg-pki/scripts/deploy-cis490-cert.sh <host_id> <wg_ip> "
                "(rerun cert deploy)",
        ))
# ---------------------------------------------------------------------------
# checks — VM prereqs (lab-host)
# ---------------------------------------------------------------------------
def check_vm_prereqs(report: Report) -> None:
    """Check the host-side prerequisites for running lab VMs.

    Rows: ``/dev/kvm`` present, ``qemu-system-x86_64`` and ``zstd`` on
    PATH, and the Alpine baseline and cidata images present under
    /var/lib/cis490/vm/images. Every missing item is a fail row carrying
    its fix command. Each row keeps the same name whether it passes or
    fails, so ``--json`` consumers can key on it.
    """
    if _path_exists(Path("/dev/kvm")):
        report.add(Check("vm: /dev/kvm", "ok"))
    else:
        report.add(Check(
            "vm: /dev/kvm",
            "fail",
            fix="ensure KVM kernel module is loaded; on x86 hosts: sudo modprobe kvm-intel || sudo modprobe kvm-amd",
        ))

    # (binary, why it is needed — shown only on failure, fix)
    binaries = (
        ("qemu-system-x86_64", "",
         "install qemu-system-x86 via the host package manager"),
        ("zstd", "needed for shipper compression",
         "install zstd via the host package manager"),
    )
    for binary, purpose, fix in binaries:
        label = f"vm: {binary} on PATH"
        if shutil.which(binary) is None:
            report.add(Check(label, "fail", detail=purpose, fix=fix))
        else:
            report.add(Check(label, "ok"))

    images = Path("/var/lib/cis490/vm/images")
    # (image path, command that produces it when given the path)
    artefacts = (
        (images / "alpine-baseline.qcow2",
         "sudo /opt/cis490/scripts/fetch-alpine-baseline.sh"),
        (images / "cidata.iso",
         "sudo /opt/cis490/.venv/bin/python /opt/cis490/tools/build_cidata.py"),
    )
    for image, builder in artefacts:
        label = f"vm: {image}"
        if _path_exists(image):
            report.add(Check(label, "ok", detail=_size_str(image)))
        else:
            report.add(Check(label, "fail", fix=f"{builder} {image}"))
# ---------------------------------------------------------------------------
# checks — Tier 3 (optional)
# ---------------------------------------------------------------------------
def check_tier3(report: Report) -> None:
    """Check the optional Tier-3 (real exploit) prerequisites.

    Rows: ``msfrpcd`` on PATH, something listening on 127.0.0.1:55553, the
    module catalog loading (only when the modules directory is present),
    and the metasploitable2 image. Missing pieces are warnings; only a
    catalog that is present but fails to load is a fail row.
    """
    if shutil.which("msfrpcd") is None:
        report.add(Check(
            "tier3: msfrpcd on PATH",
            "warn",
            detail="optional — only needed for real exploit episodes",
            fix="sudo /opt/cis490/scripts/install-msfrpcd.sh",
        ))
    else:
        report.add(Check("tier3: msfrpcd on PATH", "ok"))

    # Probe whether msfrpcd is actually listening (tier-3 fleet
    # dispatch checks the same thing).
    try:
        with socket.create_connection(("127.0.0.1", 55553), timeout=0.5):
            msfrpcd_listening = True
    except OSError:
        msfrpcd_listening = False
    if msfrpcd_listening:
        report.add(Check("tier3: msfrpcd listening on 127.0.0.1:55553", "ok"))
    else:
        report.add(Check(
            "tier3: msfrpcd listening on 127.0.0.1:55553",
            "warn",
            detail="optional — fleet falls back to Tier 2 when down",
            fix="sudo systemctl enable --now cis490-msfrpcd",
        ))

    # Module catalog parses + at least one same-socket entry.
    modules_dir = Path("/opt/cis490/exploits/modules")
    # _path_exists (not Path.exists) so a permission-denied stat becomes a
    # reported row below instead of an uncaught PermissionError.
    if _path_exists(modules_dir):
        try:
            from exploits.modules import load_module_configs as _load
            catalog = _load(modules_dir)
            same_socket = [k for k, v in catalog.items() if not v.requires_bridge]
            report.add(Check(
                "tier3: module catalog parses",
                "ok",
                detail=f"{len(catalog)} modules, {len(same_socket)} same-socket "
                       f"({len(catalog) - len(same_socket)} need BRIDGE)",
            ))
        except Exception as e:
            # Deliberately broad: an import failure, a parse error or an
            # unreadable directory all mean the catalog is unusable.
            report.add(Check(
                "tier3: module catalog parses",
                "fail",
                detail=str(e),
                fix="check exploits/modules/*.toml syntax",
            ))

    images = Path("/var/lib/cis490/vm/images")
    msf2 = images / "metasploitable2.qcow2"
    if _path_exists(msf2):
        report.add(Check(f"tier3: {msf2}", "ok",
                         detail=_size_str(msf2)))
    else:
        report.add(Check(
            f"tier3: {msf2}",
            "warn",
            detail="optional — needed for Tier-3 episodes",
            fix="IMAGE_URL=… IMAGE_SHA256=… sudo /opt/cis490/scripts/fetch-metasploitable2.sh",
        ))
def check_bridge(report: Report) -> None:
    """Check that the ``br-malware`` bridge exists and is up.

    The bridge is needed for pcap capture (source 4) and for reverse/bind
    callback payloads; without it, Tier-3 episodes that pick callback
    modules fire but the session never lands. Both outcomes short of
    "up" are warnings because the bridge is optional.
    """
    rc, out, _ = _run(["ip", "-br", "link", "show", "br-malware"])
    if rc != 0 or "br-malware" not in out:
        report.add(Check(
            "bridge: br-malware exists",
            "warn",
            detail="optional — pcap capture + callback-payload Tier-3 "
                   "modules require it",
            fix="sudo /opt/cis490/vm/setup_bridge.sh",
        ))
        return

    brief = out.strip()[:80]
    if any(token in out for token in ("UP", "UNKNOWN")):
        report.add(Check("bridge: br-malware up", "ok", detail=brief))
        return
    report.add(Check(
        "bridge: br-malware up",
        "warn",
        detail=brief,
        fix="sudo ip link set br-malware up",
    ))
# ---------------------------------------------------------------------------
# checks — end to end (lab-host)
# ---------------------------------------------------------------------------
def check_end_to_end(report: Report) -> None:
    """Run the shipper's own ``--ping`` against the receiver.

    Skipped when /etc/cis490/lab-host.toml is absent. Passes only when the
    command exits 0 and its stdout contains ``"ok": true``; otherwise the
    first 200 characters of its output become the row detail.
    """
    label = "e2e: cis490-shipper --ping"
    cfg = "/etc/cis490/lab-host.toml"
    if not _path_exists(Path(cfg)):
        report.add(Check(label, "skip", detail="no lab-host.toml"))
        return

    ping_cmd = [
        "/opt/cis490/.venv/bin/python", "-m", "shipper",
        "--config", cfg, "--ping",
    ]
    rc, out, err = _run(ping_cmd, timeout=15.0, cwd="/opt/cis490")
    if rc == 0 and '"ok": true' in out:
        report.add(Check(label, "ok", detail="200 OK"))
        return
    report.add(Check(
        label,
        "fail",
        detail=(out or err)[:200],
        fix="paste this row's detail into a Forgejo issue or to the operator",
    ))
# ---------------------------------------------------------------------------
# main
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
    """Parse arguments, run every check for the chosen role, print the report.

    Returns 1 if any check failed, otherwise 0. With ``--json`` the per-row
    progressive output is suppressed and one JSON document is written to
    stdout at the end.
    """
    global _JSON_MODE

    parser = argparse.ArgumentParser(prog="cis490-doctor")
    parser.add_argument("--role", choices=("lab-host", "receiver"), default="lab-host")
    parser.add_argument("--json", action="store_true",
                        help="machine-readable output (suppresses progressive printing)")
    parser.add_argument("--no-tier3", action="store_true",
                        help="skip the optional Tier-3 prerequisite checks")
    args = parser.parse_args(argv)
    _JSON_MODE = args.json

    # Put the repo root on sys.path so check_tier3 can import the
    # project's exploits package.
    repo_root = Path(__file__).resolve().parent.parent
    root_entry = str(repo_root)
    if root_entry not in sys.path:
        sys.path.insert(0, root_entry)

    if not _JSON_MODE:
        print(f"{_ANSI_BOLD}cis490-doctor{_ANSI_RESET} role={args.role} repo={repo_root}\n")

    is_lab_host = args.role == "lab-host"
    report = Report(role=args.role)

    # Bottom-up through the stack; lab-host-only checks are interleaved
    # in their original positions.
    check_repo(report, repo_root)
    check_install(report, args.role)
    if is_lab_host:
        check_certs_lab_host(report)
    check_services(report, args.role)
    if is_lab_host:
        check_network_lab_host(report, Path("/etc/cis490/lab-host.toml"))
        check_vm_prereqs(report)
        check_bridge(report)
        if not args.no_tier3:
            check_tier3(report)
        check_recent_shipping_errors(report)
        check_end_to_end(report)

    summary = report.summary()
    exit_code = 1 if summary["fail"] else 0

    if _JSON_MODE:
        json.dump(report.to_dict(), sys.stdout, indent=2)
        print()
        return exit_code

    print()
    print(f"{_ANSI_BOLD}summary:{_ANSI_RESET} "
          f"{_ANSI_GREEN}{summary['ok']} ok{_ANSI_RESET}, "
          f"{_ANSI_YELLOW}{summary['warn']} warn{_ANSI_RESET}, "
          f"{_ANSI_RED}{summary['fail']} fail{_ANSI_RESET}, "
          f"{_ANSI_DIM}{summary['skip']} skip{_ANSI_RESET}")
    if exit_code:
        print(
            f"\n{_ANSI_BOLD}{_ANSI_RED}NOT READY.{_ANSI_RESET} "
            "Run the `fix:` commands above in order, then re-run "
            "`cis490-doctor`. When all rows are green/yellow, "
            "episodes will start shipping to the Pi."
        )
    else:
        print(
            f"\n{_ANSI_BOLD}{_ANSI_GREEN}READY.{_ANSI_RESET} "
            "Episodes should be flowing. Watch:\n"
            " sudo journalctl -u cis490-shipper -f\n"
            " ssh <pi> 'sudo tail -f /var/lib/cis490/index.jsonl'"
        )
    return exit_code
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())