Three robustness items off the future-work list:
1. Shipper sd_notify watchdog. Type=notify + WatchdogSec=180. The
daemon sends READY=1 after queue construction and WATCHDOG=1 once
per scan pass via a heartbeat callback wired into run_forever.
Restart=on-failure only catches process death — silent stalls
(deadlock, hung tar subprocess, blocked I/O past timeout) used to
leave a hung-but-still-alive daemon in place while the data backlog
kept growing. Now systemd kills and restarts the daemon if no
WATCHDOG=1 arrives within 180s.
Verified end-to-end against systemd via `systemd-run --transient
--property=Type=notify --property=WatchdogSec=10`: unit transitions
to active on READY=1; SIGSTOP'ing the process triggers
`Watchdog timeout (limit 10s)! Killing process N with SIGABRT` at
exactly t+10s, then unit goes failed → restart cycle.
2. Quarantine cleanup. Without an upper bound, data/quarantine/ grew
forever as fatal episodes piled up. New ShipperConfig fields:
quarantine_keep_days = 30 # opt-out: 0 disables
quarantine_cleanup_interval_s = 3600 # gate so 5s tick doesn't
# statx() the whole tree
Cleanup runs at the start of run_once() but is gated to once per
hour. Removed entries logged.
3. Doctor surfaces shipping errors. Tails 10 minutes of cis490-shipper
journal and surfaces 412/400/transient patterns as red/yellow rows
with the canonical fix command. An on-device agent running
cis490_doctor.py now sees one line ("12 ship(s) rejected as
out-of-window") instead of needing to grep the journal.
Tests: 200/200 (was 188). New coverage: heartbeat callback fires and
survives exceptions; quarantine cleanup respects keep_days, the
interval gate, and the opt-out; doctor parser correctly classifies
412/400/transient/clean/empty/journalctl-denied; when 412 and 400
errors appear together, the 412 row takes priority (it is the more
actionable one).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
777 lines
28 KiB
Python
777 lines
28 KiB
Python
"""``cis490-doctor`` — single-command diagnostic for a lab host or receiver.

Walks the full bring-up stack from the bottom up and prints a
green/yellow/red checklist with the exact command that fixes each
red row. Run this whenever:

- you just cloned the repo and aren't sure what's missing
- you ran install-lab-host.sh but `index.jsonl` on the Pi is empty
- somebody filed an issue saying "shipping isn't working"

Usage:
    uv run python tools/cis490_doctor.py                  # human output
    uv run python tools/cis490_doctor.py --json           # machine-readable
    uv run python tools/cis490_doctor.py --role lab-host  # default
    uv run python tools/cis490_doctor.py --role receiver

Exits non-zero if any RED check fails.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import dataclasses
|
|
import json
|
|
import os
|
|
import shutil
|
|
import socket
|
|
import ssl
|
|
import subprocess
|
|
import sys
|
|
import tomllib
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
|
|
# ANSI color codes; auto-disable on non-tty.
|
|
def _supports_color() -> bool:
    """Return True when ANSI colour escapes should be written to stdout.

    Colour is enabled only when stdout is a terminal and the ``NO_COLOR``
    environment variable is unset or empty. The NO_COLOR convention defines
    the opt-out as "present and not an empty string", so an exported but
    empty ``NO_COLOR=`` must not switch colour off.
    """
    if os.environ.get("NO_COLOR"):
        return False
    return sys.stdout.isatty()
|
|
|
|
|
|
# Escape sequences used by the renderer. Every one of them collapses to ""
# when colour is off, so f-strings can embed them unconditionally.
(
    _ANSI_GREEN,
    _ANSI_YELLOW,
    _ANSI_RED,
    _ANSI_BOLD,
    _ANSI_DIM,
    _ANSI_RESET,
) = (
    ("\033[32m", "\033[33m", "\033[31m", "\033[1m", "\033[2m", "\033[0m")
    if _supports_color()
    else ("", "", "", "", "", "")
)
|
|
|
|
|
|
@dataclass
class Check:
    """One row of the doctor checklist.

    ``status`` is one of "ok", "warn", "fail" or "skip". ``detail`` is
    optional context printed after the name; ``fix`` is the command printed
    underneath a failing row.
    """

    name: str
    status: str  # "ok" | "warn" | "fail" | "skip"
    detail: str = ""
    fix: str = ""

    def render(self) -> str:
        """Format the row for the terminal.

        Returns the glyph and name, followed by the dimmed detail when there
        is one, and a second ``fix:`` line when the row failed and carries a
        fix. Raises ``KeyError`` for a status outside the four known values.
        """
        colour, mark = {
            "ok": (_ANSI_GREEN, "[✓]"),
            "warn": (_ANSI_YELLOW, "[!]"),
            "fail": (_ANSI_RED, "[✗]"),
            "skip": (_ANSI_DIM, "[-]"),
        }[self.status]
        pieces = [f"{colour}{mark}{_ANSI_RESET} {self.name}"]
        if self.detail:
            pieces.append(f" {_ANSI_DIM}{self.detail}{_ANSI_RESET}")
        # Only failing rows show their fix; warn/skip rows stay single-line.
        if self.fix and self.status == "fail":
            pieces.append(f"\n {_ANSI_BOLD}fix:{_ANSI_RESET} {self.fix}")
        return "".join(pieces)
|
|
|
|
|
|
@dataclass
class Report:
    """Ordered collection of the checks produced by one doctor run."""

    role: str
    checks: list[Check] = field(default_factory=list)

    def add(self, c: Check) -> None:
        """Record ``c`` and, outside JSON mode, print it straight away."""
        self.checks.append(c)
        if _JSON_MODE:
            return
        # Progressive output: if a later check hangs, the operator still
        # has every row produced up to that point.
        print(c.render(), flush=True)

    def to_dict(self) -> dict:
        """Return the role, every check as a plain dict, and the summary."""
        serialised = [dataclasses.asdict(entry) for entry in self.checks]
        return {
            "role": self.role,
            "checks": serialised,
            "summary": self.summary(),
        }

    def summary(self) -> dict:
        """Count checks per status.

        The four known statuses are always present (possibly 0); any other
        status value encountered is added as an extra key.
        """
        tally = dict.fromkeys(("ok", "warn", "fail", "skip"), 0)
        for entry in self.checks:
            tally.setdefault(entry.status, 0)
            tally[entry.status] += 1
        return tally
|
|
|
|
|
|
# Set by main() from --json. While True, Report.add() does not print each
# check as it is recorded, so stdout carries only the final JSON document.
_JSON_MODE = False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _run(cmd: list[str], *, timeout: float = 5.0, cwd: str | None = None) -> tuple[int, str, str]:
    """Run ``cmd`` and return ``(returncode, stdout, stderr)`` without raising.

    stdout and stderr are captured as text and stripped; bytes that are not
    valid in the default encoding are replaced, not raised on, so odd log
    content cannot abort a check.

    If the process cannot be started at all (missing binary, file not
    executable, unusable ``cwd`` — any ``OSError``) or does not finish within
    ``timeout`` seconds, the result is ``(-1, "", str(error))``.
    """
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=timeout,
            cwd=cwd,
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        # OSError covers FileNotFoundError as well as PermissionError and
        # NotADirectoryError, which an unprivileged run can easily hit.
        return -1, "", str(exc)
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
|
|
|
|
|
|
def _path_exists(p: Path) -> bool:
    """Report whether ``p`` is present, counting "permission denied" as present."""
    try:
        found = p.exists()
    except PermissionError:
        # We were refused a look at the path; treat unreadable-but-present
        # as present so the caller does not report it missing.
        found = True
    return found
|
|
|
|
|
|
def _size_str(p: Path) -> str:
    """Return the size of ``p`` in whole MiB (rounded down) as ``"<n> MiB"``.

    When the file cannot be stat'ed, a hint to re-run with sudo is returned
    in place of a size.
    """
    try:
        size_bytes = p.stat().st_size
    except OSError:  # PermissionError is an OSError subclass
        return "(stat denied — re-run with sudo for size)"
    whole_mib = size_bytes // (1024 * 1024)
    return f"{whole_mib} MiB"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — repo
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_repo(report: Report, repo_root: Path) -> None:
    """Report the state of the git checkout at ``repo_root``.

    Adds rows for: being on ``main``, a clean working tree, and how far HEAD
    is behind its upstream. When ``repo_root`` has no ``.git`` a single
    warning is added and nothing else is checked. When git itself cannot be
    run or errors out, a single warning carrying git's message is added, so
    the remaining rows are never derived from empty output.
    """
    if not (repo_root / ".git").exists():
        report.add(Check(
            "repo: .git directory present",
            "warn",
            detail=f"running from {repo_root} which isn't a git checkout — fine for /opt/cis490 (cp -aT'd) but not the source clone",
        ))
        return

    def git(*args: str) -> tuple[int, str, str]:
        return _run(["git", "-C", str(repo_root), *args])

    rc_head, head, err_head = git("rev-parse", "--short=8", "HEAD")
    rc_branch, branch, err_branch = git("rev-parse", "--abbrev-ref", "HEAD")
    if rc_head != 0 or rc_branch != 0:
        # git is missing or refuses to operate on this checkout. Carrying on
        # with empty strings would report a bogus "@:" branch and a false
        # "tree clean", so stop with one honest row.
        message = (err_head or err_branch or "git failed").splitlines()[-1]
        report.add(Check(
            "repo: git metadata readable",
            "warn",
            detail=message[:120],
        ))
        return

    rc_status, dirty, _ = git("status", "--porcelain")
    _, log, _ = git("log", "-1", "--format=%s")
    detail = f"{branch}@{head}: {log[:60]}"
    if branch != "main":
        report.add(Check(
            "repo: on main",
            "warn",
            detail=detail,
            fix=f"cd {repo_root} && git fetch && git checkout main && git pull",
        ))
    else:
        report.add(Check("repo: on main", "ok", detail=detail))

    if rc_status != 0:
        # Empty output from a failed/timed-out `git status` is not "clean".
        report.add(Check("repo: tree clean", "skip", detail="git status failed"))
    elif dirty:
        report.add(Check(
            "repo: tree clean",
            "warn",
            detail=f"{len(dirty.splitlines())} modified files",
        ))
    else:
        report.add(Check("repo: tree clean", "ok"))

    # No row at all when the command fails (e.g. no upstream configured).
    rc_behind, behind, _ = git("rev-list", "--count", "HEAD..@{u}")
    if rc_behind == 0 and behind.isdigit() and int(behind) > 0:
        report.add(Check(
            "repo: up to date with origin",
            "warn",
            detail=f"{behind} commits behind",
            fix=f"cd {repo_root} && git pull",
        ))
    elif rc_behind == 0:
        report.add(Check("repo: up to date with origin", "ok"))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — install
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_install(report: Report, role: str) -> None:
    """Check the on-disk install under /opt/cis490 and its config in /etc/cis490.

    Rows added, in order: install root present (returns early when it is
    not), venv python, VERSION stamp (lab-host only), the role's TOML config,
    and lab-host.env (lab-host only).

    ``role`` is "lab-host" or "receiver"; it selects the install script named
    in fix commands and the config file that is inspected.
    """
    install_root = Path("/opt/cis490")
    reinstall = f"sudo /opt/cis490/scripts/install-{role}.sh"

    if not _path_exists(install_root):
        report.add(Check(
            "install: /opt/cis490 exists",
            "fail",
            fix=f"sudo $(pwd)/scripts/install-{role}.sh",
        ))
        return
    report.add(Check("install: /opt/cis490 exists", "ok"))

    venv_python = install_root / ".venv" / "bin" / "python"
    if _path_exists(venv_python):
        rc, ver, _ = _run([str(venv_python), "--version"])
        report.add(Check("install: venv python", "ok",
                         detail=ver if rc == 0 else "(unreadable)"))
    else:
        report.add(Check("install: venv python", "fail", fix=reinstall))

    # VERSION file — written by install-lab-host.sh on every successful
    # run. Its absence means the install never finished step 3, so the
    # orchestrator falls back to git rev-parse (or "unknown" if no .git/
    # is here either). Stamping "unknown" gets every episode rejected
    # by the receiver gate as bad-format → drained to quarantine/. The
    # fix is the same git-pull-and-reinstall as for stale code.
    version_file = install_root / "VERSION"
    if role == "lab-host" and _path_exists(version_file):
        try:
            v = json.loads(version_file.read_text())
            if not isinstance(v, dict):
                # Valid JSON that is not an object (list, string, number…)
                # has no .get(); report it instead of crashing.
                raise ValueError(f"expected a JSON object, got {type(v).__name__}")
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            report.add(Check(
                "install: VERSION stamp",
                "fail",
                detail=f"unreadable: {e}",
                fix=reinstall,
            ))
        else:
            commit = v.get("commit", "")
            branch = v.get("branch", "?")
            dirty = " [dirty]" if v.get("dirty") else ""
            if isinstance(commit, str) and len(commit) == 40:
                report.add(Check(
                    "install: VERSION stamp",
                    "ok",
                    detail=f"{branch}@{commit[:8]}{dirty}",
                ))
            else:
                report.add(Check(
                    "install: VERSION stamp",
                    "fail",
                    detail=f"commit field malformed: {commit!r}",
                    fix=reinstall,
                ))
    elif role == "lab-host":
        report.add(Check(
            "install: VERSION stamp",
            "fail",
            detail="missing — orchestrator will stamp 'unknown' and the "
                   "receiver gate will reject every PUT",
            fix=reinstall,
        ))

    cfg_name = "lab-host.toml" if role == "lab-host" else "receiver.toml"
    cfg = Path("/etc/cis490") / cfg_name
    if _path_exists(cfg):
        try:
            with open(cfg, "rb") as f:
                tomllib.load(f)
            report.add(Check(f"config: {cfg}", "ok", detail="parses"))
        except PermissionError:
            # Mode 0640 root:cis490 is the install default. Doctor often
            # runs as the unprivileged user — file is fine, we just
            # can't read it from here.
            report.add(Check(
                f"config: {cfg}",
                "warn",
                detail="exists, can't read (mode 0640 root:cis490 — re-run with sudo for full audit)",
            ))
        except (tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
            # A file that is not valid UTF-8 fails before TOML parsing starts.
            report.add(Check(
                f"config: {cfg}",
                "fail",
                detail=str(e),
                fix=f"sudo $EDITOR {cfg}",
            ))
    else:
        report.add(Check(
            f"config: {cfg}",
            "fail",
            fix=f"sudo cp /opt/cis490/etc/{cfg_name}.example {cfg}",
        ))

    if role == "lab-host":
        env = Path("/etc/cis490/lab-host.env")
        if _path_exists(env):
            report.add(Check("config: lab-host.env", "ok"))
        else:
            report.add(Check(
                "config: lab-host.env",
                "fail",
                fix="sudo /opt/cis490/scripts/install-lab-host.sh "
                    "# regenerates the env file",
            ))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — certs (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_certs_lab_host(report: Report) -> None:
    """Check that the lab host's mTLS material exists and chains to the CA.

    Adds one failing row and stops when any of the three expected files is
    absent from /etc/cis490/certs; otherwise runs ``openssl verify`` on the
    leaf against the CA file and reports the outcome.
    """
    base = Path("/etc/cis490/certs")
    absent = []
    for filename in ("wg-ca.pem", "lab-host.pem", "lab-host.key"):
        if not _path_exists(base / filename):
            absent.append(filename)
    if absent:
        report.add(Check(
            f"mTLS: certs at {base}",
            "fail",
            detail=f"missing: {absent}",
            fix="On the Pi: sudo /home/max/.env/wg-pki/scripts/"
                "deploy-cis490-cert.sh <host_id> <this-machine-wg-ip>",
        ))
        return

    # Verify the leaf against the CA.
    verify_cmd = [
        "openssl", "verify",
        "-CAfile", str(base / "wg-ca.pem"),
        str(base / "lab-host.pem"),
    ]
    rc, out, err = _run(verify_cmd)
    title = "mTLS: cert chain validates"
    if rc != 0 or "OK" not in out:
        report.add(Check(
            title,
            "fail",
            detail=err or out,
            fix="re-issue the leaf via wg-pki/scripts/deploy-cis490-cert.sh",
        ))
        return
    report.add(Check(title, "ok", detail=out.splitlines()[0]))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — services
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_services(report: Report, role: str) -> None:
    """Report whether each systemd unit for ``role`` is active.

    The receiver role has one unit; any other role checks the shipper and
    the orchestrator. An "inactive" unit gets an enable command as its fix;
    any other non-active state points at the unit's journal.
    """
    if role == "receiver":
        units = ["cis490-receiver"]
    else:
        units = ["cis490-shipper", "cis490-orchestrator"]

    for unit in units:
        _, state, _ = _run(["systemctl", "is-active", unit])
        title = f"systemd: {unit} active"
        if state == "active":
            report.add(Check(title, "ok"))
            continue
        if state == "inactive":
            detail = "inactive"
            fix = f"sudo systemctl enable --now {unit}"
        else:
            # Empty output (systemctl could not be run) shows as "unknown".
            detail = state or "unknown"
            fix = f"sudo journalctl -u {unit} --no-pager -n 30"
        report.add(Check(title, "fail", detail=detail, fix=fix))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — recent shipping errors (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_recent_shipping_errors(report: Report) -> None:
    """Scan the last 10 minutes of the cis490-shipper journal for ship failures.

    Exactly one row is added:

    - skip — journalctl returned non-zero (no permission, not installed,
      unit missing); none of these merit a red row
    - ok — no output at all, or output with no recognised error pattern
    - fail — at least one 412 (commit rejected); checked first
    - fail — at least one 400 (missing commit header)
    - warn — some other fatal ship result
    - warn — more than 5 transient failures
    """
    rc, out, err = _run([
        "journalctl", "-u", "cis490-shipper",
        "--since", "10 minutes ago", "--no-pager", "--output=cat",
    ])
    if rc != 0:
        last_err_line = err.strip().splitlines()[-1] if err else "no output"
        report.add(Check(
            "shipper: recent log scan",
            "skip",
            detail=last_err_line[:80],
        ))
        return

    title = "shipper: recent ship results"
    lines = out.splitlines()
    if not lines:
        report.add(Check(
            title,
            "ok",
            detail="no output in last 10 minutes (daemon may be idle)",
        ))
        return

    # Deliberately conservative matching: only lines that explicitly
    # identify a ship outcome are counted, so unrelated 400s in the log
    # do not register as failures.
    fatal_400 = fatal_412 = other_fatal = transient = 0
    for entry in lines:
        missing_header = "missing X-Cis490-Code-Commit" in entry
        is_ship = "ship " in entry
        if missing_header:
            fatal_400 += 1
        if "412 commit-rejected" in entry or "code commit rejected" in entry:
            fatal_412 += 1
        if (is_ship and "fatal" in entry
                and not missing_header
                and "commit rejected" not in entry):
            other_fatal += 1
        if is_ship and "transient" in entry:
            transient += 1

    if fatal_412 > 0:
        report.add(Check(
            title,
            "fail",
            detail=f"{fatal_412} ship(s) rejected as out-of-window in last 10 min",
            fix=("cd /opt/cis490 && sudo -u cis490 git pull origin main && "
                 "sudo /opt/cis490/scripts/install-lab-host.sh "
                 "# pulls new code + drains stale queue + restarts daemon"),
        ))
    elif fatal_400 > 0:
        report.add(Check(
            title,
            "fail",
            detail=(
                f"{fatal_400} ship(s) rejected as missing-commit-header — "
                "orchestrator is emitting episodes without code_version"
            ),
            fix=("sudo /opt/cis490/scripts/install-lab-host.sh "
                 "# rewrites VERSION + restarts orchestrator"),
        ))
    elif other_fatal > 0:
        report.add(Check(
            title,
            "warn",
            detail=f"{other_fatal} fatal ship(s) in last 10 min (other 4xx)",
            fix="sudo journalctl -u cis490-shipper --since '10 minutes ago' "
                "| grep -E 'ship .*fatal'",
        ))
    elif transient > 5:
        report.add(Check(
            title,
            "warn",
            detail=f"{transient} transient failures in last 10 min — receiver reachable?",
            fix="sudo /opt/cis490/.venv/bin/python -m shipper "
                "--config /etc/cis490/lab-host.toml --ping",
        ))
    else:
        # There was output, but nothing matched an error pattern.
        report.add(Check(title, "ok"))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — network (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_network_lab_host(report: Report, cfg_path: Path) -> None:
    """Check the network path from this lab host to the configured receiver.

    Reads ``receiver.url`` from the TOML file at ``cfg_path`` and then, in
    order, checks DNS resolution, TCP reachability, and a TLS handshake that
    presents the configured client certificate. Each stage adds one row; the
    first failing stage ends the check.

    The URL is parsed with ``urllib.parse.urlsplit``, so a malformed or
    out-of-range port, a missing host, or a non-string value produces a
    failing "receiver.url" row; it does not raise. The host name is
    lower-cased by the parser.
    """
    from urllib.parse import urlsplit  # only this check needs URL parsing

    try:
        with open(cfg_path, "rb") as f:
            cfg = tomllib.load(f)
    except (OSError, tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
        report.add(Check("net: lab-host.toml readable", "fail", detail=str(e)))
        return

    receiver = cfg.get("receiver", {})
    if not isinstance(receiver, dict):
        # `receiver = "..."` in place of a [receiver] table: same as missing.
        receiver = {}

    receiver_url = receiver.get("url", "")
    host = None
    port = 443
    if isinstance(receiver_url, str) and receiver_url.startswith("https://"):
        try:
            parts = urlsplit(receiver_url)
            host = parts.hostname
            if parts.port is not None:
                port = parts.port
        except ValueError:
            # urlsplit / .port reject bad IPv6 brackets and invalid ports.
            host = None
    if not host:
        report.add(Check(
            "net: receiver.url present",
            "fail",
            detail=str(receiver_url),
            fix=f"edit {cfg_path}: receiver.url = 'https://collector.wg'",
        ))
        return

    try:
        ip = socket.gethostbyname(host)
        report.add(Check(f"net: DNS resolve {host}", "ok",
                         detail=f"-> {ip}"))
    except socket.gaierror as e:
        report.add(Check(
            f"net: DNS resolve {host}",
            "fail",
            detail=str(e),
            fix=f"echo '10.100.0.1 {host}' | sudo tee -a /etc/hosts "
                "# wg-enroll provisions this on real lab hosts",
        ))
        return

    try:
        with socket.create_connection((host, port), timeout=5):
            report.add(Check(f"net: TCP {host}:{port} reachable", "ok"))
    except OSError as e:
        report.add(Check(
            f"net: TCP {host}:{port} reachable",
            "fail",
            detail=str(e),
            fix="check iptmonads is allowing the WG-side 443 + Caddy is up",
        ))
        return

    # mTLS handshake — pull the receiver cert paths from cfg.
    ca = receiver.get("ca_bundle")
    cert = receiver.get("client_cert")
    key = receiver.get("client_key")
    if not (ca and cert and key):
        report.add(Check("net: mTLS handshake to collector.wg",
                         "skip", detail="cert paths not in config"))
        return
    try:
        # os.access() answers False and never raises when the path (or a
        # parent directory) is off-limits, so an unprivileged run does not
        # turn "can't see the optional CA file" into a handshake failure.
        caddy_root = "/home/max/wg-pki/certs/caddy-root.crt"
        ctx = ssl.create_default_context(
            cafile=caddy_root if os.access(caddy_root, os.R_OK) else None
        )
        ctx.load_cert_chain(certfile=cert, keyfile=key)
        # NOTE(review): server verification is switched off here and the
        # configured ca_bundle is not used, so this row only shows that a
        # handshake presenting the client cert completes — confirm that is
        # the intent before relying on it as proof of server identity.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        with socket.create_connection((host, port), timeout=5) as sock:
            with ctx.wrap_socket(sock, server_hostname=host) as ssock:
                report.add(Check("net: mTLS handshake to collector.wg",
                                 "ok",
                                 detail=f"cipher={ssock.cipher()[0]}"))
    except (ssl.SSLError, OSError) as e:
        report.add(Check(
            "net: mTLS handshake to collector.wg",
            "fail",
            detail=str(e),
            fix="sudo /home/max/wg-pki/scripts/deploy-cis490-cert.sh <host_id> <wg_ip> "
                "(rerun cert deploy)",
        ))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — VM prereqs (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_vm_prereqs(report: Report) -> None:
    """Check what the lab host needs to boot VMs.

    Rows, in order: /dev/kvm, qemu-system-x86_64 on PATH, zstd on PATH, the
    Alpine baseline image, and the cidata ISO. Every missing item is a
    failing row with the command that provides it.
    """
    if _path_exists(Path("/dev/kvm")):
        report.add(Check("vm: /dev/kvm", "ok"))
    else:
        report.add(Check(
            "vm: /dev/kvm",
            "fail",
            fix="ensure KVM kernel module is loaded; on x86 hosts: sudo modprobe kvm-intel || sudo modprobe kvm-amd",
        ))

    # (binary, row name when found, row name when missing, fix).
    # The zstd row is named differently in its two outcomes; both names are
    # kept exactly as they are.
    required_binaries = (
        (
            "qemu-system-x86_64",
            "vm: qemu-system-x86_64 on PATH",
            "vm: qemu-system-x86_64 on PATH",
            "install qemu-system-x86 via the host package manager",
        ),
        (
            "zstd",
            "vm: zstd on PATH",
            "vm: zstd on PATH (shipper compression)",
            "install zstd via the host package manager",
        ),
    )
    for binary, found_name, missing_name, fix in required_binaries:
        if shutil.which(binary) is not None:
            report.add(Check(found_name, "ok"))
        else:
            report.add(Check(missing_name, "fail", fix=fix))

    images = Path("/var/lib/cis490/vm/images")
    alpine = images / "alpine-baseline.qcow2"
    cidata = images / "cidata.iso"
    required_images = (
        (alpine, f"sudo /opt/cis490/scripts/fetch-alpine-baseline.sh {alpine}"),
        (cidata, f"sudo /opt/cis490/.venv/bin/python /opt/cis490/tools/build_cidata.py {cidata}"),
    )
    for image, fix in required_images:
        if _path_exists(image):
            report.add(Check(f"vm: {image}", "ok", detail=_size_str(image)))
        else:
            report.add(Check(f"vm: {image}", "fail", fix=fix))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — Tier 3 (optional)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_tier3(report: Report) -> None:
    """Check the optional Tier-3 prerequisites.

    Rows: msfrpcd on PATH, msfrpcd accepting connections on 127.0.0.1:55553,
    the exploit module catalog (only when its directory is present), and the
    metasploitable2 image. Missing optional pieces are warnings; only a
    module catalog that fails to load is a failure.
    """
    if shutil.which("msfrpcd") is None:
        report.add(Check(
            "tier3: msfrpcd on PATH",
            "warn",
            detail="optional — only needed for real exploit episodes",
            fix="sudo /opt/cis490/scripts/install-msfrpcd.sh",
        ))
    else:
        report.add(Check("tier3: msfrpcd on PATH", "ok"))

    # Probe whether msfrpcd is actually listening (tier-3 fleet
    # dispatch checks the same thing).
    try:
        with socket.create_connection(("127.0.0.1", 55553), timeout=0.5):
            msfrpcd_listening = True
    except OSError:
        # Refused / timed out: not listening. Optional, so a warning below.
        msfrpcd_listening = False
    if msfrpcd_listening:
        report.add(Check("tier3: msfrpcd listening on 127.0.0.1:55553", "ok"))
    else:
        report.add(Check(
            "tier3: msfrpcd listening on 127.0.0.1:55553",
            "warn",
            detail="optional — fleet falls back to Tier 2 when down",
            fix="sudo systemctl enable --now cis490-msfrpcd",
        ))

    # Module catalog loads; report how many entries need the bridge.
    modules_dir = Path("/opt/cis490/exploits/modules")
    # _path_exists (not Path.exists) so a permission error on the install
    # tree cannot abort the run; an unreadable directory then surfaces as
    # a failing catalog row through the handler below.
    if _path_exists(modules_dir):
        try:
            from exploits.modules import load_module_configs as _load
            catalog = _load(modules_dir)
            same_socket = [k for k, v in catalog.items() if not v.requires_bridge]
            report.add(Check(
                "tier3: module catalog parses",
                "ok",
                detail=f"{len(catalog)} modules, {len(same_socket)} same-socket "
                       f"({len(catalog) - len(same_socket)} need BRIDGE)",
            ))
        except Exception as e:
            # Deliberately broad: this imports and runs project code, and
            # any failure there should become a row, not a traceback.
            report.add(Check(
                "tier3: module catalog parses",
                "fail",
                detail=str(e),
                fix="check exploits/modules/*.toml syntax",
            ))

    images = Path("/var/lib/cis490/vm/images")
    msf2 = images / "metasploitable2.qcow2"
    if _path_exists(msf2):
        report.add(Check(f"tier3: {msf2}", "ok",
                         detail=_size_str(msf2)))
    else:
        report.add(Check(
            f"tier3: {msf2}",
            "warn",
            detail="optional — needed for Tier-3 episodes",
            fix="IMAGE_URL=… IMAGE_SHA256=… sudo /opt/cis490/scripts/fetch-metasploitable2.sh",
        ))
|
|
|
|
|
|
def check_bridge(report: Report) -> None:
    """Report whether the ``br-malware`` bridge exists and is up.

    Pcap capture (source 4) and reverse/bind callback payloads both need
    the bridge; without it, Tier-3 episodes that pick callback modules
    will fire but the session never lands. Every non-ok outcome here is a
    warning, not a failure.
    """
    rc, out, _ = _run(["ip", "-br", "link", "show", "br-malware"])
    if rc != 0 or "br-malware" not in out:
        report.add(Check(
            "bridge: br-malware exists",
            "warn",
            detail="optional — pcap capture + callback-payload Tier-3 "
                   "modules require it",
            fix="sudo /opt/cis490/vm/setup_bridge.sh",
        ))
        return

    summary_line = out.strip()[:80]
    # Substring match over the whole brief line, not just a state column.
    looks_up = any(token in out for token in ("UP", "UNKNOWN"))
    if looks_up:
        report.add(Check("bridge: br-malware up", "ok", detail=summary_line))
    else:
        report.add(Check(
            "bridge: br-malware up",
            "warn",
            detail=summary_line,
            fix="sudo ip link set br-malware up",
        ))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — end to end (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_end_to_end(report: Report) -> None:
    """Run the installed shipper's ``--ping`` and report the outcome.

    Skipped when /etc/cis490/lab-host.toml is absent. The ping counts as
    successful only when the command exits 0 and its stdout contains
    ``"ok": true``; otherwise the first 200 characters of its output (or of
    stderr when stdout is empty) become the row's detail.
    """
    title = "e2e: cis490-shipper --ping"
    cfg = "/etc/cis490/lab-host.toml"
    if not _path_exists(Path(cfg)):
        report.add(Check(title, "skip", detail="no lab-host.toml"))
        return

    ping_cmd = [
        "/opt/cis490/.venv/bin/python", "-m", "shipper",
        "--config", cfg, "--ping",
    ]
    rc, out, err = _run(ping_cmd, timeout=15.0, cwd="/opt/cis490")
    succeeded = rc == 0 and '"ok": true' in out
    if not succeeded:
        report.add(Check(
            title,
            "fail",
            detail=(out or err)[:200],
            fix="paste this row's detail into a Forgejo issue or to the operator",
        ))
        return
    report.add(Check(title, "ok", detail="200 OK"))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Parse arguments, run every check for the chosen role, print the result.

    ``argv`` defaults to the process arguments. Returns 1 when at least one
    check failed and 0 otherwise. With ``--json`` a single JSON document is
    written after all checks ran; without it each row is printed as it is
    produced, followed by a summary and a READY / NOT READY verdict.
    """
    global _JSON_MODE

    parser = argparse.ArgumentParser(prog="cis490-doctor")
    parser.add_argument("--role", choices=("lab-host", "receiver"), default="lab-host")
    parser.add_argument(
        "--json",
        action="store_true",
        help="machine-readable output (suppresses progressive printing)",
    )
    parser.add_argument(
        "--no-tier3",
        action="store_true",
        help="skip the optional Tier-3 prerequisite checks",
    )
    args = parser.parse_args(argv)
    _JSON_MODE = args.json
    on_lab_host = args.role == "lab-host"

    # Put the repo root (two levels above this file) on sys.path so the
    # Tier-3 check can import the project's `exploits` package.
    repo_root = Path(__file__).resolve().parent.parent
    root_entry = str(repo_root)
    if root_entry not in sys.path:
        sys.path.insert(0, root_entry)
    if not _JSON_MODE:
        print(f"{_ANSI_BOLD}cis490-doctor{_ANSI_RESET} role={args.role} repo={repo_root}\n")

    report = Report(role=args.role)
    check_repo(report, repo_root)
    check_install(report, args.role)
    if on_lab_host:
        check_certs_lab_host(report)
    check_services(report, args.role)
    if on_lab_host:
        check_network_lab_host(report, Path("/etc/cis490/lab-host.toml"))
        check_vm_prereqs(report)
        check_bridge(report)
        if not args.no_tier3:
            check_tier3(report)
        check_recent_shipping_errors(report)
        check_end_to_end(report)

    summary = report.summary()
    exit_code = 1 if summary["fail"] else 0

    if _JSON_MODE:
        json.dump(report.to_dict(), sys.stdout, indent=2)
        print()
        return exit_code

    print()
    print(f"{_ANSI_BOLD}summary:{_ANSI_RESET} "
          f"{_ANSI_GREEN}{summary['ok']} ok{_ANSI_RESET}, "
          f"{_ANSI_YELLOW}{summary['warn']} warn{_ANSI_RESET}, "
          f"{_ANSI_RED}{summary['fail']} fail{_ANSI_RESET}, "
          f"{_ANSI_DIM}{summary['skip']} skip{_ANSI_RESET}")
    if exit_code:
        verdict = (
            f"\n{_ANSI_BOLD}{_ANSI_RED}NOT READY.{_ANSI_RESET} "
            "Run the `fix:` commands above in order, then re-run "
            "`cis490-doctor`. When all rows are green/yellow, "
            "episodes will start shipping to the Pi."
        )
    else:
        verdict = (
            f"\n{_ANSI_BOLD}{_ANSI_GREEN}READY.{_ANSI_RESET} "
            "Episodes should be flowing. Watch:\n"
            " sudo journalctl -u cis490-shipper -f\n"
            " ssh <pi> 'sudo tail -f /var/lib/cis490/index.jsonl'"
        )
    print(verdict)
    return exit_code
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|