receiver.toml.example: the local_repo_path comment was wrong about when it kicks in. With the new fallback path, it's used both when forgejo_url is unset (sole backend) AND when forgejo is unreachable (failover). Document that, plus the auto-detect of /opt/cis490/.git. cis490_doctor: add a VERSION-stamp check for lab-host role. If /opt/cis490/VERSION is missing or malformed, the orchestrator stamps "unknown" → receiver gate rejects every PUT → quarantine. Surface this as a red row with the canonical fix (re-run install-lab-host.sh) so an on-device agent doesn't have to grep journal logs to figure it out. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
682 lines
24 KiB
Python
682 lines
24 KiB
Python
"""``cis490-doctor`` — single-command diagnostic for a lab host or receiver.

Walks the full bring-up stack from the bottom up and prints a
green/yellow/red checklist with the exact command that fixes each
red row. Run this whenever:

  - you just cloned the repo and aren't sure what's missing
  - you ran install-lab-host.sh but `index.jsonl` on the Pi is empty
  - somebody filed an issue saying "shipping isn't working"

Usage:
    uv run python tools/cis490_doctor.py                  # human output
    uv run python tools/cis490_doctor.py --json           # machine-readable
    uv run python tools/cis490_doctor.py --role lab-host  # default
    uv run python tools/cis490_doctor.py --role receiver

Exits non-zero if any RED check fails.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import dataclasses
|
|
import json
|
|
import os
|
|
import shutil
|
|
import socket
|
|
import ssl
|
|
import subprocess
|
|
import sys
|
|
import tomllib
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
|
|
# ANSI color codes; auto-disable on non-tty.
|
|
def _supports_color() -> bool:
    """Return True when ANSI colour escapes should be written to stdout.

    Colour is enabled only when stdout is a terminal and the ``NO_COLOR``
    environment variable is unset or empty. Per the NO_COLOR convention an
    empty value does not count as a request to disable colour, so only a
    non-empty value switches it off.
    """
    if os.environ.get("NO_COLOR"):
        return False
    return sys.stdout.isatty()
|
|
|
|
|
|
# Escape sequences used by the renderer; every one collapses to "" when
# colour is not supported so call sites can interpolate unconditionally.
_ANSI_GREEN, _ANSI_YELLOW, _ANSI_RED, _ANSI_BOLD, _ANSI_DIM, _ANSI_RESET = (
    ("\033[32m", "\033[33m", "\033[31m", "\033[1m", "\033[2m", "\033[0m")
    if _supports_color()
    else ("", "", "", "", "", "")
)
|
|
|
|
|
|
@dataclass
class Check:
    """A single row of the doctor checklist."""

    name: str  # row label shown after the status glyph
    status: str  # "ok" | "warn" | "fail" | "skip"
    detail: str = ""  # optional context, rendered dimmed after the name
    fix: str = ""  # remediation hint; render() shows it only for "fail" rows

    def render(self) -> str:
        """Return the human-readable text for this row.

        Raises:
            KeyError: if ``status`` is not one of the four known values.
        """
        palette = {
            "ok": (_ANSI_GREEN, "[✓]"),
            "warn": (_ANSI_YELLOW, "[!]"),
            "fail": (_ANSI_RED, "[✗]"),
            "skip": (_ANSI_DIM, "[-]"),
        }
        colour, mark = palette[self.status]
        pieces = [f"{colour}{mark}{_ANSI_RESET} {self.name}"]
        if self.detail:
            pieces.append(f" {_ANSI_DIM}{self.detail}{_ANSI_RESET}")
        # The fix goes on its own line, and only for rows that actually failed.
        if self.fix and self.status == "fail":
            pieces.append(f"\n {_ANSI_BOLD}fix:{_ANSI_RESET} {self.fix}")
        return "".join(pieces)
|
|
|
|
|
|
@dataclass
class Report:
    """Ordered collection of the checks produced during one doctor run."""

    role: str
    checks: list[Check] = field(default_factory=list)

    def add(self, c: Check) -> None:
        """Record *c* and, unless JSON mode is on, print it straight away."""
        self.checks.append(c)
        if _JSON_MODE:
            return
        # Echo each row as soon as it exists: if a later check hangs, the
        # operator still sees everything gathered so far.
        print(c.render(), flush=True)

    def to_dict(self) -> dict:
        """Return the report as plain dicts/lists suitable for ``json.dump``."""
        rows = [dataclasses.asdict(item) for item in self.checks]
        return {"role": self.role, "checks": rows, "summary": self.summary()}

    def summary(self) -> dict:
        """Return a status -> count mapping; the four known statuses always appear."""
        tally = {"ok": 0, "warn": 0, "fail": 0, "skip": 0}
        for item in self.checks:
            tally.setdefault(item.status, 0)
            tally[item.status] += 1
        return tally
|
|
|
|
|
|
# Set by main() from the --json flag. While true, Report.add() does not
# print rows as they arrive, so stdout carries only the final JSON document.
_JSON_MODE = False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _run(cmd: list[str], *, timeout: float = 5.0, cwd: str | None = None) -> tuple[int, str, str]:
    """Run *cmd* and return ``(returncode, stdout, stderr)`` without raising.

    Args:
        cmd: argv list, executed without a shell.
        timeout: seconds to wait before giving up on the child.
        cwd: working directory for the child, or None to inherit.

    Returns:
        The child's exit status with stdout/stderr stripped of surrounding
        whitespace. If the command could not be started or timed out, the
        result is ``(-1, "", <error text>)``.
    """
    try:
        p = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, cwd=cwd)
    except (OSError, subprocess.TimeoutExpired) as e:
        # OSError (not just FileNotFoundError) so that a non-executable
        # target, a permission-denied cwd, or a cwd that is not a directory
        # becomes a failed probe instead of a traceback.
        return -1, "", str(e)
    return p.returncode, p.stdout.strip(), p.stderr.strip()
|
|
|
|
|
|
def _path_exists(p: Path) -> bool:
    """Return whether *p* exists, counting a permission-denied probe as present."""
    try:
        found = p.exists()
    except PermissionError:
        # Being refused while probing is taken to mean the path is there
        # but unreadable for the current user.
        found = True
    return found
|
|
|
|
|
|
def _size_str(p: Path) -> str:
    """Return the size of *p* in whole MiB as text, or a hint if stat fails."""
    try:
        n_bytes = p.stat().st_size
    except OSError:  # PermissionError is a subclass, so it is covered too
        return "(stat denied — re-run with sudo for size)"
    mib = n_bytes // (1024 * 1024)
    return f"{mib} MiB"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — repo
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_repo(report: Report, repo_root: Path) -> None:
    """Report the git state of the checkout the doctor is running from.

    Adds rows for: presence of ``.git``, whether HEAD is on ``main``,
    whether the work tree is clean, and whether HEAD is behind its
    upstream. All rows here are advisory ("ok" or "warn"), never "fail".

    Args:
        report: collector that receives one Check per probe.
        repo_root: directory expected to contain the ``.git`` directory.
    """
    if not (repo_root / ".git").exists():
        report.add(Check(
            "repo: .git directory present",
            "warn",
            detail=f"running from {repo_root} which isn't a git checkout — fine for /opt/cis490 (cp -aT'd) but not the source clone",
        ))
        return

    def git(*args: str) -> tuple[int, str, str]:
        return _run(["git", "-C", str(repo_root), *args])

    rc_head, head, head_err = git("rev-parse", "--short=8", "HEAD")
    if rc_head != 0:
        # git is missing, timed out, or refused the repository. Without a
        # HEAD none of the rows below would be truthful, so say so once.
        report.add(Check(
            "repo: git metadata readable",
            "warn",
            detail=head_err or "git rev-parse HEAD failed",
        ))
        return

    _, branch, _ = git("rev-parse", "--abbrev-ref", "HEAD")
    rc_status, dirty, status_err = git("status", "--porcelain")
    _, log, _ = git("log", "-1", "--format=%s")

    detail = f"{branch}@{head}: {log[:60]}"
    if branch != "main":
        report.add(Check(
            "repo: on main",
            "warn",
            detail=detail,
            fix=f"cd {repo_root} && git fetch && git checkout main && git pull",
        ))
    else:
        report.add(Check("repo: on main", "ok", detail=detail))

    if rc_status != 0:
        # Empty output from a failed `git status` must not be read as "clean".
        report.add(Check(
            "repo: tree clean",
            "warn",
            detail=status_err or "git status failed",
        ))
    elif dirty:
        report.add(Check(
            "repo: tree clean",
            "warn",
            detail=f"{len(dirty.splitlines())} modified files",
        ))
    else:
        report.add(Check("repo: tree clean", "ok"))

    # No row is emitted when the count cannot be computed (rc != 0).
    rc_behind, behind, _ = git("rev-list", "--count", "HEAD..@{u}")
    if rc_behind == 0 and behind.isdigit() and int(behind) > 0:
        report.add(Check(
            "repo: up to date with origin",
            "warn",
            detail=f"{behind} commits behind",
            fix=f"cd {repo_root} && git pull",
        ))
    elif rc_behind == 0:
        report.add(Check("repo: up to date with origin", "ok"))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — install
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_install(report: Report, role: str) -> None:
    """Audit the on-disk install under /opt/cis490 and its /etc/cis490 config.

    Adds rows for the install root, the venv interpreter, the VERSION stamp
    (lab-host only), the role's TOML config, and lab-host.env (lab-host
    only). If the install root is missing, only that row is added.

    Args:
        report: collector that receives one Check per probe.
        role: "lab-host" or "receiver"; selects the install script named in
            fixes and the config file that is inspected.
    """
    install_root = Path("/opt/cis490")
    if not _path_exists(install_root):
        report.add(Check(
            "install: /opt/cis490 exists",
            "fail",
            fix=f"sudo $(pwd)/scripts/install-{role}.sh",
        ))
        return
    report.add(Check("install: /opt/cis490 exists", "ok"))

    venv_python = install_root / ".venv" / "bin" / "python"
    if _path_exists(venv_python):
        rc, ver, _ = _run([str(venv_python), "--version"])
        report.add(Check("install: venv python", "ok",
                         detail=ver if rc == 0 else "(unreadable)"))
    else:
        report.add(Check(
            "install: venv python",
            "fail",
            fix=f"sudo /opt/cis490/scripts/install-{role}.sh",
        ))

    if role == "lab-host":
        _check_version_stamp(report, install_root / "VERSION", role)

    cfg_name = "lab-host.toml" if role == "lab-host" else "receiver.toml"
    cfg = Path("/etc/cis490") / cfg_name
    if _path_exists(cfg):
        try:
            with open(cfg, "rb") as f:
                tomllib.load(f)
            report.add(Check(f"config: {cfg}", "ok", detail="parses"))
        except PermissionError:
            # Mode 0640 root:cis490 is the install default. Doctor often
            # runs as the unprivileged user — file is fine, we just
            # can't read it from here.
            report.add(Check(
                f"config: {cfg}",
                "warn",
                detail="exists, can't read (mode 0640 root:cis490 — re-run with sudo for full audit)",
            ))
        except tomllib.TOMLDecodeError as e:
            report.add(Check(
                f"config: {cfg}",
                "fail",
                detail=str(e),
                fix=f"sudo $EDITOR {cfg}",
            ))
    else:
        report.add(Check(
            f"config: {cfg}",
            "fail",
            fix=f"sudo cp /opt/cis490/etc/{cfg_name}.example {cfg}",
        ))

    if role == "lab-host":
        env = Path("/etc/cis490/lab-host.env")
        if _path_exists(env):
            report.add(Check("config: lab-host.env", "ok"))
        else:
            report.add(Check(
                "config: lab-host.env",
                "fail",
                fix="sudo /opt/cis490/scripts/install-lab-host.sh "
                    "# regenerates the env file",
            ))


def _check_version_stamp(report: Report, version_file: Path, role: str) -> None:
    """Add the "install: VERSION stamp" row for *version_file*.

    VERSION file — written by install-lab-host.sh on every successful
    run. Its absence means the install never finished step 3, so the
    orchestrator falls back to git rev-parse (or "unknown" if no .git/
    is here either). Stamping "unknown" gets every episode rejected
    by the receiver gate as bad-format → drained to quarantine/. The
    fix is the same git-pull-and-reinstall as for stale code.

    The row is "ok" only when the file holds a JSON object whose
    ``commit`` is a 40-character string; every other outcome is "fail".
    """
    label = "install: VERSION stamp"
    reinstall = f"sudo /opt/cis490/scripts/install-{role}.sh"

    if not _path_exists(version_file):
        report.add(Check(
            label,
            "fail",
            detail="missing — orchestrator will stamp 'unknown' and the "
                   "receiver gate will reject every PUT",
            fix=reinstall,
        ))
        return

    try:
        v = json.loads(version_file.read_text())
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for non-text bytes.
        report.add(Check(label, "fail", detail=f"unreadable: {e}", fix=reinstall))
        return

    if not isinstance(v, dict):
        # Valid JSON that is not an object (list, string, number, null)
        # has no fields to read; report it rather than crash on .get().
        report.add(Check(
            label,
            "fail",
            detail=f"malformed: expected a JSON object, got {type(v).__name__}",
            fix=reinstall,
        ))
        return

    commit = v.get("commit", "")
    branch = v.get("branch", "?")
    dirty = " [dirty]" if v.get("dirty") else ""
    if isinstance(commit, str) and len(commit) == 40:
        report.add(Check(label, "ok", detail=f"{branch}@{commit[:8]}{dirty}"))
    else:
        report.add(Check(
            label,
            "fail",
            detail=f"commit field malformed: {commit!r}",
            fix=reinstall,
        ))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — certs (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_certs_lab_host(report: Report) -> None:
    """Check that the lab host's mTLS files exist and the leaf verifies against the CA.

    Stops after the "missing" row when any of the three files is absent;
    otherwise runs ``openssl verify`` and reports the result.
    """
    base = Path("/etc/cis490/certs")
    ca_file = base / "wg-ca.pem"
    leaf_file = base / "lab-host.pem"

    missing = []
    for filename in ("wg-ca.pem", "lab-host.pem", "lab-host.key"):
        if not _path_exists(base / filename):
            missing.append(filename)
    if missing:
        report.add(Check(
            f"mTLS: certs at {base}",
            "fail",
            detail=f"missing: {missing}",
            fix="On the Pi: sudo /home/max/.env/wg-pki/scripts/"
                "deploy-cis490-cert.sh <host_id> <this-machine-wg-ip>",
        ))
        return

    # Verify the chain.
    rc, out, err = _run(["openssl", "verify", "-CAfile", str(ca_file), str(leaf_file)])
    chain_ok = rc == 0 and "OK" in out
    if not chain_ok:
        report.add(Check(
            "mTLS: cert chain validates",
            "fail",
            detail=err or out,
            fix="re-issue the leaf via wg-pki/scripts/deploy-cis490-cert.sh",
        ))
        return
    first_line = out.splitlines()[0]
    report.add(Check("mTLS: cert chain validates", "ok", detail=first_line))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — services
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_services(report: Report, role: str) -> None:
    """Add one row per systemd unit that the given role is expected to run.

    A unit passes only when ``systemctl is-active`` prints ``active``.
    ``inactive`` suggests enabling the unit; any other state (or no
    output at all) points the operator at the unit's journal.
    """
    if role == "receiver":
        units = ["cis490-receiver"]
    else:
        units = ["cis490-shipper", "cis490-orchestrator"]

    for unit in units:
        _, state, _ = _run(["systemctl", "is-active", unit])
        label = f"systemd: {unit} active"
        if state == "active":
            report.add(Check(label, "ok"))
            continue
        if state == "inactive":
            detail = "inactive"
            fix = f"sudo systemctl enable --now {unit}"
        else:
            detail = state or "unknown"
            fix = f"sudo journalctl -u {unit} --no-pager -n 30"
        report.add(Check(label, "fail", detail=detail, fix=fix))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — network (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_network_lab_host(report: Report, cfg_path: Path) -> None:
    """Probe the path from this lab host to the receiver named in its config.

    Runs, in order and stopping at the first failure: config load,
    ``receiver.url`` sanity, DNS resolution, TCP connect, and a TLS
    handshake that presents the configured client certificate.

    Args:
        report: collector that receives one Check per probe.
        cfg_path: path of the lab-host TOML config to read.
    """
    # Local import: the file's import header does not bring in urllib.
    from urllib.parse import urlsplit

    try:
        with open(cfg_path, "rb") as f:
            cfg = tomllib.load(f)
    except (FileNotFoundError, PermissionError, tomllib.TOMLDecodeError) as e:
        report.add(Check("net: lab-host.toml readable", "fail", detail=str(e)))
        return

    receiver_cfg = cfg.get("receiver", {})
    if not isinstance(receiver_cfg, dict):
        # `receiver = "..."` instead of a [receiver] table: treat as absent.
        receiver_cfg = {}
    receiver_url = receiver_cfg.get("url", "")
    url_fix = f"edit {cfg_path}: receiver.url = 'https://collector.wg'"

    if not (isinstance(receiver_url, str) and receiver_url.startswith("https://")):
        report.add(Check(
            "net: receiver.url present",
            "fail",
            detail=str(receiver_url),
            fix=url_fix,
        ))
        return

    # urlsplit handles userinfo, bracketed IPv6 literals and a missing
    # port, and raises ValueError for a port that is not a valid number.
    try:
        parts = urlsplit(receiver_url)
        host = parts.hostname
        port = parts.port if parts.port is not None else 443
    except ValueError as e:
        report.add(Check(
            "net: receiver.url present",
            "fail",
            detail=f"{receiver_url} ({e})",
            fix=url_fix,
        ))
        return
    if not host:
        report.add(Check(
            "net: receiver.url present",
            "fail",
            detail=f"{receiver_url} (no host)",
            fix=url_fix,
        ))
        return

    try:
        ip = socket.gethostbyname(host)
        report.add(Check(f"net: DNS resolve {host}", "ok",
                         detail=f"-> {ip}"))
    except socket.gaierror as e:
        report.add(Check(
            f"net: DNS resolve {host}",
            "fail",
            detail=str(e),
            fix=f"echo '10.100.0.1 {host}' | sudo tee -a /etc/hosts "
                "# wg-enroll provisions this on real lab hosts",
        ))
        return

    try:
        with socket.create_connection((host, port), timeout=5):
            report.add(Check(f"net: TCP {host}:{port} reachable", "ok"))
    except OSError as e:
        report.add(Check(
            f"net: TCP {host}:{port} reachable",
            "fail",
            detail=str(e),
            fix="check iptmonads is allowing the WG-side 443 + Caddy is up",
        ))
        return

    # mTLS handshake — pull the receiver cert paths from cfg.
    ca = receiver_cfg.get("ca_bundle")
    cert = receiver_cfg.get("client_cert")
    key = receiver_cfg.get("client_key")
    if not (ca and cert and key):
        report.add(Check("net: mTLS handshake to collector.wg",
                         "skip", detail="cert paths not in config"))
        return
    try:
        # os.access() returns False instead of raising when the directory
        # is off-limits, so an unreadable CA path cannot be misreported as
        # a handshake failure.
        caddy_root = "/home/max/wg-pki/certs/caddy-root.crt"
        ctx = ssl.create_default_context(
            cafile=caddy_root if os.access(caddy_root, os.R_OK) else None
        )
        ctx.load_cert_chain(certfile=cert, keyfile=key)
        # NOTE(review): server verification is switched off below, so the
        # CA file loaded above and the configured `ca_bundle` do not affect
        # the outcome — this row only shows that a handshake presenting
        # our client cert completes. Confirm that is the intent.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        with socket.create_connection((host, port), timeout=5) as sock:
            with ctx.wrap_socket(sock, server_hostname=host) as ssock:
                cipher = ssock.cipher()  # may be None per the ssl docs
                report.add(Check("net: mTLS handshake to collector.wg",
                                 "ok",
                                 detail=f"cipher={cipher[0] if cipher else '?'}"))
    except OSError as e:  # ssl.SSLError and FileNotFoundError are OSErrors
        report.add(Check(
            "net: mTLS handshake to collector.wg",
            "fail",
            detail=str(e),
            fix="sudo /home/max/wg-pki/scripts/deploy-cis490-cert.sh <host_id> <wg_ip> "
                "(rerun cert deploy)",
        ))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — VM prereqs (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_vm_prereqs(report: Report) -> None:
    """Check what is needed to boot the baseline VM: KVM, required binaries, disk images.

    Every missing item is a "fail" row carrying its own fix hint.
    """
    kvm_label = "vm: /dev/kvm"
    if _path_exists(Path("/dev/kvm")):
        report.add(Check(kvm_label, "ok"))
    else:
        report.add(Check(
            kvm_label,
            "fail",
            fix="ensure KVM kernel module is loaded; on x86 hosts: sudo modprobe kvm-intel || sudo modprobe kvm-amd",
        ))

    # (binary, label when found, label when missing, fix hint).
    # NOTE(review): the zstd row uses a different label for ok vs. fail;
    # kept as-is so the emitted names do not change.
    binaries = (
        (
            "qemu-system-x86_64",
            "vm: qemu-system-x86_64 on PATH",
            "vm: qemu-system-x86_64 on PATH",
            "install qemu-system-x86 via the host package manager",
        ),
        (
            "zstd",
            "vm: zstd on PATH",
            "vm: zstd on PATH (shipper compression)",
            "install zstd via the host package manager",
        ),
    )
    for binary, found_label, missing_label, hint in binaries:
        if shutil.which(binary) is not None:
            report.add(Check(found_label, "ok"))
        else:
            report.add(Check(missing_label, "fail", fix=hint))

    image_dir = Path("/var/lib/cis490/vm/images")
    alpine = image_dir / "alpine-baseline.qcow2"
    cidata = image_dir / "cidata.iso"
    images = (
        (alpine, f"sudo /opt/cis490/scripts/fetch-alpine-baseline.sh {alpine}"),
        (cidata, f"sudo /opt/cis490/.venv/bin/python /opt/cis490/tools/build_cidata.py {cidata}"),
    )
    for image, hint in images:
        label = f"vm: {image}"
        if _path_exists(image):
            report.add(Check(label, "ok", detail=_size_str(image)))
        else:
            report.add(Check(label, "fail", fix=hint))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — Tier 3 (optional)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_tier3(report: Report) -> None:
    """Check the optional Tier-3 prerequisites.

    Missing pieces are "warn" rows because Tier 3 is optional. The one
    "fail" is a module catalog directory that exists but cannot be loaded.
    """
    if shutil.which("msfrpcd") is not None:
        report.add(Check("tier3: msfrpcd on PATH", "ok"))
    else:
        report.add(Check(
            "tier3: msfrpcd on PATH",
            "warn",
            detail="optional — only needed for real exploit episodes",
            fix="sudo /opt/cis490/scripts/install-msfrpcd.sh",
        ))

    # Probe whether msfrpcd is actually listening (tier-3 fleet
    # dispatch checks the same thing).
    listening = False
    try:
        with socket.create_connection(("127.0.0.1", 55553), timeout=0.5):
            listening = True
    except OSError:
        pass
    listen_label = "tier3: msfrpcd listening on 127.0.0.1:55553"
    if not listening:
        report.add(Check(
            listen_label,
            "warn",
            detail="optional — fleet falls back to Tier 2 when down",
            fix="sudo systemctl enable --now cis490-msfrpcd",
        ))
    else:
        report.add(Check(listen_label, "ok"))

    # Module catalog parses + at least one same-socket entry.
    modules_dir = Path("/opt/cis490/exploits/modules")
    if modules_dir.exists():
        try:
            # Imported here so the project package is only needed when
            # the catalog directory is actually present.
            from exploits.modules import load_module_configs as _load
            catalog = _load(modules_dir)
            total = len(catalog)
            direct = sum(1 for entry in catalog.values() if not entry.requires_bridge)
            report.add(Check(
                "tier3: module catalog parses",
                "ok",
                detail=f"{total} modules, {direct} same-socket ({total - direct} need BRIDGE)",
            ))
        except Exception as e:
            report.add(Check(
                "tier3: module catalog parses",
                "fail",
                detail=str(e),
                fix="check exploits/modules/*.toml syntax",
            ))

    msf2 = Path("/var/lib/cis490/vm/images") / "metasploitable2.qcow2"
    msf2_label = f"tier3: {msf2}"
    if _path_exists(msf2):
        report.add(Check(msf2_label, "ok", detail=_size_str(msf2)))
    else:
        report.add(Check(
            msf2_label,
            "warn",
            detail="optional — needed for Tier-3 episodes",
            fix="IMAGE_URL=… IMAGE_SHA256=… sudo /opt/cis490/scripts/fetch-metasploitable2.sh",
        ))
|
|
|
|
|
|
def check_bridge(report: Report) -> None:
    """Check the ``br-malware`` bridge used for pcap capture and callback payloads.

    Both pcap (source 4) and reverse/bind callback payloads need the
    bridge; without it, Tier-3 episodes that pick callback modules will
    fire but the session never lands. The bridge is optional, so every
    problem here is a "warn".
    """
    rc, out, _ = _run(["ip", "-br", "link", "show", "br-malware"])
    if rc != 0 or "br-malware" not in out:
        report.add(Check(
            "bridge: br-malware exists",
            "warn",
            detail="optional — pcap capture + callback-payload Tier-3 "
                   "modules require it",
            fix="sudo /opt/cis490/vm/setup_bridge.sh",
        ))
        return

    brief = out.strip()[:80]
    if any(marker in out for marker in ("UP", "UNKNOWN")):
        report.add(Check("bridge: br-malware up", "ok", detail=brief))
    else:
        report.add(Check(
            "bridge: br-malware up",
            "warn",
            detail=brief,
            fix="sudo ip link set br-malware up",
        ))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# checks — end to end (lab-host)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_end_to_end(report: Report) -> None:
    """Run the installed shipper's ``--ping`` and report whether it succeeded.

    Skipped when there is no lab-host.toml. Passes only if the command
    exits 0 and its stdout contains ``"ok": true``.
    """
    label = "e2e: cis490-shipper --ping"
    cfg = "/etc/cis490/lab-host.toml"
    if not _path_exists(Path(cfg)):
        report.add(Check(label, "skip", detail="no lab-host.toml"))
        return

    ping_cmd = [
        "/opt/cis490/.venv/bin/python", "-m", "shipper",
        "--config", cfg, "--ping",
    ]
    rc, out, err = _run(ping_cmd, timeout=15.0, cwd="/opt/cis490")
    succeeded = rc == 0 and '"ok": true' in out
    if succeeded:
        report.add(Check(label, "ok", detail="200 OK"))
        return
    report.add(Check(
        label,
        "fail",
        detail=(out or err)[:200],
        fix="paste this row's detail into a Forgejo issue or to the operator",
    ))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Parse arguments, run the checks for the chosen role, and print the result.

    Args:
        argv: argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        1 if any check has status "fail", otherwise 0.
    """
    global _JSON_MODE
    parser = argparse.ArgumentParser(prog="cis490-doctor")
    parser.add_argument("--role", choices=("lab-host", "receiver"), default="lab-host")
    parser.add_argument("--json", action="store_true",
                        help="machine-readable output (suppresses progressive printing)")
    parser.add_argument("--no-tier3", action="store_true",
                        help="skip the optional Tier-3 prerequisite checks")
    opts = parser.parse_args(argv)
    _JSON_MODE = opts.json

    # Put the repo root on sys.path so check_tier3's `exploits.modules`
    # import can resolve.
    repo_root = Path(__file__).resolve().parent.parent
    root_entry = str(repo_root)
    if root_entry not in sys.path:
        sys.path.insert(0, root_entry)
    if not _JSON_MODE:
        print(f"{_ANSI_BOLD}cis490-doctor{_ANSI_RESET} role={opts.role} repo={repo_root}\n")

    report = Report(role=opts.role)
    on_lab_host = opts.role == "lab-host"

    check_repo(report, repo_root)
    check_install(report, opts.role)
    if on_lab_host:
        check_certs_lab_host(report)
    check_services(report, opts.role)
    if on_lab_host:
        check_network_lab_host(report, Path("/etc/cis490/lab-host.toml"))
        check_vm_prereqs(report)
        check_bridge(report)
        if not opts.no_tier3:
            check_tier3(report)
        check_end_to_end(report)

    summary = report.summary()
    exit_code = 1 if summary["fail"] else 0

    if _JSON_MODE:
        json.dump(report.to_dict(), sys.stdout, indent=2)
        print()
        return exit_code

    print()
    print(f"{_ANSI_BOLD}summary:{_ANSI_RESET} "
          f"{_ANSI_GREEN}{summary['ok']} ok{_ANSI_RESET}, "
          f"{_ANSI_YELLOW}{summary['warn']} warn{_ANSI_RESET}, "
          f"{_ANSI_RED}{summary['fail']} fail{_ANSI_RESET}, "
          f"{_ANSI_DIM}{summary['skip']} skip{_ANSI_RESET}")
    if summary["fail"]:
        verdict = (
            f"\n{_ANSI_BOLD}{_ANSI_RED}NOT READY.{_ANSI_RESET} "
            "Run the `fix:` commands above in order, then re-run "
            "`cis490-doctor`. When all rows are green/yellow, "
            "episodes will start shipping to the Pi."
        )
    else:
        verdict = (
            f"\n{_ANSI_BOLD}{_ANSI_GREEN}READY.{_ANSI_RESET} "
            "Episodes should be flowing. Watch:\n"
            " sudo journalctl -u cis490-shipper -f\n"
            " ssh <pi> 'sudo tail -f /var/lib/cis490/index.jsonl'"
        )
    print(verdict)
    return exit_code
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|