# CIS490/tools/cis490_doctor.py

"""``cis490-doctor`` — single-command diagnostic for a lab host or receiver.

Walks the full bring-up stack from the bottom up and prints a
green/yellow/red checklist with the exact command that fixes each
red row. Run this whenever:

  - you just cloned the repo and aren't sure what's missing
  - you ran install-lab-host.sh but `index.jsonl` on the Pi is empty
  - somebody filed an issue saying "shipping isn't working"

Usage:
  uv run python tools/cis490_doctor.py            # human output
  uv run python tools/cis490_doctor.py --json     # machine-readable
  uv run python tools/cis490_doctor.py --role lab-host    # default
  uv run python tools/cis490_doctor.py --role receiver

Exits non-zero if any RED check fails.
"""

from __future__ import annotations

import argparse
import dataclasses
import json
import os
import shutil
import socket
import ssl
import subprocess
import sys
import tomllib
from dataclasses import dataclass, field
from pathlib import Path
from urllib.parse import urlsplit


# ANSI color codes; auto-disable on non-tty.
def _supports_color() -> bool:
    """Return whether ANSI colour escapes should be emitted.

    Colour is used only when stdout is a terminal and the ``NO_COLOR``
    environment variable is absent (any value, even empty, disables it).
    """
    if not sys.stdout.isatty():
        return False
    return "NO_COLOR" not in os.environ


# Decide once whether colour is on; every escape collapses to "" when it
# is off, so call sites can interpolate these unconditionally.
(
    _ANSI_GREEN,
    _ANSI_YELLOW,
    _ANSI_RED,
    _ANSI_BOLD,
    _ANSI_DIM,
    _ANSI_RESET,
) = (
    ("\033[32m", "\033[33m", "\033[31m", "\033[1m", "\033[2m", "\033[0m")
    if _supports_color()
    else ("",) * 6
)


@dataclass
class Check:
    name: str
    status: str  # "ok" | "warn" | "fail" | "skip"
    detail: str = ""
    fix: str = ""

    def render(self) -> str:
        glyph = {
            "ok":   f"{_ANSI_GREEN}[✓]{_ANSI_RESET}",
            "warn": f"{_ANSI_YELLOW}[!]{_ANSI_RESET}",
            "fail": f"{_ANSI_RED}[✗]{_ANSI_RESET}",
            "skip": f"{_ANSI_DIM}[-]{_ANSI_RESET}",
        }[self.status]
        line = f"{glyph} {self.name}"
        if self.detail:
            line += f"  {_ANSI_DIM}{self.detail}{_ANSI_RESET}"
        if self.status == "fail" and self.fix:
            line += f"\n     {_ANSI_BOLD}fix:{_ANSI_RESET} {self.fix}"
        return line


@dataclass
class Report:
    """Ordered collection of ``Check`` rows gathered during one doctor run."""

    role: str
    checks: list[Check] = field(default_factory=list)

    def add(self, c: Check) -> None:
        """Record ``c`` and, outside JSON mode, print it straight away."""
        self.checks.append(c)
        if _JSON_MODE:
            return
        # Emit each row as soon as it is known, so a later check that
        # hangs still leaves the operator with partial output.
        print(c.render(), flush=True)

    def to_dict(self) -> dict:
        """Return the JSON-serialisable form: role, rows and per-status tallies."""
        rows = list(map(dataclasses.asdict, self.checks))
        return {"role": self.role, "checks": rows, "summary": self.summary()}

    def summary(self) -> dict:
        """Count rows per status; the four standard keys are always present."""
        tally = dict.fromkeys(("ok", "warn", "fail", "skip"), 0)
        for row in self.checks:
            tally[row.status] = tally.get(row.status, 0) + 1
        return tally


# Set by main() from --json; while true, Report.add records rows without
# printing them.
_JSON_MODE = False


# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------


def _run(cmd: list[str], *, timeout: float = 5.0) -> tuple[int, str, str]:
    """Run ``cmd`` and return ``(returncode, stdout, stderr)``, both stripped.

    Never raises for a command that cannot be run: a missing binary, a
    path that is not executable by this user, any other launch failure,
    or a timeout all yield ``(-1, "", <error text>)`` so the caller can
    turn the problem into a checklist row instead of a traceback.
    """
    try:
        p = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Undecodable bytes in tool output must not abort the doctor.
            errors="replace",
            timeout=timeout,
        )
    except (OSError, subprocess.TimeoutExpired) as e:
        # OSError covers FileNotFoundError as well as PermissionError
        # (e.g. the path exists but is a directory or lacks +x).
        return -1, "", str(e)
    return p.returncode, p.stdout.strip(), p.stderr.strip()


def _path_exists(p: Path) -> bool:
    """Report whether ``p`` exists, counting "permission denied" as present."""
    try:
        found = p.exists()
    except PermissionError:
        # Unreadable-but-present is reported as present.
        found = True
    return found


def _size_str(p: Path) -> str:
    """Return the size of ``p`` in whole MiB, or a hint when it can't be stat'ed."""
    try:
        size_bytes = p.stat().st_size
    except OSError:  # PermissionError is an OSError, so this covers both
        return "(stat denied — re-run with sudo for size)"
    return f"{size_bytes // (1024*1024)} MiB"


# ---------------------------------------------------------------------------
# checks — repo
# ---------------------------------------------------------------------------


def check_repo(report: Report, repo_root: Path) -> None:
    """Report the git state of the checkout at ``repo_root``.

    Adds advisory rows ("ok" or "warn", never "fail") for: being a git
    checkout at all, being on ``main``, having a clean working tree, and
    not being behind the upstream branch. The last row is omitted when
    the behind-count query fails (for example, no upstream configured).
    """
    if not (repo_root / ".git").exists():
        report.add(Check(
            "repo: .git directory present",
            "warn",
            detail=f"running from {repo_root} which isn't a git checkout — fine for /opt/cis490 (cp -aT'd) but not the source clone",
        ))
        return

    def git(*args: str) -> tuple[int, str, str]:
        return _run(["git", "-C", str(repo_root), *args])

    rc_head, head, err_head = git("rev-parse", "--short=8", "HEAD")
    rc_branch, branch, err_branch = git("rev-parse", "--abbrev-ref", "HEAD")
    if rc_head != 0 or rc_branch != 0:
        # git is missing, timed out, or rejected the checkout. Say so
        # rather than printing an empty "@:" detail and then claiming
        # the tree is clean on the strength of empty output.
        report.add(Check(
            "repo: on main",
            "warn",
            detail=f"git query failed: {err_head or err_branch or 'no output'}",
        ))
        return

    _, subject, _ = git("log", "-1", "--format=%s")
    detail = f"{branch}@{head}: {subject[:60]}"
    if branch != "main":
        report.add(Check(
            "repo: on main",
            "warn",
            detail=detail,
            fix=f"cd {repo_root} && git fetch && git checkout main && git pull",
        ))
    else:
        report.add(Check("repo: on main", "ok", detail=detail))

    rc_status, dirty, err_status = git("status", "--porcelain")
    if rc_status != 0:
        # Empty output from a failed command is not evidence of a clean tree.
        report.add(Check(
            "repo: tree clean",
            "warn",
            detail=f"git status failed: {err_status or 'no output'}",
        ))
    elif dirty:
        report.add(Check(
            "repo: tree clean",
            "warn",
            detail=f"{len(dirty.splitlines())} modified files",
        ))
    else:
        report.add(Check("repo: tree clean", "ok"))

    # Compares against the local upstream-tracking ref; no fetch is done here.
    rc_behind, behind, _ = git("rev-list", "--count", "HEAD..@{u}")
    if rc_behind == 0 and behind.isdigit() and int(behind) > 0:
        report.add(Check(
            "repo: up to date with origin",
            "warn",
            detail=f"{behind} commits behind",
            fix=f"cd {repo_root} && git pull",
        ))
    elif rc_behind == 0:
        report.add(Check("repo: up to date with origin", "ok"))


# ---------------------------------------------------------------------------
# checks — install
# ---------------------------------------------------------------------------


def check_install(report: Report, role: str) -> None:
    """Check the installed tree under /opt/cis490 and the config in /etc/cis490.

    Rows, in order: the install root exists (returns early when it does
    not), the venv python is present, the role's TOML config exists and
    parses, and — for ``lab-host`` only — ``lab-host.env`` exists.
    """
    install_root = Path("/opt/cis490")
    if not _path_exists(install_root):
        report.add(Check(
            "install: /opt/cis490 exists",
            "fail",
            fix=f"sudo $(pwd)/scripts/install-{role}.sh",
        ))
        return
    report.add(Check("install: /opt/cis490 exists", "ok"))

    venv_python = install_root / ".venv" / "bin" / "python"
    if _path_exists(venv_python):
        rc, ver, _ = _run([str(venv_python), "--version"])
        report.add(Check("install: venv python", "ok",
                         detail=ver if rc == 0 else "(unreadable)"))
    else:
        report.add(Check(
            "install: venv python",
            "fail",
            fix=f"sudo /opt/cis490/scripts/install-{role}.sh",
        ))

    cfg_name = "lab-host.toml" if role == "lab-host" else "receiver.toml"
    cfg = Path("/etc/cis490") / cfg_name
    if not _path_exists(cfg):
        report.add(Check(
            f"config: {cfg}",
            "fail",
            fix=f"sudo cp /opt/cis490/etc/{cfg_name}.example {cfg}",
        ))
    else:
        try:
            with open(cfg, "rb") as f:
                tomllib.load(f)
        except PermissionError:
            # Mode 0640 root:cis490 is the install default. Doctor often
            # runs as the unprivileged user — file is fine, we just
            # can't read it from here.
            report.add(Check(
                f"config: {cfg}",
                "warn",
                detail="exists, can't read (mode 0640 root:cis490 — re-run with sudo for full audit)",
            ))
        except (tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
            # tomllib decodes the bytes as UTF-8 before parsing, so a
            # mis-encoded file raises UnicodeDecodeError, not TOMLDecodeError.
            report.add(Check(
                f"config: {cfg}",
                "fail",
                detail=str(e),
                fix=f"sudo $EDITOR {cfg}",
            ))
        except OSError as e:
            # Anything else that stops the read (a directory at that path,
            # the file vanishing between the existence test and open, ...)
            # becomes a red row instead of aborting the remaining checks.
            report.add(Check(f"config: {cfg}", "fail", detail=str(e)))
        else:
            report.add(Check(f"config: {cfg}", "ok", detail="parses"))

    if role == "lab-host":
        env = Path("/etc/cis490/lab-host.env")
        if _path_exists(env):
            report.add(Check("config: lab-host.env", "ok"))
        else:
            report.add(Check(
                "config: lab-host.env",
                "fail",
                fix="sudo /opt/cis490/scripts/install-lab-host.sh   "
                    "# regenerates the env file",
            ))


# ---------------------------------------------------------------------------
# checks — certs (lab-host)
# ---------------------------------------------------------------------------


def check_certs_lab_host(report: Report) -> None:
    """Check that the lab-host mTLS material exists and the leaf chains to the CA.

    Looks for ``wg-ca.pem``, ``lab-host.pem`` and ``lab-host.key`` under
    /etc/cis490/certs, then runs ``openssl verify`` on the leaf against
    the CA file. Stops after the first red row.
    """
    base = Path("/etc/cis490/certs")
    expected = ["wg-ca.pem", "lab-host.pem", "lab-host.key"]
    missing = [n for n in expected if not _path_exists(base / n)]
    if missing:
        report.add(Check(
            f"mTLS: certs at {base}",
            "fail",
            detail=f"missing: {missing}",
            fix="On the Pi: sudo /home/max/.env/wg-pki/scripts/"
                "deploy-cis490-cert.sh <host_id> <this-machine-wg-ip>",
        ))
        return

    if shutil.which("openssl") is None:
        # Without the binary the verification cannot run at all; telling
        # the operator to re-issue the certificate would be a wrong lead.
        report.add(Check(
            "mTLS: cert chain validates",
            "fail",
            detail="openssl not found on PATH",
            fix="install openssl via the host package manager",
        ))
        return

    # Verify the chain.
    rc, out, err = _run([
        "openssl", "verify",
        "-CAfile", str(base / "wg-ca.pem"),
        str(base / "lab-host.pem"),
    ])
    if rc == 0 and "OK" in out:
        report.add(Check("mTLS: cert chain validates", "ok",
                         detail=out.splitlines()[0]))
    else:
        report.add(Check(
            "mTLS: cert chain validates",
            "fail",
            detail=err or out,
            fix="re-issue the leaf via wg-pki/scripts/deploy-cis490-cert.sh",
        ))


# ---------------------------------------------------------------------------
# checks — services
# ---------------------------------------------------------------------------


def check_services(report: Report, role: str) -> None:
    """Add one row per systemd unit the role is expected to be running.

    The ``receiver`` role expects ``cis490-receiver``; any other role
    expects ``cis490-shipper`` and ``cis490-orchestrator``. A unit passes
    only when ``systemctl is-active`` prints exactly ``active``.
    """
    if role == "receiver":
        units = ["cis490-receiver"]
    else:
        units = ["cis490-shipper", "cis490-orchestrator"]

    for unit in units:
        _, state, _ = _run(["systemctl", "is-active", unit])
        title = f"systemd: {unit} active"
        if state == "active":
            report.add(Check(title, "ok"))
            continue
        if state == "inactive":
            # Simply not started: enabling it is the whole fix.
            detail = "inactive"
            fix = f"sudo systemctl enable --now {unit}"
        else:
            # Any other state (or no output at all): point at the logs.
            detail = state or "unknown"
            fix = f"sudo journalctl -u {unit} --no-pager -n 30"
        report.add(Check(title, "fail", detail=detail, fix=fix))


# ---------------------------------------------------------------------------
# checks — network (lab-host)
# ---------------------------------------------------------------------------


def _receiver_endpoint(url: object) -> tuple[str, int] | None:
    """Return ``(host, port)`` for an ``https://`` URL, or None if unusable."""
    if not isinstance(url, str) or not url.startswith("https://"):
        return None
    try:
        parts = urlsplit(url)
        host, port = parts.hostname, parts.port
    except ValueError:
        # Non-numeric or out-of-range port, or a malformed IPv6 literal.
        return None
    if not host:
        return None
    return host, (443 if port is None else port)


def check_network_lab_host(report: Report, cfg_path: Path) -> None:
    """Probe the network path from this lab host to the configured receiver.

    Reads ``receiver.url`` from the TOML file at ``cfg_path`` and then
    checks, stopping at the first red row: the URL is a usable
    ``https://`` URL, the host resolves, the TCP port accepts a
    connection, and a TLS handshake succeeds while presenting the
    configured client certificate. The handshake row is skipped when the
    certificate paths are not all present in the config.

    The host name shown in the rows is the lower-cased form that
    ``urlsplit`` returns.
    """
    try:
        with open(cfg_path, "rb") as f:
            cfg = tomllib.load(f)
    except (OSError, tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
        report.add(Check("net: lab-host.toml readable", "fail", detail=str(e)))
        return

    receiver = cfg.get("receiver", {})
    if not isinstance(receiver, dict):
        # e.g. `receiver = "..."` instead of a [receiver] table.
        receiver = {}
    receiver_url = receiver.get("url", "")
    endpoint = _receiver_endpoint(receiver_url)
    if endpoint is None:
        report.add(Check(
            "net: receiver.url present",
            "fail",
            detail=str(receiver_url),
            fix=f"edit {cfg_path}: receiver.url = 'https://collector.wg'",
        ))
        return
    host, port = endpoint

    try:
        ip = socket.gethostbyname(host)
    except (OSError, UnicodeError) as e:
        # OSError covers socket.gaierror; UnicodeError is what the IDNA
        # encoding step raises for labels it cannot encode.
        report.add(Check(
            f"net: DNS resolve {host}",
            "fail",
            detail=str(e),
            fix=f"echo '10.100.0.1 {host}' | sudo tee -a /etc/hosts   "
                "# wg-enroll provisions this on real lab hosts",
        ))
        return
    report.add(Check(f"net: DNS resolve {host}", "ok", detail=f"-> {ip}"))

    try:
        with socket.create_connection((host, port), timeout=5):
            pass
    except OSError as e:
        report.add(Check(
            f"net: TCP {host}:{port} reachable",
            "fail",
            detail=str(e),
            fix="check iptmonads is allowing the WG-side 443 + Caddy is up",
        ))
        return
    report.add(Check(f"net: TCP {host}:{port} reachable", "ok"))

    # mTLS handshake — pull the receiver cert paths from cfg.
    ca = receiver.get("ca_bundle")
    cert = receiver.get("client_cert")
    key = receiver.get("client_key")
    if not (ca and cert and key):
        report.add(Check("net: mTLS handshake to collector.wg",
                         "skip", detail="cert paths not in config"))
        return
    try:
        # Server verification is switched off below, so no particular CA
        # file is loaded: this row only shows that the client cert/key
        # load and that a TLS session can be established.
        # NOTE(review): `ca_bundle` gates this check but is not used to
        # verify the receiver's certificate — confirm that is intended.
        ctx = ssl.create_default_context()
        ctx.load_cert_chain(certfile=cert, keyfile=key)
        # check_hostname must be cleared before verify_mode can be CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        with socket.create_connection((host, port), timeout=5) as sock:
            with ctx.wrap_socket(sock, server_hostname=host) as ssock:
                cipher = ssock.cipher()
    except OSError as e:
        # ssl.SSLError and FileNotFoundError are both OSError subclasses.
        report.add(Check(
            "net: mTLS handshake to collector.wg",
            "fail",
            detail=str(e),
            fix="sudo /home/max/wg-pki/scripts/deploy-cis490-cert.sh <host_id> <wg_ip>   "
                "(rerun cert deploy)",
        ))
        return
    # NOTE(review): under TLS 1.3 the server may reject a client
    # certificate only after the handshake has returned, so a green row
    # here is not proof the receiver accepted the cert — presumably the
    # later e2e ping covers that; confirm.
    report.add(Check("net: mTLS handshake to collector.wg",
                     "ok",
                     detail=f"cipher={cipher[0] if cipher else 'unknown'}"))


# ---------------------------------------------------------------------------
# checks — VM prereqs (lab-host)
# ---------------------------------------------------------------------------


def check_vm_prereqs(report: Report) -> None:
    """Check the host-side prerequisites for running the lab VM.

    Rows: ``/dev/kvm`` exists, ``qemu-system-x86_64`` and ``zstd`` are on
    PATH, and the Alpine baseline image and ``cidata.iso`` exist under
    /var/lib/cis490/vm/images. Every row is evaluated; none short-circuits
    the others. Each row keeps the same name whether it passes or fails.
    """
    if _path_exists(Path("/dev/kvm")):
        report.add(Check("vm: /dev/kvm", "ok"))
    else:
        report.add(Check(
            "vm: /dev/kvm",
            "fail",
            fix="ensure KVM kernel module is loaded; on x86 hosts: sudo modprobe kvm-intel || sudo modprobe kvm-amd",
        ))

    # (binary, note shown when it is missing, fix hint)
    binaries = (
        ("qemu-system-x86_64", "",
         "install qemu-system-x86 via the host package manager"),
        ("zstd", "shipper compression",
         "install zstd via the host package manager"),
    )
    for binary, note, fix in binaries:
        row = f"vm: {binary} on PATH"
        if shutil.which(binary) is None:
            report.add(Check(row, "fail", detail=note, fix=fix))
        else:
            report.add(Check(row, "ok"))

    images = Path("/var/lib/cis490/vm/images")
    alpine = images / "alpine-baseline.qcow2"
    cidata = images / "cidata.iso"
    artifacts = (
        (alpine, f"sudo /opt/cis490/scripts/fetch-alpine-baseline.sh {alpine}"),
        (cidata, f"sudo /opt/cis490/.venv/bin/python /opt/cis490/tools/build_cidata.py {cidata}"),
    )
    for image, fix in artifacts:
        if _path_exists(image):
            report.add(Check(f"vm: {image}", "ok", detail=_size_str(image)))
        else:
            report.add(Check(f"vm: {image}", "fail", fix=fix))


# ---------------------------------------------------------------------------
# checks — Tier 3 (optional)
# ---------------------------------------------------------------------------


def check_tier3(report: Report) -> None:
    """Add advisory rows for the optional Tier-3 prerequisites.

    Checks for ``msfrpcd`` on PATH and for the Metasploitable2 image
    under /var/lib/cis490/vm/images. A missing item is reported as
    "warn", never "fail".
    """
    have_msfrpcd = shutil.which("msfrpcd") is not None
    if have_msfrpcd:
        report.add(Check("tier3: msfrpcd on PATH", "ok"))
    else:
        report.add(Check(
            "tier3: msfrpcd on PATH",
            "warn",
            detail="optional — only needed for real exploit episodes",
            fix="sudo /opt/cis490/scripts/install-msfrpcd.sh",
        ))

    msf2 = Path("/var/lib/cis490/vm/images") / "metasploitable2.qcow2"
    if not _path_exists(msf2):
        report.add(Check(
            f"tier3: {msf2}",
            "warn",
            detail="optional — needed for Tier-3 episodes",
            fix="IMAGE_URL=… IMAGE_SHA256=… sudo /opt/cis490/scripts/fetch-metasploitable2.sh",
        ))
        return
    report.add(Check(f"tier3: {msf2}", "ok", detail=_size_str(msf2)))


# ---------------------------------------------------------------------------
# checks — end to end (lab-host)
# ---------------------------------------------------------------------------


def check_end_to_end(report: Report) -> None:
    """Run the installed shipper's ``--ping`` and report the outcome.

    Skipped when /etc/cis490/lab-host.toml is absent. Otherwise runs the
    install venv's ``python -m shipper --config <cfg> --ping`` with a
    15-second timeout; the row is green only when the command exits 0 and
    its stdout contains the text ``"ok": true``.
    """
    cfg = "/etc/cis490/lab-host.toml"
    if not _path_exists(Path(cfg)):
        report.add(Check("e2e: cis490-shipper --ping", "skip",
                         detail="no lab-host.toml"))
        return

    ping_cmd = [
        "/opt/cis490/.venv/bin/python", "-m", "shipper",
        "--config", cfg, "--ping",
    ]
    rc, out, err = _run(ping_cmd, timeout=15.0)
    succeeded = rc == 0 and '"ok": true' in out
    if not succeeded:
        report.add(Check(
            "e2e: cis490-shipper --ping",
            "fail",
            # Prefer stdout, fall back to stderr; keep the row short.
            detail=(out or err)[:200],
            fix="paste this row's detail into a Forgejo issue or to the operator",
        ))
        return
    report.add(Check("e2e: cis490-shipper --ping", "ok",
                     detail="200 OK"))


# ---------------------------------------------------------------------------
# main
# ---------------------------------------------------------------------------


def main(argv: list[str] | None = None) -> int:
    """Parse arguments, run the checks for the chosen role, print the result.

    Returns 1 when at least one row is "fail", otherwise 0. With
    ``--json`` the only output is the serialised report; without it rows
    are printed as they are produced, followed by a summary and a
    READY / NOT READY verdict.
    """
    global _JSON_MODE
    parser = argparse.ArgumentParser(prog="cis490-doctor")
    parser.add_argument("--role", choices=("lab-host", "receiver"), default="lab-host")
    parser.add_argument("--json", action="store_true",
                        help="machine-readable output (suppresses progressive printing)")
    parser.add_argument("--no-tier3", action="store_true",
                        help="skip the optional Tier-3 prerequisite checks")
    opts = parser.parse_args(argv)
    _JSON_MODE = opts.json
    is_lab_host = opts.role == "lab-host"

    # This file lives in <repo>/tools/, so the repo root is two levels up.
    repo_root = Path(__file__).resolve().parent.parent
    if not _JSON_MODE:
        print(f"{_ANSI_BOLD}cis490-doctor{_ANSI_RESET} role={opts.role} repo={repo_root}\n")

    # Checks run bottom-up: repo, install, certs, services, then the
    # lab-host-only network / VM / Tier-3 / end-to-end probes.
    report = Report(role=opts.role)
    check_repo(report, repo_root)
    check_install(report, opts.role)
    if is_lab_host:
        check_certs_lab_host(report)
    check_services(report, opts.role)
    if is_lab_host:
        check_network_lab_host(report, Path("/etc/cis490/lab-host.toml"))
        check_vm_prereqs(report)
        if not opts.no_tier3:
            check_tier3(report)
        check_end_to_end(report)

    summary = report.summary()
    exit_code = 1 if summary["fail"] else 0

    if _JSON_MODE:
        json.dump(report.to_dict(), sys.stdout, indent=2)
        print()
        return exit_code

    print()
    print(f"{_ANSI_BOLD}summary:{_ANSI_RESET} "
          f"{_ANSI_GREEN}{summary['ok']} ok{_ANSI_RESET}, "
          f"{_ANSI_YELLOW}{summary['warn']} warn{_ANSI_RESET}, "
          f"{_ANSI_RED}{summary['fail']} fail{_ANSI_RESET}, "
          f"{_ANSI_DIM}{summary['skip']} skip{_ANSI_RESET}")
    if summary["fail"]:
        verdict = (
            f"\n{_ANSI_BOLD}{_ANSI_RED}NOT READY.{_ANSI_RESET} "
            "Run the `fix:` commands above in order, then re-run "
            "`cis490-doctor`. When all rows are green/yellow, "
            "episodes will start shipping to the Pi."
        )
    else:
        verdict = (
            f"\n{_ANSI_BOLD}{_ANSI_GREEN}READY.{_ANSI_RESET} "
            "Episodes should be flowing. Watch:\n"
            "  sudo journalctl -u cis490-shipper -f\n"
            "  ssh <pi> 'sudo tail -f /var/lib/cis490/index.jsonl'"
        )
    print(verdict)
    return exit_code


if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())