CIS490/receiver/app.py
max 49eba2fd60 fleet-health: proactive alerts on the Pi + per-host doctor reports
Two pieces of self-monitoring so the maintainer isn't the alarm:

(2) Receiver-side fleet health monitor

cis490-fleet-health.timer runs check_fleet_health.py every 5 min.
Detects three symptoms and writes them to
/var/lib/cis490/alerts.jsonl + a syslog WARNING (greppable / easy
to forward to a notifier):

  silent      — host shipped in last 24h but has been quiet >30 min
  fatal-only  — actively shipping but every PUT returns 4xx
  unstamped   — shipping without X-Cis490-Code-Commit header

Dedup is keyed on (host, symptom, hour-bucket) so a sustained fault
fires once per hour, not every 5 min. 15 unit tests cover the index
parser, three detectors, and dedup.

(3) Per-host doctor snapshots

Lab hosts run cis490-doctor-check.timer once a day (10 min after
boot, then daily with 30-min jitter). The timer runs
cis490_doctor.py --json and PUTs the result to a new endpoint:

  PUT /v1/host-health/<host>   →  /var/lib/cis490/host-health/<host>.json
  GET /v1/host-health          →  aggregate across all hosts

Endpoint is NOT gated by version_gate — sick hosts running stale
code MUST still be able to report sickness. 11 unit tests cover
PUT/GET, atomic-write semantics, bearer auth, and the
not-gated-by-version-gate property.

ship_health_check.py reuses the existing shipper transport (mTLS +
bearer + receiver URL from lab-host.toml) so we don't reimplement
auth.

Both timers wired into install-lab-host.sh — the loop also enables
the previously-added autoupdate + cert-fetch timers, so a single
install run gives a host all four self-healing mechanisms.

Tests: 293 pass (26 new — 15 fleet-health, 11 host-health). 2
pre-existing test_fleet.py failures from the elliott-ThinkPad
merge (667f042) are unrelated to this change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 13:48:31 -05:00

324 lines
13 KiB
Python

from __future__ import annotations
import json
import logging
import secrets
import time
from pathlib import Path
from typing import Awaitable, Callable
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import JSONResponse, Response
from starlette.routing import Route
from .store import EpisodeStore, is_valid_id
from .version_gate import VersionGate
# Module-wide logger used by the request handlers below.
log = logging.getLogger("cis490.receiver")

# Uploaded episode filenames must end with this suffix; the part in
# front of it is treated as the episode id.
SUFFIX = ".tar.zst"

# Schema version reported back to clients by the ping endpoint.
SCHEMA_VERSION = 2

# Closed set of benign-profile names accepted in the X-Benign-Profile
# header. This is a copy of orchestrator.benign's BENIGN_PROFILES, kept
# here so the receiver does not have to import the orchestrator package.
# Update this copy whenever BENIGN_PROFILES gains a member.
_VALID_BENIGN_PROFILES: frozenset[str] = frozenset(
    (
        "idle",
        "web_visitor",
        "admin_session",
        "cron_burst",
        "file_browse",
        "db_query",
        "package_check",
    )
)

# Closed set of values accepted in the X-Episode-Type header.
_VALID_EPISODE_TYPES: frozenset[str] = frozenset(("control", "infected"))
def _bearer_check(request: Request, expected: str | None) -> Response | None:
    """Validate the request's ``Authorization: Bearer <token>`` header.

    Args:
        request: Incoming request; only its ``authorization`` header is read.
        expected: The configured token, or ``None`` to disable the check.

    Returns:
        ``None`` when the request may proceed (auth is disabled or the
        presented token matches). Otherwise a 401 ``JSONResponse`` that
        the calling handler should return unchanged.
    """
    if expected is None:
        return None
    auth = request.headers.get("authorization", "")
    prefix = "Bearer "
    if not auth.startswith(prefix):
        return JSONResponse({"error": "missing bearer token"}, status_code=401)
    presented = auth[len(prefix):]
    # compare_digest() only accepts ASCII-only str arguments and raises
    # TypeError otherwise. Comparing encoded bytes keeps the comparison
    # constant-time while turning a non-ASCII token into an ordinary
    # mismatch (401) instead of an unhandled exception.
    if not secrets.compare_digest(
        presented.encode("utf-8"), expected.encode("utf-8")
    ):
        return JSONResponse({"error": "bad bearer token"}, status_code=401)
    return None
def make_app(
    store: EpisodeStore,
    max_episode_bytes: int,
    bearer_token: str | None = None,
    version_gate: VersionGate | None = None,
    health_root: "Path | None" = None,
    *,
    max_health_bytes: int = 1024 * 1024,
) -> Starlette:
    """Build the receiver's Starlette application.

    Routes:
        GET  /v1/health                         liveness probe (no auth)
        POST /v1/ping                           auth smoke test
        PUT  /v1/episodes/{host_id}/{filename}  episode upload
        PUT  /v1/host-health/{host_id}          per-host doctor snapshot
        GET  /v1/host-health                    aggregate of all snapshots

    Args:
        store: Episode store that ingests uploaded episode streams.
        max_episode_bytes: Upper bound on an episode body; larger uploads
            are answered with 413.
        bearer_token: Token required on every authenticated route, or
            ``None`` to disable bearer auth.
        version_gate: Optional code-commit allow-list applied to episode
            uploads only.
        health_root: Directory holding ``<host_id>.json`` snapshots, or
            ``None`` to disable the host-health PUT endpoint.
        max_health_bytes: Upper bound on a host-health body; larger
            reports are answered with 413.

    Returns:
        The configured ``Starlette`` application.
    """
    hex_digits = frozenset("0123456789abcdef")

    async def health(request: Request) -> JSONResponse:
        """Liveness probe; always answers ``{"status": "ok"}``."""
        return JSONResponse({"status": "ok"})

    async def ping(request: Request) -> JSONResponse:
        """Smoke-test endpoint. Verifies that the auth layer and the
        WG/Caddy/receiver pipe are alive end-to-end without persisting
        anything — index.jsonl is untouched. Used by ``cis490-shipper
        --ping`` during initial bring-up of a new lab host."""
        guard = _bearer_check(request, bearer_token)
        if guard is not None:
            return guard
        return JSONResponse(
            {
                "ok": True,
                "host_id": request.headers.get("x-lab-host"),
                "t_wall_ns": time.time_ns(),
                "schema_version": SCHEMA_VERSION,
            }
        )

    async def put_episode(request: Request) -> JSONResponse:
        """Validate an episode upload's headers, then stream the body
        into the store and translate the ingest status to an HTTP code
        (201 stored, 200 already-present, 409 conflict, 400
        sha-mismatch, 413 too-large)."""
        guard = _bearer_check(request, bearer_token)
        if guard is not None:
            return guard

        host_id: str = request.path_params["host_id"]
        filename: str = request.path_params["filename"]
        if not filename.endswith(SUFFIX):
            return JSONResponse(
                {"error": f"expected {SUFFIX} suffix"}, status_code=400
            )
        episode_id = filename[: -len(SUFFIX)]
        if not is_valid_id(host_id) or not is_valid_id(episode_id):
            return JSONResponse({"error": "bad host_id or episode_id"}, status_code=400)

        expected_sha = request.headers.get("x-content-sha256")
        if not expected_sha or len(expected_sha) != 64:
            return JSONResponse(
                {"error": "X-Content-SHA256 header (64 hex chars) required"},
                status_code=400,
            )
        expected_sha = expected_sha.lower()
        # The length check alone would let 64 arbitrary characters
        # through; reject anything that is not a hex digest up front
        # instead of streaming the whole body first.
        if not set(expected_sha) <= hex_digits:
            return JSONResponse(
                {"error": "X-Content-SHA256 header (64 hex chars) required"},
                status_code=400,
            )

        try:
            schema_version = int(request.headers.get("x-schema-version", "2"))
        except ValueError:
            return JSONResponse({"error": "bad X-Schema-Version"}, status_code=400)

        # Code-version gate. Every PUT must carry the orchestrator's
        # commit hash and that hash must be in the receiver's current
        # allow-list (last N commits on the maintainer's working clone).
        # Missing → 400 (client bug); not-in-window → 412 (out-of-date
        # lab host, must pull main).
        commit = request.headers.get("x-cis490-code-commit", "").strip().lower()
        if version_gate is not None:
            ok, reason = version_gate.check(commit) if commit else (False, "missing")
            if not ok:
                head = version_gate.head()
                if reason == "missing":
                    body = {
                        "error": "missing X-Cis490-Code-Commit header",
                        "remediation": (
                            "Lab-host is shipping with no code_version stamp. "
                            "On the lab host:\n"
                            "  cd /opt/cis490 && sudo -u cis490 git pull origin main && "
                            "sudo /opt/cis490/scripts/install-lab-host.sh\n"
                            "If that errors out, read FIXYOURSELF.md at the repo "
                            "root — it's a six-branch decision tree for stuck "
                            "states the auto-update timer can't recover from."
                        ),
                        "see_also": "FIXYOURSELF.md",
                    }
                    return JSONResponse(body, status_code=400)
                if reason == "bad-format":
                    return JSONResponse(
                        {"error": "X-Cis490-Code-Commit must be 40 lowercase hex"},
                        status_code=400,
                    )
                # not-in-window: out-of-date lab host OR diverged-HEAD
                body = {
                    "error": "code commit rejected: not in receiver's allow-list",
                    "your_commit": commit,
                    "valid_window_size": version_gate.valid_count(),
                    "head_commit": head,
                    "remediation": (
                        "Your commit isn't on origin/main. Two cases:\n"
                        "\n"
                        "(1) You're just behind. Run on the lab host:\n"
                        "    cd /opt/cis490 && sudo -u cis490 git pull --ff-only "
                        "origin main && sudo /opt/cis490/scripts/install-lab-host.sh\n"
                        "\n"
                        "(2) You have a LOCAL commit that's not on origin/main "
                        "(git pull --ff-only fails). This is the diverged-HEAD "
                        "case — the auto-update timer will refuse to fix it. "
                        "Read FIXYOURSELF.md §B at the repo root: three options "
                        "(push your commit, reset --hard origin/main, or file an "
                        "issue and wait). Pick one.\n"
                        "\n"
                        "Do NOT bypass this check by faking code_version in "
                        "meta.json — the gate exists to keep buggy data out of "
                        "the training set."
                    ),
                }
                log.warning(
                    "rejected episode host=%s id=%s commit=%s reason=%s",
                    host_id, episode_id, commit[:12], reason,
                )
                return JSONResponse(body, status_code=412)

        # Optional matrix-stratification headers. Validated against the
        # closed enums so a misbehaving shipper can't write garbage into
        # the index. Unknown values are dropped (header treated as absent)
        # and logged so the operator can spot a version drift quickly.
        episode_type = (request.headers.get("x-episode-type") or "").strip().lower()
        if episode_type and episode_type not in _VALID_EPISODE_TYPES:
            log.warning("dropping unknown X-Episode-Type=%r host=%s id=%s",
                        episode_type, host_id, episode_id)
            episode_type = ""
        benign_profile = (request.headers.get("x-benign-profile") or "").strip().lower()
        if benign_profile and benign_profile not in _VALID_BENIGN_PROFILES:
            log.warning("dropping unknown X-Benign-Profile=%r host=%s id=%s",
                        benign_profile, host_id, episode_id)
            benign_profile = ""

        # Cheap early rejection when the client declares an oversized
        # body; the store is also given max_bytes for the actual stream.
        cl = request.headers.get("content-length")
        if cl is not None:
            try:
                if int(cl) > max_episode_bytes:
                    return JSONResponse(
                        {"error": "episode exceeds max size"}, status_code=413
                    )
            except ValueError:
                return JSONResponse({"error": "bad Content-Length"}, status_code=400)

        result = await store.ingest_stream(
            host_id=host_id,
            episode_id=episode_id,
            expected_sha256=expected_sha,
            schema_version=schema_version,
            commit=commit or None,
            episode_type=episode_type or None,
            benign_profile=benign_profile or None,
            body=request.stream(),
            max_bytes=max_episode_bytes,
        )
        if result.status == "stored":
            log.info(
                "stored episode host=%s id=%s sha=%s size=%d",
                host_id, episode_id, result.sha256, result.size_bytes,
            )
            return JSONResponse(
                {"status": "stored", "sha256": result.sha256, "size_bytes": result.size_bytes},
                status_code=201,
            )
        if result.status == "already-present":
            return JSONResponse(
                {"status": "already-present", "sha256": result.sha256},
                status_code=200,
            )
        if result.status == "conflict":
            log.warning(
                "conflict host=%s id=%s existing=%s",
                host_id, episode_id, result.existing_sha256,
            )
            return JSONResponse(
                {"status": "conflict", "existing_sha256": result.existing_sha256},
                status_code=409,
            )
        if result.status == "sha-mismatch":
            return JSONResponse(
                {"status": "sha-mismatch", "actual_sha256": result.sha256},
                status_code=400,
            )
        if result.status == "too-large":
            return JSONResponse({"error": "episode exceeds max size"}, status_code=413)
        return JSONResponse({"error": "unknown ingest result"}, status_code=500)

    async def put_host_health(request: Request) -> JSONResponse:
        """Lab hosts PUT their cis490-doctor.py JSON output here once
        a day. We persist the latest snapshot per host at
        <health_root>/<host_id>.json. Not gated by version_gate —
        this is metadata about the host itself, not training data,
        and we want sick hosts to be ABLE to report sickness even if
        their code is out-of-date.

        The body is read incrementally and capped at max_health_bytes
        (413 beyond that) so an oversized request cannot be buffered
        into memory; doctor JSON is small (<10 KiB)."""
        guard = _bearer_check(request, bearer_token)
        if guard is not None:
            return guard
        if health_root is None:
            return JSONResponse(
                {"error": "health endpoint disabled (no health_root configured)"},
                status_code=404,
            )
        host_id = request.path_params["host_id"]
        if not is_valid_id(host_id):
            return JSONResponse({"error": "bad host_id"}, status_code=400)

        # Accumulate the stream ourselves instead of request.body() so
        # we can stop as soon as the cap is exceeded.
        chunks: list[bytes] = []
        received = 0
        async for chunk in request.stream():
            received += len(chunk)
            if received > max_health_bytes:
                return JSONResponse(
                    {"error": "health report exceeds max size"}, status_code=413
                )
            chunks.append(chunk)
        body_bytes = b"".join(chunks)

        try:
            body = json.loads(body_bytes)
        except ValueError:
            # Covers json.JSONDecodeError and the UnicodeDecodeError
            # raised for bytes that are not valid UTF-8/16/32.
            return JSONResponse({"error": "body must be JSON"}, status_code=400)
        if not isinstance(body, dict):
            return JSONResponse({"error": "body must be a JSON object"},
                                status_code=400)

        # Stamp the receiver's view of when this arrived. Lab-host
        # clocks are coarse so we don't trust them for "freshness."
        record = {
            "host_id": host_id,
            "received_at_wall": time.strftime("%Y-%m-%dT%H:%M:%SZ",
                                              time.gmtime()),
            "doctor": body,
        }
        health_root.mkdir(parents=True, exist_ok=True)
        target = health_root / f"{host_id}.json"
        # Write to a sibling tmp + atomic rename so a concurrent
        # reader never sees a half-written file.
        tmp = health_root / f".{host_id}.json.tmp"
        tmp.write_text(json.dumps(record, indent=2), encoding="utf-8")
        tmp.replace(target)
        return JSONResponse({"status": "stored", "host_id": host_id},
                            status_code=200)

    async def get_fleet_health(request: Request) -> JSONResponse:
        """Aggregate view across all hosts that have reported. Used by
        the maintainer / fleet-health monitor to spot sick hosts
        without grepping per-host files. Snapshot files that cannot be
        read or parsed are logged and left out of the result."""
        guard = _bearer_check(request, bearer_token)
        if guard is not None:
            return guard
        if health_root is None or not health_root.exists():
            return JSONResponse({"hosts": []})
        out: list[dict] = []
        for f in sorted(health_root.glob("*.json")):
            if f.name.startswith("."):  # tmp files
                continue
            try:
                out.append(json.loads(f.read_text(encoding="utf-8")))
            except (OSError, ValueError) as exc:
                # ValueError covers both malformed JSON and undecodable
                # bytes; one bad file must not fail the whole aggregate.
                log.warning("skipping unreadable host-health file %s: %s",
                            f.name, exc)
                continue
        return JSONResponse({"hosts": out})

    routes = [
        Route("/v1/health", health, methods=["GET"]),
        Route("/v1/ping", ping, methods=["POST"]),
        Route(
            "/v1/episodes/{host_id}/{filename}",
            put_episode,
            methods=["PUT"],
        ),
        Route(
            "/v1/host-health/{host_id}",
            put_host_health,
            methods=["PUT"],
        ),
        Route(
            "/v1/host-health",
            get_fleet_health,
            methods=["GET"],
        ),
    ]
    return Starlette(routes=routes)