Two pieces of self-monitoring so the maintainer isn't the alarm:
(2) Receiver-side fleet health monitor
cis490-fleet-health.timer runs check_fleet_health.py every 5 min.
Detects three symptoms and writes them to
/var/lib/cis490/alerts.jsonl + a syslog WARNING (greppable / easy
to forward to a notifier):
silent — host shipped in last 24h but has been quiet >30 min
fatal-only — actively shipping but every PUT returns 4xx
unstamped — shipping without X-Cis490-Code-Commit header
Dedup is keyed on (host, symptom, hour-bucket) so a sustained fault
fires once per hour, not every 5 min. 15 unit tests cover the index
parser, three detectors, and dedup.
(3) Per-host doctor snapshots
Lab hosts run cis490-doctor-check.timer once a day (10 min after
boot, then daily with 30-min jitter). The timer runs
cis490_doctor.py --json and PUTs the result to a new endpoint:
PUT /v1/host-health/<host> → /var/lib/cis490/host-health/<host>.json
GET /v1/host-health → aggregate across all hosts
Endpoint is NOT gated by version_gate — sick hosts running stale
code MUST still be able to report sickness. 11 unit tests cover
PUT/GET, atomic-write semantics, bearer auth, and the
not-gated-by-version-gate property.
ship_health_check.py reuses the existing shipper transport (mTLS +
bearer + receiver URL from lab-host.toml) so we don't reimplement
auth.
Both timers wired into install-lab-host.sh — the loop also enables
the previously-added autoupdate + cert-fetch timers, so a single
install run gives a host all four self-healing mechanisms.
Tests: 293 pass (26 new — 15 fleet-health, 11 host-health). 2
pre-existing test_fleet.py failures from the elliott-ThinkPad
merge (667f042) are unrelated to this change.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
324 lines
13 KiB
Python
324 lines
13 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import secrets
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Awaitable, Callable
|
|
|
|
from starlette.applications import Starlette
|
|
from starlette.requests import Request
|
|
from starlette.responses import JSONResponse, Response
|
|
from starlette.routing import Route
|
|
|
|
from .store import EpisodeStore, is_valid_id
|
|
from .version_gate import VersionGate
|
|
|
|
|
|
# Logger shared by every handler in this module.
log = logging.getLogger("cis490.receiver")

# File-name suffix an uploaded episode archive must carry.
SUFFIX = ".tar.zst"
# Schema version the receiver reports from /v1/ping.
SCHEMA_VERSION = 2

# Closed set of benign-profile names accepted in the X-Benign-Profile
# header. This is a copy of orchestrator.benign's BENIGN_PROFILES, kept
# here so the receiver does not have to import the orchestrator package;
# update both places together when a profile is added.
_VALID_BENIGN_PROFILES: frozenset[str] = frozenset(
    (
        "idle",
        "web_visitor",
        "admin_session",
        "cron_burst",
        "file_browse",
        "db_query",
        "package_check",
    )
)
# Closed set of values accepted in the X-Episode-Type header.
_VALID_EPISODE_TYPES: frozenset[str] = frozenset(("control", "infected"))
|
|
|
|
|
|
def _bearer_check(request: Request, expected: str | None) -> Response | None:
|
|
if expected is None:
|
|
return None
|
|
auth = request.headers.get("authorization", "")
|
|
prefix = "Bearer "
|
|
if not auth.startswith(prefix):
|
|
return JSONResponse({"error": "missing bearer token"}, status_code=401)
|
|
presented = auth[len(prefix):]
|
|
if not secrets.compare_digest(presented, expected):
|
|
return JSONResponse({"error": "bad bearer token"}, status_code=401)
|
|
return None
|
|
|
|
|
|
def make_app(
    store: EpisodeStore,
    max_episode_bytes: int,
    bearer_token: str | None = None,
    version_gate: VersionGate | None = None,
    health_root: "Path | None" = None,
) -> Starlette:
    """Build the receiver's Starlette application.

    Args:
        store: Episode store that uploaded archives are streamed into.
        max_episode_bytes: Upper bound on an episode upload; larger uploads
            are answered with 413.
        bearer_token: Token every endpoint except ``/v1/health`` requires.
            ``None`` disables bearer auth.
        version_gate: Optional commit allow-list applied to episode uploads
            only. ``None`` disables the gate.
        health_root: Directory holding one ``<host_id>.json`` doctor
            snapshot per host. ``None`` disables the host-health PUT
            endpoint (404) and makes the aggregate GET return no hosts.

    Returns:
        A Starlette app exposing ``/v1/health``, ``/v1/ping``,
        ``/v1/episodes/{host_id}/{filename}`` and ``/v1/host-health``.
    """
    hex_digits = frozenset("0123456789abcdef")
    # Doctor snapshots are a few KiB; anything near this cap is not one.
    # Neither Starlette nor uvicorn limits request bodies by default, so
    # the handler enforces the bound itself.
    max_health_bytes = 1024 * 1024

    async def health(request: Request) -> JSONResponse:
        """Unauthenticated liveness probe."""
        return JSONResponse({"status": "ok"})

    async def ping(request: Request) -> JSONResponse:
        """Smoke-test endpoint. Verifies that the auth layer and the
        WG/Caddy/receiver pipe are alive end-to-end without persisting
        anything — index.jsonl is untouched. Used by ``cis490-shipper
        --ping`` during initial bring-up of a new lab host."""
        guard = _bearer_check(request, bearer_token)
        if guard is not None:
            return guard
        return JSONResponse(
            {
                "ok": True,
                "host_id": request.headers.get("x-lab-host"),
                "t_wall_ns": time.time_ns(),
                "schema_version": SCHEMA_VERSION,
            }
        )

    async def put_episode(request: Request) -> JSONResponse:
        """Validate an episode upload's path and headers, then stream the
        body into the store and translate the ingest result into an HTTP
        status (201 stored, 200 already-present, 409 conflict, 400
        sha-mismatch, 413 too-large)."""
        guard = _bearer_check(request, bearer_token)
        if guard is not None:
            return guard

        host_id: str = request.path_params["host_id"]
        filename: str = request.path_params["filename"]

        if not filename.endswith(SUFFIX):
            return JSONResponse(
                {"error": f"expected {SUFFIX} suffix"}, status_code=400
            )
        episode_id = filename[: -len(SUFFIX)]

        if not is_valid_id(host_id) or not is_valid_id(episode_id):
            return JSONResponse({"error": "bad host_id or episode_id"}, status_code=400)

        # Reject a malformed digest before reading any of the body: a
        # 64-character value that is not hex can never match, so there is
        # no point streaming the upload just to report a mismatch.
        raw_sha = request.headers.get("x-content-sha256") or ""
        expected_sha = raw_sha.lower()
        if len(raw_sha) != 64 or not hex_digits.issuperset(expected_sha):
            return JSONResponse(
                {"error": "X-Content-SHA256 header (64 hex chars) required"},
                status_code=400,
            )

        try:
            schema_version = int(
                request.headers.get("x-schema-version", str(SCHEMA_VERSION))
            )
        except ValueError:
            return JSONResponse({"error": "bad X-Schema-Version"}, status_code=400)

        # Code-version gate. Every PUT must carry the orchestrator's
        # commit hash and that hash must be in the receiver's current
        # allow-list (last N commits on the maintainer's working clone).
        # Missing → 400 (client bug); not-in-window → 412 (out-of-date
        # lab host, must pull main).
        commit = request.headers.get("x-cis490-code-commit", "").strip().lower()
        if version_gate is not None:
            ok, reason = version_gate.check(commit) if commit else (False, "missing")
            if not ok:
                if reason == "missing":
                    body = {
                        "error": "missing X-Cis490-Code-Commit header",
                        "remediation": (
                            "Lab-host is shipping with no code_version stamp. "
                            "On the lab host:\n"
                            " cd /opt/cis490 && sudo -u cis490 git pull origin main && "
                            "sudo /opt/cis490/scripts/install-lab-host.sh\n"
                            "If that errors out, read FIXYOURSELF.md at the repo "
                            "root — it's a six-branch decision tree for stuck "
                            "states the auto-update timer can't recover from."
                        ),
                        "see_also": "FIXYOURSELF.md",
                    }
                    return JSONResponse(body, status_code=400)
                if reason == "bad-format":
                    return JSONResponse(
                        {"error": "X-Cis490-Code-Commit must be 40 lowercase hex"},
                        status_code=400,
                    )
                # not-in-window: out-of-date lab host OR diverged-HEAD.
                # The head commit is only reported on this path, so it is
                # only looked up here.
                head = version_gate.head()
                body = {
                    "error": "code commit rejected: not in receiver's allow-list",
                    "your_commit": commit,
                    "valid_window_size": version_gate.valid_count(),
                    "head_commit": head,
                    "remediation": (
                        "Your commit isn't on origin/main. Two cases:\n"
                        "\n"
                        "(1) You're just behind. Run on the lab host:\n"
                        " cd /opt/cis490 && sudo -u cis490 git pull --ff-only "
                        "origin main && sudo /opt/cis490/scripts/install-lab-host.sh\n"
                        "\n"
                        "(2) You have a LOCAL commit that's not on origin/main "
                        "(git pull --ff-only fails). This is the diverged-HEAD "
                        "case — the auto-update timer will refuse to fix it. "
                        "Read FIXYOURSELF.md §B at the repo root: three options "
                        "(push your commit, reset --hard origin/main, or file an "
                        "issue and wait). Pick one.\n"
                        "\n"
                        "Do NOT bypass this check by faking code_version in "
                        "meta.json — the gate exists to keep buggy data out of "
                        "the training set."
                    ),
                }
                log.warning(
                    "rejected episode host=%s id=%s commit=%s reason=%s",
                    host_id, episode_id, commit[:12], reason,
                )
                return JSONResponse(body, status_code=412)

        # Optional matrix-stratification headers. Validated against the
        # closed enums so a misbehaving shipper can't write garbage into
        # the index. Unknown values are dropped (header treated as absent)
        # and logged so the operator can spot a version drift quickly.
        episode_type = (request.headers.get("x-episode-type") or "").strip().lower()
        if episode_type and episode_type not in _VALID_EPISODE_TYPES:
            log.warning("dropping unknown X-Episode-Type=%r host=%s id=%s",
                        episode_type, host_id, episode_id)
            episode_type = ""
        benign_profile = (request.headers.get("x-benign-profile") or "").strip().lower()
        if benign_profile and benign_profile not in _VALID_BENIGN_PROFILES:
            log.warning("dropping unknown X-Benign-Profile=%r host=%s id=%s",
                        benign_profile, host_id, episode_id)
            benign_profile = ""

        # Cheap early rejection when the client declares its size; the
        # store enforces max_bytes again while streaming.
        cl = request.headers.get("content-length")
        if cl is not None:
            try:
                if int(cl) > max_episode_bytes:
                    return JSONResponse(
                        {"error": "episode exceeds max size"}, status_code=413
                    )
            except ValueError:
                return JSONResponse({"error": "bad Content-Length"}, status_code=400)

        result = await store.ingest_stream(
            host_id=host_id,
            episode_id=episode_id,
            expected_sha256=expected_sha,
            schema_version=schema_version,
            commit=commit or None,
            episode_type=episode_type or None,
            benign_profile=benign_profile or None,
            body=request.stream(),
            max_bytes=max_episode_bytes,
        )

        if result.status == "stored":
            log.info(
                "stored episode host=%s id=%s sha=%s size=%d",
                host_id, episode_id, result.sha256, result.size_bytes,
            )
            return JSONResponse(
                {"status": "stored", "sha256": result.sha256, "size_bytes": result.size_bytes},
                status_code=201,
            )
        if result.status == "already-present":
            return JSONResponse(
                {"status": "already-present", "sha256": result.sha256},
                status_code=200,
            )
        if result.status == "conflict":
            log.warning(
                "conflict host=%s id=%s existing=%s",
                host_id, episode_id, result.existing_sha256,
            )
            return JSONResponse(
                {"status": "conflict", "existing_sha256": result.existing_sha256},
                status_code=409,
            )
        if result.status == "sha-mismatch":
            return JSONResponse(
                {"status": "sha-mismatch", "actual_sha256": result.sha256},
                status_code=400,
            )
        if result.status == "too-large":
            return JSONResponse({"error": "episode exceeds max size"}, status_code=413)

        return JSONResponse({"error": "unknown ingest result"}, status_code=500)

    async def put_host_health(request: Request) -> JSONResponse:
        """Lab hosts PUT their cis490-doctor.py JSON output here once
        a day. We persist the latest snapshot per host at
        <health_root>/<host_id>.json. Not gated by version_gate —
        this is metadata about the host itself, not training data,
        and we want sick hosts to be ABLE to report sickness even if
        their code is out-of-date.

        The body is capped at 1 MiB (413 beyond that); doctor JSON is
        small (<10 KiB), so the cap only guards against a runaway or
        hostile client filling the receiver's memory."""
        guard = _bearer_check(request, bearer_token)
        if guard is not None:
            return guard
        if health_root is None:
            return JSONResponse(
                {"error": "health endpoint disabled (no health_root configured)"},
                status_code=404,
            )
        host_id = request.path_params["host_id"]
        if not is_valid_id(host_id):
            return JSONResponse({"error": "bad host_id"}, status_code=400)

        too_large = JSONResponse(
            {"error": "host-health body exceeds max size"}, status_code=413
        )
        # Early exit when the client is honest about its size.
        declared = request.headers.get("content-length")
        if declared is not None and declared.isdigit() and int(declared) > max_health_bytes:
            return too_large
        # Read incrementally so an undeclared (chunked) oversize body is
        # abandoned as soon as it crosses the cap instead of being buffered.
        chunks: list[bytes] = []
        received = 0
        async for chunk in request.stream():
            received += len(chunk)
            if received > max_health_bytes:
                return too_large
            chunks.append(chunk)
        body_bytes = b"".join(chunks)

        try:
            body = json.loads(body_bytes)
        except (ValueError, RecursionError):
            # ValueError covers JSONDecodeError and undecodable bytes;
            # RecursionError covers absurdly deep nesting.
            return JSONResponse({"error": "body must be JSON"}, status_code=400)
        if not isinstance(body, dict):
            return JSONResponse({"error": "body must be a JSON object"},
                                status_code=400)

        # Stamp the receiver's view of when this arrived. Lab-host
        # clocks are coarse so we don't trust them for "freshness."
        record = {
            "host_id": host_id,
            "received_at_wall": time.strftime("%Y-%m-%dT%H:%M:%SZ",
                                              time.gmtime()),
            "doctor": body,
        }
        health_root.mkdir(parents=True, exist_ok=True)
        target = health_root / f"{host_id}.json"
        # Write to a sibling tmp + atomic rename so a concurrent
        # reader never sees a half-written file.
        tmp = health_root / f".{host_id}.json.tmp"
        tmp.write_text(json.dumps(record, indent=2), encoding="utf-8")
        tmp.replace(target)

        return JSONResponse({"status": "stored", "host_id": host_id},
                            status_code=200)

    async def get_fleet_health(request: Request) -> JSONResponse:
        """Aggregate view across all hosts that have reported. Used by
        the maintainer / fleet-health monitor to spot sick hosts
        without grepping per-host files.

        Snapshot files that cannot be read or parsed are skipped (and
        logged) so one damaged file never hides the rest of the fleet."""
        guard = _bearer_check(request, bearer_token)
        if guard is not None:
            return guard
        if health_root is None or not health_root.exists():
            return JSONResponse({"hosts": []})
        out: list[dict] = []
        for f in sorted(health_root.glob("*.json")):
            if f.name.startswith("."):  # tmp files
                continue
            try:
                out.append(json.loads(f.read_text(encoding="utf-8")))
            except (OSError, ValueError, RecursionError) as exc:
                # ValueError covers both JSONDecodeError and
                # UnicodeDecodeError (a file with invalid bytes).
                log.warning("skipping unreadable host-health file %s: %s",
                            f.name, exc)
                continue
        return JSONResponse({"hosts": out})

    routes = [
        Route("/v1/health", health, methods=["GET"]),
        Route("/v1/ping", ping, methods=["POST"]),
        Route(
            "/v1/episodes/{host_id}/{filename}",
            put_episode,
            methods=["PUT"],
        ),
        Route(
            "/v1/host-health/{host_id}",
            put_host_health,
            methods=["PUT"],
        ),
        Route(
            "/v1/host-health",
            get_fleet_health,
            methods=["GET"],
        ),
    ]
    return Starlette(routes=routes)
|