Two pieces of self-monitoring so the maintainer isn't the alarm:
(2) Receiver-side fleet health monitor
cis490-fleet-health.timer runs check_fleet_health.py every 5 min.
Detects three symptoms and writes them to
/var/lib/cis490/alerts.jsonl + a syslog WARNING (greppable / easy
to forward to a notifier):
silent — host shipped in last 24h but has been quiet >30 min
fatal-only — actively shipping but every PUT returns 4xx
unstamped — shipping without X-Cis490-Code-Commit header
Dedup is keyed on (host, symptom, hour-bucket) so a sustained fault
fires once per hour, not every 5 min. 15 unit tests cover the index
parser, three detectors, and dedup.
(3) Per-host doctor snapshots
Lab hosts run cis490-doctor-check.timer once a day (10 min after
boot, then daily with 30-min jitter). The timer runs
cis490_doctor.py --json and PUTs the result to a new endpoint:
PUT /v1/host-health/<host> → /var/lib/cis490/host-health/<host>.json
GET /v1/host-health → aggregate across all hosts
Endpoint is NOT gated by version_gate — sick hosts running stale
code MUST still be able to report sickness. 11 unit tests cover
PUT/GET, atomic-write semantics, bearer auth, and the
not-gated-by-version-gate property.
ship_health_check.py reuses the existing shipper transport (mTLS +
bearer + receiver URL from lab-host.toml) so we don't reimplement
auth.
Both timers wired into install-lab-host.sh — the loop also enables
the previously-added autoupdate + cert-fetch timers, so a single
install run gives a host all four self-healing mechanisms.
Tests: 293 pass (26 new — 15 fleet-health, 11 host-health). 2
pre-existing test_fleet.py failures from the elliott-ThinkPad
merge (667f042) are unrelated to this change.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
69 lines
2.7 KiB
Python
69 lines
2.7 KiB
Python
from __future__ import annotations
|
|
|
|
import tomllib
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
|
|
DEFAULT_MAX_EPISODE_BYTES = 256 * 1024 * 1024
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class ReceiverConfig:
|
|
listen_host: str
|
|
listen_port: int
|
|
store_root: Path
|
|
incoming_root: Path
|
|
index_path: Path
|
|
max_episode_bytes: int
|
|
bearer_token: str | None
|
|
# Code-version gate. Production source is the local Forgejo
|
|
# (canonical repo both lab hosts and the receiver pull from);
|
|
# local-git path is a dev-only fallback.
|
|
version_gate_enabled: bool
|
|
version_gate_window: int
|
|
version_gate_forgejo_url: str | None
|
|
version_gate_repo_owner: str | None
|
|
version_gate_repo_name: str | None
|
|
version_gate_branch: str
|
|
version_gate_auth_token: str | None
|
|
version_gate_local_repo: Path | None
|
|
# Per-host doctor snapshots (PUT /v1/host-health/<host>). Disabled
|
|
# if unset — the receiver returns 404. Default points alongside
|
|
# the index in /var/lib/cis490 since that's the receiver's only
|
|
# persistent writable area under hardening.
|
|
health_root: Path | None
|
|
|
|
@classmethod
|
|
def load(cls, path: str | Path) -> "ReceiverConfig":
|
|
with open(path, "rb") as f:
|
|
data = tomllib.load(f)
|
|
|
|
listen_addr = data.get("listen_addr", "127.0.0.1:8443")
|
|
host, _, port = listen_addr.rpartition(":")
|
|
version_gate = data.get("version_gate", {})
|
|
local_repo = version_gate.get("local_repo_path")
|
|
return cls(
|
|
listen_host=host or "127.0.0.1",
|
|
listen_port=int(port),
|
|
store_root=Path(data["store_root"]).resolve(),
|
|
incoming_root=Path(data["incoming_root"]).resolve(),
|
|
index_path=Path(data["index_path"]).resolve(),
|
|
max_episode_bytes=int(
|
|
data.get("limits", {}).get("max_episode_bytes", DEFAULT_MAX_EPISODE_BYTES)
|
|
),
|
|
bearer_token=data.get("auth", {}).get("bearer_token"),
|
|
version_gate_enabled=bool(version_gate.get("enabled", True)),
|
|
version_gate_window=int(version_gate.get("window", 100)),
|
|
version_gate_forgejo_url=version_gate.get("forgejo_url"),
|
|
version_gate_repo_owner=version_gate.get("repo_owner"),
|
|
version_gate_repo_name=version_gate.get("repo_name"),
|
|
version_gate_branch=version_gate.get("branch", "main"),
|
|
version_gate_auth_token=version_gate.get("auth_token"),
|
|
version_gate_local_repo=Path(local_repo).resolve() if local_repo else None,
|
|
health_root=(
|
|
Path(data["health_root"]).resolve()
|
|
if "health_root" in data
|
|
else Path("/var/lib/cis490/host-health").resolve()
|
|
),
|
|
)
|