Two pieces of self-monitoring so the maintainer isn't the alarm:
(2) Receiver-side fleet health monitor
cis490-fleet-health.timer runs check_fleet_health.py every 5 min.
Detects three symptoms and writes them to
/var/lib/cis490/alerts.jsonl + a syslog WARNING (greppable / easy
to forward to a notifier):
silent — host shipped in last 24h but has been quiet >30 min
fatal-only — actively shipping, but every PUT returns a 4xx
unstamped — shipping without X-Cis490-Code-Commit header
Dedup is keyed on (host, symptom, hour-bucket) so a sustained fault
fires once per hour, not every 5 min. 15 unit tests cover the index
parser, three detectors, and dedup.
(3) Per-host doctor snapshots
Lab hosts run cis490-doctor-check.timer once a day (10 min after
boot, then daily with 30-min jitter). The timer runs
cis490_doctor.py --json and PUTs the result to a new endpoint:
PUT /v1/host-health/<host> → /var/lib/cis490/host-health/<host>.json
GET /v1/host-health → aggregate across all hosts
Endpoint is NOT gated by version_gate — sick hosts running stale
code MUST still be able to report sickness. 11 unit tests cover
PUT/GET, atomic-write semantics, bearer auth, and the
not-gated-by-version-gate property.
ship_health_check.py reuses the existing shipper transport (mTLS +
bearer + receiver URL from lab-host.toml) so we don't reimplement
auth.
Both timers wired into install-lab-host.sh — the loop also enables
the previously-added autoupdate + cert-fetch timers, so a single
install run gives a host all four self-healing mechanisms.
Tests: 293 pass (26 new — 15 fleet-health, 11 host-health). 2
pre-existing test_fleet.py failures from the elliott-ThinkPad
merge (667f042) are unrelated to this change.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
198 lines
6.8 KiB
Python
198 lines
6.8 KiB
Python
"""Tests for the receiver's /v1/host-health endpoints.

  PUT /v1/host-health/<host>   → store the lab host's daily doctor JSON
  GET /v1/host-health          → aggregate snapshot across all hosts

The endpoint is deliberately NOT gated by version_gate — sick hosts
running stale code still need to be able to report sickness.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from starlette.testclient import TestClient
|
|
|
|
from receiver.app import make_app
|
|
from receiver.store import EpisodeStore
|
|
|
|
|
|
@pytest.fixture
def app(tmp_path: Path):
    """Build a receiver app with host-health enabled and no bearer token.

    Every on-disk location (episode store, incoming area, index file and
    the host-health directory) is placed under pytest's per-test
    ``tmp_path``.
    """
    health_dir = tmp_path / "host-health"
    episode_store = EpisodeStore(
        store_root=tmp_path / "store",
        incoming_root=tmp_path / "incoming",
        index_path=tmp_path / "index.jsonl",
    )
    return make_app(
        store=episode_store,
        max_episode_bytes=10_000_000,
        bearer_token=None,
        health_root=health_dir,
    )
|
|
|
|
|
|
@pytest.fixture
def app_with_bearer(tmp_path: Path):
    """Build a receiver app with host-health enabled and bearer token ``s3cret``.

    Identical to the unauthenticated setup except that ``bearer_token`` is
    set, so tests can exercise the Authorization header check.
    """
    health_dir = tmp_path / "host-health"
    episode_store = EpisodeStore(
        store_root=tmp_path / "store",
        incoming_root=tmp_path / "incoming",
        index_path=tmp_path / "index.jsonl",
    )
    return make_app(
        store=episode_store,
        max_episode_bytes=10_000_000,
        bearer_token="s3cret",
        health_root=health_dir,
    )
|
|
|
|
|
|
@pytest.fixture
def app_no_health(tmp_path: Path):
    """Build a receiver app with ``health_root=None``.

    Used to check that the host-health endpoint answers 404 when the
    feature is not configured.
    """
    episode_store = EpisodeStore(
        store_root=tmp_path / "store",
        incoming_root=tmp_path / "incoming",
        index_path=tmp_path / "index.jsonl",
    )
    return make_app(
        store=episode_store,
        max_episode_bytes=10_000_000,
        bearer_token=None,
        health_root=None,
    )
|
|
|
|
|
|
def test_put_health_stores_doctor_json(app, tmp_path: Path) -> None:
    """PUT acknowledges the snapshot and writes it to ``<health_root>/lab1.json``.

    The stored document must carry the host id, the doctor payload
    unchanged under ``doctor``, and a ``received_at_wall`` field.
    """
    version_check = {
        "name": "install: VERSION stamp",
        "status": "ok",
        "detail": "main@abc123",
    }
    shipper_check = {
        "name": "shipper: recent ship results",
        "status": "fail",
        "detail": "12 412s in last 10 min",
    }
    doctor_report = {
        "role": "lab-host",
        "checks": [version_check, shipper_check],
    }

    with TestClient(app) as client:
        response = client.put("/v1/host-health/lab1", json=doctor_report)
        # Check the status before decoding so a non-JSON error body fails
        # on the status assertion rather than inside ``.json()``.
        assert response.status_code == 200
        ack = response.json()
        assert ack["status"] == "stored"
        assert ack["host_id"] == "lab1"

        # The snapshot must also have landed on disk.
        stored_path = tmp_path / "host-health" / "lab1.json"
        assert stored_path.exists()
        stored = json.loads(stored_path.read_text())
        assert stored["host_id"] == "lab1"
        assert stored["doctor"] == doctor_report
        assert "received_at_wall" in stored
|
|
|
|
|
|
def test_put_health_overwrites_previous_snapshot(app, tmp_path: Path) -> None:
    """A second PUT for the same host replaces the stored snapshot.

    Only the latest report is kept per host; the file must reflect the
    most recent payload.
    """
    url = "/v1/host-health/lab1"
    with TestClient(app) as client:
        first = client.put(url, json={"checks": [{"v": 1}]})
        second = client.put(url, json={"checks": [{"v": 2}]})
        assert first.status_code == 200
        assert second.status_code == 200

        stored_path = tmp_path / "host-health" / "lab1.json"
        stored = json.loads(stored_path.read_text())
        assert stored["doctor"] == {"checks": [{"v": 2}]}
|
|
|
|
|
|
def test_put_health_rejects_invalid_host_id(app) -> None:
    """A host id that fails ``is_valid_id`` is refused with 400.

    ``is_valid_id`` rejects path-traversal-like strings and extreme
    lengths. A literal ``../`` in the URL is normalised by the test
    client before it reaches the handler, so this probes with
    percent-encoded spaces, which survive routing.
    """
    bad_host_url = "/v1/host-health/lab%20with%20space"
    with TestClient(app) as client:
        response = client.put(bad_host_url, json={"checks": []})
        assert response.status_code == 400
|
|
|
|
|
|
def test_put_health_rejects_non_json_body(app) -> None:
    """A body labelled ``application/json`` that does not parse yields 400."""
    json_headers = {"Content-Type": "application/json"}
    with TestClient(app) as client:
        response = client.put(
            "/v1/host-health/lab1",
            content=b"not json",
            headers=json_headers,
        )
        assert response.status_code == 400
|
|
|
|
|
|
def test_put_health_rejects_array_body(app) -> None:
    """A top-level JSON array is refused with 400.

    The doctor's ``--json`` output is always an object of the form
    ``{role, checks: [...]}``, never a bare list.
    """
    bare_list = ["x", "y"]
    with TestClient(app) as client:
        response = client.put("/v1/host-health/lab1", json=bare_list)
        assert response.status_code == 400
|
|
|
|
|
|
def test_put_health_404_when_disabled(app_no_health) -> None:
    """PUT answers 404 when the receiver has no ``health_root`` configured.

    This lets a deployment opt out of host-health without removing the
    routes.
    """
    empty_report = {"checks": []}
    with TestClient(app_no_health) as client:
        response = client.put("/v1/host-health/lab1", json=empty_report)
        assert response.status_code == 404
|
|
|
|
|
|
def test_put_health_not_gated_by_version_gate(tmp_path: Path) -> None:
    """The health PUT succeeds even when the version gate rejects everything.

    A sick host running stale code MUST still be able to report its
    sickness, so the health endpoint must not check
    X-Cis490-Code-Commit.
    """

    class _RejectAll:
        """Stand-in gate whose ``check`` refuses every commit.

        If the endpoint consulted the gate, the PUT below could not
        return 200.
        """

        def check(self, commit):
            return False, "not-in-window"

        def head(self):
            return None

        def valid_count(self):
            return 0

    episode_store = EpisodeStore(
        store_root=tmp_path / "store",
        incoming_root=tmp_path / "incoming",
        index_path=tmp_path / "index.jsonl",
    )
    gated_app = make_app(
        store=episode_store,
        max_episode_bytes=10_000_000,
        bearer_token=None,
        version_gate=_RejectAll(),
        health_root=tmp_path / "host-health",
    )
    with TestClient(gated_app) as client:
        response = client.put("/v1/host-health/lab1", json={"checks": []})
        assert response.status_code == 200
|
|
|
|
|
|
def test_put_health_requires_bearer_when_configured(app_with_bearer) -> None:
    """With a bearer token configured, PUT is 401 without it and 200 with it."""
    url = "/v1/host-health/lab1"
    empty_report = {"checks": []}
    with TestClient(app_with_bearer) as client:
        anonymous = client.put(url, json=empty_report)
        assert anonymous.status_code == 401

        authorised = client.put(
            url,
            json=empty_report,
            headers={"Authorization": "Bearer s3cret"},
        )
        assert authorised.status_code == 200
|
|
|
|
|
|
def test_get_fleet_health_returns_all_snapshots(app, tmp_path: Path) -> None:
    """GET /v1/host-health returns the stored snapshot of every reporting host.

    Two hosts are seeded via PUT, then the aggregate listing is fetched
    and each host's ``doctor`` payload is compared with what was sent.
    The seeding PUTs are asserted as well, so a broken PUT fails here with
    a clear status mismatch instead of a ``KeyError`` on the host lookup.
    """
    seeded = {
        "lab1": {"checks": [{"v": 1}]},
        "lab2": {"checks": [{"v": 2}]},
    }
    with TestClient(app) as c:
        for host_id, doctor in seeded.items():
            put = c.put(f"/v1/host-health/{host_id}", json=doctor)
            assert put.status_code == 200, f"seeding {host_id} failed"

        r = c.get("/v1/host-health")
        assert r.status_code == 200
        body = r.json()
        # Index by host id so the check does not depend on listing order.
        hosts = {h["host_id"]: h for h in body["hosts"]}
        for host_id, doctor in seeded.items():
            assert hosts[host_id]["doctor"] == doctor
|
|
|
|
|
|
def test_get_fleet_health_empty_when_no_reports(app) -> None:
    """With no host having reported, GET returns 200 and an empty host list."""
    expected_body = {"hosts": []}
    with TestClient(app) as client:
        response = client.get("/v1/host-health")
        assert response.status_code == 200
        assert response.json() == expected_body
|
|
|
|
|
|
def test_get_fleet_health_ignores_temp_files(app, tmp_path: Path) -> None:
    """Atomic-write temp files must not appear in the aggregate listing.

    The health directory is pre-populated with a leftover
    ``.lab1.json.tmp`` (as if a write were in flight) and a completed
    ``lab2.json``; only ``lab2`` may be reported.
    """
    health_dir = tmp_path / "host-health"
    # The ``app`` fixture is built before this body runs; tolerate a
    # make_app that already created the directory instead of raising
    # FileExistsError.
    health_dir.mkdir(parents=True, exist_ok=True)
    (health_dir / ".lab1.json.tmp").write_text('{"host_id": "lab1"}')
    (health_dir / "lab2.json").write_text(
        '{"host_id": "lab2", "doctor": {"checks": []}}'
    )
    with TestClient(app) as c:
        r = c.get("/v1/host-health")
        # Assert the status first so an error response fails here rather
        # than as a KeyError or decode error while reading the body.
        assert r.status_code == 200
        body = r.json()
        hosts = [h.get("host_id") for h in body["hosts"]]
        assert hosts == ["lab2"]
|