Stops out-of-date lab hosts from polluting the dataset with episodes
generated by buggy code. The valid-commits set mirrors the maintainer's
working clone on the Pi automatically — when the maintainer pulls or
pushes a new commit, the receiver picks it up within the 5-second
cache TTL with no service restart.
Receiver changes:
- receiver/version_gate.py (new): VersionGate(repo_path, window).
Each check() consults a frozenset of the last `window` commit
hashes from `git -C <repo> log --format=%H -n <window>`, refreshed
every 5s under a lock. Resilient to transient git failure (keeps
prior cache so a flaky `git` doesn't lock out every shipper).
- receiver/app.py: PUT extracts X-Cis490-Code-Commit; gate.check()
before ingest. Rejects with:
400 + remediation if header missing or malformed
412 + remediation + your_commit + head_commit if not in window
Remediation block is verbatim copy-pasteable into the lab-host
shell:
cd /opt/cis490 && sudo -u cis490 git pull origin main
sudo /opt/cis490/scripts/install-lab-host.sh
sudo systemctl restart cis490-orchestrator
- receiver/store.py: ingest_stream takes commit kwarg, stamps it on
the index.jsonl row (new optional field). Backfilled rows from
index_backfill.py also pull commit out of meta.json.
- receiver/config.py + etc/receiver.toml.example: new [version_gate]
section. enabled=true, repo_path=/home/max/cis490, window=100 by
default. Enabled toggle exists for emergency disable-and-collect.
Shipper changes:
- shipper/transport.py: ship_tarball() takes commit kwarg, sends
X-Cis490-Code-Commit header. 412 maps to status='fatal' so the
queue doesn't infinite-retry — operator must pull and reinstall
before the next ship will succeed.
- shipper/queue.py: reads meta.json::code_version.commit per
episode, passes through. On 412, logs the receiver's full
remediation block at ERROR level so journalctl on the lab host
shows exactly what to run.
Tests: 9 in test_version_gate (including 2 end-to-end via
starlette.testclient); 2 cover the boundary cases where new commits
land mid-cache and where a missing repo gracefully keeps the prior cache.
157/157 total.
Index schema: existing rows stay valid (commit field is optional
on read). New rows from receiver-direct AND from index_backfill.py
include commit.
279 lines
9.8 KiB
Python
279 lines
9.8 KiB
Python
"""HTTP transport for the lab-host shipper.
|
|
|
|
Two operations against the receiver:
|
|
POST /v1/ping — smoke test
|
|
PUT /v1/episodes/<host>/<episode>.tar.zst — episode upload
|
|
|
|
Auth is mTLS (client cert from wg-pki) when configured. A bearer token
|
|
is supported as a stand-in during early bring-up before the cert is
|
|
issued; production runs should set both.
|
|
|
|
The transport returns small dataclasses rather than throwing — the
|
|
caller (shipper queue) decides whether to retry, move to shipped/, or
|
|
alert. This keeps the retry policy in one place.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import logging
|
|
import ssl
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
from .config import ReceiverEndpoint, ShipperConfig
|
|
|
|
|
|
# Module-level logger used by every function and method in this transport.
log = logging.getLogger("cis490.shipper.transport")
|
|
|
|
|
|
# Sent on every request as the X-Schema-Version header (see _common_headers).
SCHEMA_VERSION = 1
|
|
|
|
|
|
@dataclass(frozen=True)
class PingResult:
    """Immutable outcome of one ``POST /v1/ping`` smoke test.

    The transport hands this back instead of raising, so the caller
    decides how to react to a failed ping.
    """

    # True only for a 200 response whose JSON body has a truthy "ok".
    ok: bool
    # HTTP status of the response; 0 when no response was obtained.
    status_code: int
    # Decoded JSON response body, or None when it could not be decoded.
    body: dict[str, Any] | None
    # Short failure description; None on success.
    error: str | None
|
|
|
|
|
|
@dataclass(frozen=True)
class ShipResult:
    """Immutable outcome of one episode upload attempt.

    ``status`` is one of:
      "stored"          - receiver answered 201
      "already-present" - receiver answered 200
      "conflict"        - receiver answered 409
      "transient"       - no usable response, or a 5xx; worth retrying
      "fatal"           - 412 or any other status; retrying will not help
    """

    # "stored" | "already-present" | "conflict" | "transient" | "fatal"
    status: str
    # HTTP status of the response; 0 when no response was obtained.
    status_code: int
    # The digest that was uploaded for stored / already-present / conflict
    # outcomes; None for every other outcome.
    sha256: str | None
    # Decoded JSON response body, or None when it could not be decoded.
    body: dict[str, Any] | None
    # Short failure description; None when the upload was accepted.
    error: str | None
|
|
|
|
|
|
class _CertNotReadyError(Exception):
    """A configured CA-bundle / client-cert / client-key path does not exist yet.

    Expected during first-boot bring-up: install-lab-host.sh enables the
    shipper before the Pi has issued the mTLS leaf. The transport catches
    this error, logs it once, and tries again on every request, so the
    daemon recovers by itself as soon as the cert lands on disk.
    """
|
|
|
|
|
|
def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
    """Build the value passed to httpx as ``verify=`` for this receiver.

    Returns:
        ``False`` when the receiver URL is not https (no TLS involved);
        otherwise an ``ssl.SSLContext`` that trusts the configured CA
        bundle (or the system defaults when none is configured) and
        carries the client certificate for mTLS when both cert and key
        are configured.

    Raises:
        _CertNotReadyError: a configured ca_bundle / client_cert /
            client_key path does not exist on disk.
    """
    uses_tls = rcv.url.lower().startswith("https://")
    if not uses_tls:
        return False

    # Check the configured paths up front so a missing file surfaces as a
    # recoverable error here, rather than as a FileNotFoundError raised
    # deep inside create_default_context / load_cert_chain.
    configured_paths = {
        "ca_bundle": rcv.ca_bundle,
        "client_cert": rcv.client_cert,
        "client_key": rcv.client_key,
    }
    for label, path in configured_paths.items():
        if not path:
            continue
        if not Path(path).exists():
            raise _CertNotReadyError(f"{label} path missing: {path}")

    ca_file = str(rcv.ca_bundle) if rcv.ca_bundle else None
    ctx = ssl.create_default_context(cafile=ca_file)

    if not rcv.verify_tls:
        # Dev-only escape hatch; production lab hosts should always pin
        # the wg-pki CA. Logged loudly so it cannot slip through unnoticed.
        log.warning("TLS verification disabled — dev-only configuration")
        # check_hostname must be cleared before verify_mode can be CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    wants_client_cert = bool(rcv.client_cert and rcv.client_key)
    if wants_client_cert:
        ctx.load_cert_chain(str(rcv.client_cert), str(rcv.client_key))
    return ctx
|
|
|
|
|
|
class ShipperTransport:
    """Synchronous HTTP client for the receiver's ping and upload endpoints.

    Public methods report their outcome as a ``PingResult`` /
    ``ShipResult`` instead of raising, so the retry policy stays with the
    caller. The TLS context is built lazily: when the configured cert/CA
    files are not on disk yet, every request retries the build and returns
    a "not ready" result until it succeeds.
    """

    # Error text returned while the configured cert/CA files are missing.
    _NOT_READY_ERROR = "mTLS material not yet on disk; waiting for cert delivery"

    # Response status -> (ShipResult.status, echo the uploaded sha256?, error).
    _SHIP_OUTCOMES = {
        201: ("stored", True, None),
        200: ("already-present", True, None),
        409: ("conflict", True, "receiver already has a different sha256 for this id"),
        # Code commit not in the receiver's allow-list. The operator of
        # THIS lab host has to pull main + reinstall; retrying without
        # that won't help, so report fatal to keep the queue from looping.
        412: ("fatal", False, "code commit rejected — pull origin/main and reinstall"),
    }

    def __init__(self, cfg: ShipperConfig) -> None:
        """Store *cfg* and make a first attempt at building the TLS context."""
        self.cfg = cfg
        # None = not built yet; False = plain http; SSLContext = https.
        self._verify: ssl.SSLContext | bool | None = None
        self._cert_warned = False
        # Try once at construction; if certs aren't on disk yet, defer.
        # Each request retries the build until it succeeds, so systemd
        # doesn't crash-loop the unit during first-boot.
        self._try_build_verify()

    def _try_build_verify(self) -> bool:
        """(Re)build the SSL context if it has not been built yet.

        Returns:
            True when the transport is ready to make requests; False when
            the configured cert/CA files are not on disk yet.
        """
        if self._verify is not None:
            return True  # already built (False for http, context for https)
        try:
            self._verify = _build_ssl_context(self.cfg.receiver)
        except _CertNotReadyError as e:
            # Warn only once per outage; later attempts stay quiet.
            if not self._cert_warned:
                log.warning(
                    "shipper waiting on mTLS material (%s); will retry each "
                    "request — this is expected during first-boot. To unblock: "
                    "set host_id in /etc/cis490/lab-host.toml then run "
                    "`sudo /opt/cis490/scripts/install-lab-host.sh` (do NOT "
                    "mint certs by hand). See AGENTS.md → Securing the "
                    "connection (mTLS).", e,
                )
                self._cert_warned = True
            return False
        if self._cert_warned:
            log.info("mTLS material now on disk; shipper transport ready")
            self._cert_warned = False
        return True

    # ---- ping ----------------------------------------------------------

    def ping(self) -> PingResult:
        """POST an empty body to ``/v1/ping`` and report the outcome.

        Returns:
            ``ok=True`` only for a 200 response whose JSON object has a
            truthy ``"ok"``. ``status_code`` is 0 when no response was
            obtained (certs not ready, or an httpx transport error).
        """
        if not self._try_build_verify():
            return PingResult(
                ok=False, status_code=0, body=None,
                error=self._NOT_READY_ERROR,
            )
        url = self._url("/v1/ping")
        headers = self._common_headers()
        try:
            with httpx.Client(verify=self._verify, timeout=self.cfg.request_timeout_s) as c:
                r = c.post(url, headers=headers, content=b"")
        except httpx.HTTPError as e:
            return PingResult(ok=False, status_code=0, body=None, error=str(e))

        body = self._json_object(r)
        if r.status_code == 200 and body is not None and body.get("ok"):
            return PingResult(ok=True, status_code=200, body=body, error=None)
        return PingResult(
            ok=False,
            status_code=r.status_code,
            body=body,
            error=f"unexpected status {r.status_code}",
        )

    # ---- ship ----------------------------------------------------------

    def ship_tarball(
        self,
        episode_id: str,
        tarball_path: Path,
        sha256_hex: str,
        commit: str | None = None,
    ) -> ShipResult:
        """PUT one episode tarball to the receiver.

        Args:
            episode_id: Episode identifier; used in the URL and sent as
                the X-Episode-Id header.
            tarball_path: Local ``.tar.zst`` file to stream as the body.
            sha256_hex: Digest sent as X-Content-SHA256.
            commit: Code commit sent as X-Cis490-Code-Commit when truthy.

        Returns:
            A ``ShipResult``. Local file errors, httpx transport errors,
            missing certs and 5xx responses are "transient"; 412 and any
            other unrecognised status are "fatal".
        """
        if not self._try_build_verify():
            return ShipResult(
                status="transient", status_code=0,
                sha256=None, body=None,
                error=self._NOT_READY_ERROR,
            )
        url = self._url(f"/v1/episodes/{self.cfg.host_id}/{episode_id}.tar.zst")

        # A tarball that is missing or unreadable is a local condition.
        # Report it as transient instead of raising, so the episode is
        # retried rather than dropped.
        try:
            size = tarball_path.stat().st_size
        except OSError as e:
            return ShipResult(
                status="transient", status_code=0,
                sha256=None, body=None, error=str(e),
            )

        headers = self._common_headers() | {
            "Content-Type": "application/zstd",
            "Content-Length": str(size),
            "X-Content-SHA256": sha256_hex,
            "X-Episode-Id": episode_id,
        }
        if commit:
            # Receiver enforces this against its commit-allow-list and
            # rejects with 412 if not in window. See receiver/version_gate.py.
            headers["X-Cis490-Code-Commit"] = commit

        try:
            with tarball_path.open("rb") as body, \
                    httpx.Client(verify=self._verify, timeout=self.cfg.request_timeout_s) as c:
                # httpx streams from a file-like object via the `content=` kwarg.
                r = c.put(url, headers=headers, content=body)
        except (httpx.HTTPError, OSError) as e:
            return ShipResult(
                status="transient", status_code=0,
                sha256=None, body=None, error=str(e),
            )

        body_json = self._json_object(r)
        code = r.status_code

        known = self._SHIP_OUTCOMES.get(code)
        if known is not None:
            status, echo_sha, error = known
            return ShipResult(
                status=status,
                status_code=code,
                sha256=sha256_hex if echo_sha else None,
                body=body_json,
                error=error,
            )
        if 500 <= code < 600:
            return ShipResult(
                status="transient",
                status_code=code,
                sha256=None,
                body=body_json,
                error=f"server error {code}",
            )
        # Anything else (4xx other than 409/412, or an unexpected status):
        # caller-side bug — don't retry.
        return ShipResult(
            status="fatal",
            status_code=code,
            sha256=None,
            body=body_json,
            error=f"client error {code}",
        )

    # ---- helpers -------------------------------------------------------

    def _url(self, path: str) -> str:
        """Join the receiver base URL and *path* (which starts with "/").

        A trailing slash on the configured base URL is dropped so the
        result never contains "//" before the path.
        """
        return f"{self.cfg.receiver.url.rstrip('/')}{path}"

    @staticmethod
    def _json_object(response: Any) -> dict[str, Any] | None:
        """Return the response body as a dict, or None.

        None covers both a body that is not valid JSON and valid JSON
        that is not an object, so result ``body`` fields always match
        their declared ``dict | None`` type.
        """
        try:
            decoded = response.json()
        except ValueError:  # includes JSONDecodeError and UnicodeDecodeError
            return None
        return decoded if isinstance(decoded, dict) else None

    def _common_headers(self) -> dict[str, str]:
        """Headers sent on every request: host id, schema version, optional bearer."""
        h: dict[str, str] = {
            "X-Lab-Host": self.cfg.host_id,
            "X-Schema-Version": str(SCHEMA_VERSION),
        }
        if self.cfg.receiver.bearer_token:
            h["Authorization"] = f"Bearer {self.cfg.receiver.bearer_token}"
        return h
|
|
|
|
|
|
def hash_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 1 MiB chunks, so memory use does not grow with
    file size.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()
|