CIS490/shipper/transport.py
max f8ad02b2d7 Receiver enforces X-Cis490-Code-Commit allow-list (live, auto-refreshed)
Stops out-of-date lab hosts from polluting the dataset with episodes
generated by buggy code. The valid-commits set mirrors the maintainer's
working clone on the Pi automatically — when the maintainer pulls or
pushes a new commit, the receiver picks it up within the 5-second
cache TTL with no service restart.

Receiver changes:

- receiver/version_gate.py (new): VersionGate(repo_path, window).
  Each check() consults a frozenset of the last `window` commit
  hashes from `git -C <repo> log --format=%H -n <window>`, refreshed
  every 5s under a lock. Resilient to transient git failure (keeps
  prior cache so a flaky `git` doesn't lock out every shipper).

- receiver/app.py: PUT extracts X-Cis490-Code-Commit; gate.check()
  before ingest. Rejects with:
    400 + remediation if header missing or malformed
    412 + remediation + your_commit + head_commit if not in window
  Remediation block is verbatim copy-pasteable into the lab-host
  shell:
    cd /opt/cis490 && sudo -u cis490 git pull origin main
    sudo /opt/cis490/scripts/install-lab-host.sh
    sudo systemctl restart cis490-orchestrator

- receiver/store.py: ingest_stream takes commit kwarg, stamps it on
  the index.jsonl row (new optional field). Backfilled rows from
  index_backfill.py also pull commit out of meta.json.

- receiver/config.py + etc/receiver.toml.example: new [version_gate]
  section. enabled=true, repo_path=/home/max/cis490, window=100 by
  default. Enabled toggle exists for emergency disable-and-collect.

Shipper changes:

- shipper/transport.py: ship_tarball() takes commit kwarg, sends
  X-Cis490-Code-Commit header. 412 maps to status='fatal' so the
  queue doesn't infinite-retry — operator must pull and reinstall
  before the next ship will succeed.

- shipper/queue.py: reads meta.json::code_version.commit per
  episode, passes through. On 412, logs the receiver's full
  remediation block at ERROR level so journalctl on the lab host
  shows exactly what to run.

Tests: 9 in test_version_gate (including 2 end-to-end via
starlette.testclient); 2 tests cover the boundary cases where new
commits land mid-cache and where a missing repo gracefully keeps the
prior cache. 157/157 total.

Index schema: existing rows stay valid (commit field is optional
on read). New rows from receiver-direct AND from index_backfill.py
include commit.
2026-05-01 01:38:50 -05:00

279 lines
9.8 KiB
Python

"""HTTP transport for the lab-host shipper.
Two operations against the receiver:
POST /v1/ping — smoke test
PUT /v1/episodes/<host>/<episode>.tar.zst — episode upload
Auth is mTLS (client cert from wg-pki) when configured. A bearer token
is supported as a stand-in during early bring-up before the cert is
issued; production runs should set both.
The transport returns small dataclasses rather than throwing — the
caller (shipper queue) decides whether to retry, move to shipped/, or
alert. This keeps the retry policy in one place.
"""
from __future__ import annotations
import hashlib
import logging
import ssl
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import httpx
from .config import ReceiverEndpoint, ShipperConfig
# Module-level logger; records are emitted under "cis490.shipper.transport".
log = logging.getLogger("cis490.shipper.transport")
# Sent to the receiver as the X-Schema-Version header on every request.
SCHEMA_VERSION = 1
@dataclass(frozen=True)
class PingResult:
    """Immutable outcome of one ``POST /v1/ping`` smoke test."""

    # True when the receiver answered the ping successfully.
    ok: bool
    # HTTP status code of the response; 0 when no response was received.
    status_code: int
    # Decoded JSON body of the response when one was obtained, else None.
    body: dict[str, Any] | None
    # Human-readable failure description; None on success.
    error: str | None
@dataclass(frozen=True)
class ShipResult:
    """Immutable outcome of one episode-tarball upload attempt.

    Handed back to the caller instead of raising, so the retry policy
    can be decided from ``status`` in one place.
    """

    # One of: "stored" | "already-present" | "conflict" | "transient" | "fatal"
    status: str
    # HTTP status code of the response; 0 when no response was received.
    status_code: int
    # Digest sent with the upload; None for transient and fatal outcomes.
    sha256: str | None
    # Decoded JSON body of the response when one was obtained, else None.
    body: dict[str, Any] | None
    # Human-readable failure description; None when the upload was accepted.
    error: str | None
class _CertNotReadyError(Exception):
    """A configured cert/CA path does not exist on disk yet.

    Expected during first-boot bring-up: install-lab-host.sh enables the
    shipper before the Pi has issued the mTLS leaf. The transport catches
    this error, logs once, and retries on each request, so the daemon
    self-heals once the cert lands.
    """
def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
    """Build the TLS setting used for requests to the receiver.

    Args:
        rcv: Receiver endpoint configuration. Reads ``url``, ``ca_bundle``,
            ``client_cert``, ``client_key`` and ``verify_tls``.

    Returns:
        ``False`` when the URL is not ``https://`` (there is no TLS to
        configure); otherwise an ``ssl.SSLContext`` that trusts the
        configured CA bundle and, when both a client cert and key are
        configured, presents them for mTLS. httpx accepts either value
        for its ``verify=`` argument.

    Raises:
        _CertNotReadyError: A configured path is not on disk, either at
            the pre-flight check or because it vanished before ``ssl``
            could read it. Callers treat this as recoverable and retry.
    """
    if not rcv.url.lower().startswith("https://"):
        return False

    # Pre-flight the configured paths so the error names the offending
    # setting rather than surfacing a bare FileNotFoundError from ssl.
    configured_paths = (
        ("ca_bundle", rcv.ca_bundle),
        ("client_cert", rcv.client_cert),
        ("client_key", rcv.client_key),
    )
    for label, path in configured_paths:
        if path and not Path(path).exists():
            raise _CertNotReadyError(f"{label} path missing: {path}")

    cafile = str(rcv.ca_bundle) if rcv.ca_bundle else None
    try:
        ctx = ssl.create_default_context(cafile=cafile)
    except FileNotFoundError as err:
        # The file disappeared between the check above and the load;
        # keep the failure on the recoverable path.
        raise _CertNotReadyError(f"ca_bundle path missing: {err}") from err

    if not rcv.verify_tls:
        # Dev-only path; production lab-hosts should always pin the
        # wg-pki CA. Logged loudly so it doesn't slip through.
        log.warning("TLS verification disabled — dev-only configuration")
        # check_hostname must be cleared before verify_mode can be CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    if rcv.client_cert and rcv.client_key:
        try:
            ctx.load_cert_chain(str(rcv.client_cert), str(rcv.client_key))
        except FileNotFoundError as err:
            raise _CertNotReadyError(f"client cert/key path missing: {err}") from err
    return ctx
class ShipperTransport:
    """Synchronous HTTP client for the receiver's ping and upload endpoints.

    Network-level failures and HTTP error statuses are reported through
    ``PingResult`` / ``ShipResult`` values rather than exceptions, so the
    caller owns the retry policy. The TLS setting is built lazily: when
    the configured cert material is not on disk yet, every request
    re-attempts the build and reports a not-ready result until it
    succeeds.
    """

    # 4xx codes that describe a temporary condition (request timeout,
    # rate limiting); the same upload may succeed later, so retry it.
    _RETRYABLE_CLIENT_STATUSES = frozenset({408, 429})

    # Error text reported while the TLS setting cannot be built yet.
    _NOT_READY_ERROR = "mTLS material not yet on disk; waiting for cert delivery"

    def __init__(self, cfg: ShipperConfig) -> None:
        """Store *cfg* and make a first attempt at building the TLS setting."""
        self.cfg = cfg
        # None = not built yet; False = plain http; SSLContext = https.
        self._verify: ssl.SSLContext | bool | None = None
        self._cert_warned = False
        # Try once at construction; if certs aren't on disk yet, defer.
        # Each request will retry the build until it succeeds, so
        # systemd doesn't crash-loop the unit during first-boot.
        self._try_build_verify()

    def _try_build_verify(self) -> bool:
        """(Re)build the TLS setting if it has not been built yet.

        Returns:
            True when the transport is ready to make requests; False
            when the configured cert material is not on disk yet.
        """
        if self._verify is not None:
            # Already built: False for http, a context for https.
            return True
        try:
            self._verify = _build_ssl_context(self.cfg.receiver)
        except _CertNotReadyError as e:
            # Warn once per outage instead of on every request.
            if not self._cert_warned:
                log.warning(
                    "shipper waiting on mTLS material (%s); will retry each "
                    "request — this is expected during first-boot. To unblock: "
                    "set host_id in /etc/cis490/lab-host.toml then run "
                    "`sudo /opt/cis490/scripts/install-lab-host.sh` (do NOT "
                    "mint certs by hand). See AGENTS.md → Securing the "
                    "connection (mTLS).", e,
                )
                self._cert_warned = True
            return False
        if self._cert_warned:
            log.info("mTLS material now on disk; shipper transport ready")
            self._cert_warned = False
        return True

    # ---- ping ----------------------------------------------------------

    def ping(self) -> PingResult:
        """POST an empty body to ``/v1/ping`` and report the outcome.

        Returns:
            ``ok=True`` only for a 200 response whose JSON object has a
            truthy ``"ok"``. ``status_code`` is 0 when no response was
            received (transport not ready, or an httpx error).
        """
        if not self._try_build_verify():
            return PingResult(
                ok=False, status_code=0, body=None,
                error=self._NOT_READY_ERROR,
            )
        url = f"{self.cfg.receiver.url}/v1/ping"
        headers = self._common_headers()
        try:
            with httpx.Client(
                verify=self._verify, timeout=self.cfg.request_timeout_s
            ) as client:
                resp = client.post(url, headers=headers, content=b"")
        except httpx.HTTPError as e:
            return PingResult(ok=False, status_code=0, body=None, error=str(e))

        body = self._json_object(resp)
        if resp.status_code == 200 and body is not None and body.get("ok"):
            return PingResult(ok=True, status_code=200, body=body, error=None)
        return PingResult(
            ok=False,
            status_code=resp.status_code,
            body=body,
            error=f"unexpected status {resp.status_code}",
        )

    # ---- ship ----------------------------------------------------------

    def ship_tarball(
        self,
        episode_id: str,
        tarball_path: Path,
        sha256_hex: str,
        commit: str | None = None,
    ) -> ShipResult:
        """PUT one episode tarball to the receiver.

        Args:
            episode_id: Episode identifier; used in the URL and the
                ``X-Episode-Id`` header.
            tarball_path: File streamed as the request body.
            sha256_hex: Digest sent as ``X-Content-SHA256``.
            commit: Optional code commit sent as ``X-Cis490-Code-Commit``;
                the header is omitted when this is empty or None.

        Returns:
            A ``ShipResult`` whose ``status`` is "stored" (201),
            "already-present" (200), "conflict" (409), "transient"
            (transport not ready, httpx error, 5xx, 408, 429) or "fatal"
            (412 and every other status).
        """
        if not self._try_build_verify():
            return ShipResult(
                status="transient", status_code=0,
                sha256=None, body=None,
                error=self._NOT_READY_ERROR,
            )
        url = (
            f"{self.cfg.receiver.url}/v1/episodes/"
            f"{self.cfg.host_id}/{episode_id}.tar.zst"
        )
        # NOTE(review): stat() here and open() below can raise OSError
        # (e.g. the tarball was removed); that propagates to the caller
        # instead of becoming a ShipResult — confirm the queue expects it.
        size = tarball_path.stat().st_size
        headers = self._common_headers() | {
            "Content-Type": "application/zstd",
            "Content-Length": str(size),
            "X-Content-SHA256": sha256_hex,
            "X-Episode-Id": episode_id,
        }
        if commit:
            # Receiver enforces this against its commit-allow-list and
            # rejects with 412 if not in window. See receiver/version_gate.py.
            headers["X-Cis490-Code-Commit"] = commit
        try:
            with httpx.Client(
                verify=self._verify, timeout=self.cfg.request_timeout_s
            ) as client, tarball_path.open("rb") as fh:
                # httpx streams from a file-like object via the `content=` kwarg.
                resp = client.put(url, headers=headers, content=fh)
        except httpx.HTTPError as e:
            return ShipResult(
                status="transient",
                status_code=0,
                sha256=None,
                body=None,
                error=str(e),
            )

        code = resp.status_code
        status, error = self._classify_status(code)
        return ShipResult(
            status=status,
            status_code=code,
            # The digest is echoed back only for stored / already-present /
            # conflict outcomes.
            sha256=sha256_hex if code in (200, 201, 409) else None,
            body=self._json_object(resp),
            error=error,
        )

    # ---- helpers -------------------------------------------------------

    @classmethod
    def _classify_status(cls, status_code: int) -> tuple[str, str | None]:
        """Map an upload response code to ``(ShipResult.status, error)``."""
        if status_code == 201:
            return "stored", None
        if status_code == 200:
            return "already-present", None
        if status_code == 409:
            return "conflict", "receiver already has a different sha256 for this id"
        if status_code == 412:
            # Code-commit not in receiver's allow-list. The operator
            # of THIS lab host needs to pull main + reinstall;
            # retrying without that won't help. Treat as fatal so
            # queue.run_once() doesn't loop on it.
            return "fatal", "code commit rejected — pull origin/main and reinstall"
        if status_code in cls._RETRYABLE_CLIENT_STATUSES:
            return "transient", f"retryable client error {status_code}"
        if 500 <= status_code < 600:
            return "transient", f"server error {status_code}"
        if 400 <= status_code < 500:
            # Any other 4xx: caller-side bug — don't retry.
            return "fatal", f"client error {status_code}"
        # 1xx/3xx or an unhandled 2xx: not something the receiver is
        # expected to send for an upload, and a retry would not change it.
        return "fatal", f"unexpected status {status_code}"

    @staticmethod
    def _json_object(response: httpx.Response) -> dict[str, Any] | None:
        """Return the response body as a JSON object, or None.

        None covers both an undecodable body and valid JSON that is not
        an object, so callers can rely on the ``dict | None`` contract.
        """
        try:
            decoded = response.json()
        except (ValueError, RecursionError):
            # Not JSON (or absurdly nested); the body is optional detail.
            return None
        return decoded if isinstance(decoded, dict) else None

    def _common_headers(self) -> dict[str, str]:
        """Return the headers sent on every request.

        Includes the bearer token only when one is configured.
        """
        h: dict[str, str] = {
            "X-Lab-Host": self.cfg.host_id,
            "X-Schema-Version": str(SCHEMA_VERSION),
        }
        if self.cfg.receiver.bearer_token:
            h["Authorization"] = f"Bearer {self.cfg.receiver.bearer_token}"
        return h
def hash_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the lowercase hex SHA-256 digest of the file at *path*.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so
    large tarballs are never held in memory whole.

    Args:
        path: File to hash.
        chunk_size: Bytes per read; defaults to 1 MiB.

    Raises:
        ValueError: ``chunk_size`` is not positive. A zero-byte read
            would end the loop at once and yield the empty-input digest.
        OSError: The file cannot be opened or read.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    digest = hashlib.sha256()
    with path.open("rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()