First-boot bring-up enables cis490-shipper before the Pi has issued the mTLS leaf, so ssl.create_default_context(cafile=...) raised FileNotFoundError out of __init__ and systemd crash-looped the unit every RestartSec=5. Now the transport pre-flights the configured ca_bundle / client_cert / client_key paths, raises a recoverable _CertNotReadyError, and ping/ship_tarball retry the build on each request — daemon self-heals once the cert lands without a restart. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
258 lines
8.7 KiB
Python
258 lines
8.7 KiB
Python
"""HTTP transport for the lab-host shipper.
|
|
|
|
Two operations against the receiver:
|
|
POST /v1/ping — smoke test
|
|
PUT /v1/episodes/<host>/<episode>.tar.zst — episode upload
|
|
|
|
Auth is mTLS (client cert from wg-pki) when configured. A bearer token
|
|
is supported as a stand-in during early bring-up before the cert is
|
|
issued; production runs should set both.
|
|
|
|
The transport returns small dataclasses rather than throwing — the
|
|
caller (shipper queue) decides whether to retry, move to shipped/, or
|
|
alert. This keeps the retry policy in one place.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import logging
|
|
import ssl
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
from .config import ReceiverEndpoint, ShipperConfig
|
|
|
|
|
|
# Module-level logger; every message from this transport is emitted under
# the "cis490.shipper.transport" logger name.
log = logging.getLogger("cis490.shipper.transport")
|
|
|
|
|
|
# Wire-format version; sent to the receiver as the X-Schema-Version header
# on every request built by this module.
SCHEMA_VERSION = 1
|
|
|
|
|
|
@dataclass(frozen=True)
class PingResult:
    """Immutable outcome of a single ``POST /v1/ping`` attempt."""

    # True only when the receiver answered 200 with a truthy "ok" in its
    # JSON body.
    ok: bool
    # HTTP status code of the response; 0 when no response was obtained.
    status_code: int
    # Parsed JSON response body, or None when there was no response or the
    # payload was not valid JSON.
    body: dict[str, Any] | None
    # Human-readable failure description; None on success.
    error: str | None
|
|
|
|
|
|
@dataclass(frozen=True)
class ShipResult:
    """Immutable outcome of a single episode-tarball upload attempt."""

    # One of: "stored" | "already-present" | "conflict" | "transient" | "fatal"
    status: str
    # HTTP status code of the response; 0 when no response was obtained.
    status_code: int
    # The caller-supplied digest, echoed back when the receiver answered
    # 201, 200 or 409; None for every other outcome.
    sha256: str | None
    # Parsed JSON response body, or None when there was no response or the
    # payload was not valid JSON.
    body: dict[str, Any] | None
    # Human-readable failure description; None for "stored" and
    # "already-present".
    error: str | None
|
|
|
|
|
|
class _CertNotReadyError(Exception):
|
|
"""Configured cert/CA paths aren't on disk yet.
|
|
|
|
Raised during first-boot bring-up: install-lab-host.sh enables the
|
|
shipper before the Pi has issued the mTLS leaf. The transport
|
|
catches this, logs once, and retries on each request so the daemon
|
|
self-heals when the cert lands."""
|
|
|
|
|
|
def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
|
|
"""Build an SSL context honoring the wg-pki CA bundle + client cert.
|
|
|
|
Returns True / a bundle path / a context. httpx accepts all three;
|
|
we use a context so we can attach the client cert for mTLS."""
|
|
if not rcv.url.lower().startswith("https://"):
|
|
return False
|
|
# Pre-flight check the configured paths so we raise a recoverable
|
|
# error here instead of letting ssl raise FileNotFoundError deep
|
|
# inside create_default_context / load_cert_chain.
|
|
for label, path in (
|
|
("ca_bundle", rcv.ca_bundle),
|
|
("client_cert", rcv.client_cert),
|
|
("client_key", rcv.client_key),
|
|
):
|
|
if path and not Path(path).exists():
|
|
raise _CertNotReadyError(f"{label} path missing: {path}")
|
|
ctx = ssl.create_default_context(
|
|
cafile=str(rcv.ca_bundle) if rcv.ca_bundle else None,
|
|
)
|
|
if not rcv.verify_tls:
|
|
# Dev-only path; production lab-hosts should always pin the
|
|
# wg-pki CA. Logged loudly so it doesn't slip through.
|
|
log.warning("TLS verification disabled — dev-only configuration")
|
|
ctx.check_hostname = False
|
|
ctx.verify_mode = ssl.CERT_NONE
|
|
if rcv.client_cert and rcv.client_key:
|
|
ctx.load_cert_chain(str(rcv.client_cert), str(rcv.client_key))
|
|
return ctx
|
|
|
|
|
|
class ShipperTransport:
    """Synchronous HTTP client for the receiver's ping and upload endpoints.

    Network and HTTP-level outcomes are reported as ``PingResult`` /
    ``ShipResult`` values so the caller owns the retry policy. The TLS
    context is built lazily: if the configured cert material is not on
    disk yet, construction still succeeds and every request retries the
    build until it works.
    """

    # 4xx codes that describe a temporary condition (request timeout,
    # rate limiting) rather than a malformed request; retrying can succeed.
    _RETRYABLE_CLIENT_CODES = frozenset({408, 429})

    _CERT_WAIT_MSG = "mTLS material not yet on disk; waiting for cert delivery"

    def __init__(self, cfg: ShipperConfig) -> None:
        self.cfg = cfg
        # None = not built yet; False = plain http; SSLContext = https.
        self._verify: ssl.SSLContext | bool | None = None
        # Tracks whether the "waiting on certs" warning was already logged.
        self._cert_warned = False
        # Try once at construction; if certs aren't on disk yet, defer.
        # Each request will retry the build until it succeeds, so
        # systemd doesn't crash-loop the unit during first-boot.
        self._try_build_verify()

    def _try_build_verify(self) -> bool:
        """(Re)build the SSL context if it has not been built yet.

        Returns True if the transport is ready to make requests; False if
        the cert material isn't on disk yet. Logs a warning the first time
        the build is deferred and an info message once it recovers.
        """
        if self._verify is not None:
            return True  # already built (False for http, context for https)
        try:
            self._verify = _build_ssl_context(self.cfg.receiver)
        except _CertNotReadyError as e:
            if not self._cert_warned:
                log.warning(
                    "shipper waiting on mTLS material (%s); will retry each request",
                    e,
                )
                self._cert_warned = True
            return False
        if self._cert_warned:
            log.info("mTLS material now on disk; shipper transport ready")
            self._cert_warned = False
        return True

    # ---- ping ----------------------------------------------------------

    def ping(self) -> PingResult:
        """POST to ``/v1/ping`` and report whether the receiver acknowledged.

        ``ok`` is True only for a 200 response whose JSON object has a
        truthy ``"ok"``. Transport errors and a not-yet-ready TLS context
        are returned as ``ok=False`` with ``status_code=0``.
        """
        if not self._try_build_verify():
            return PingResult(
                ok=False, status_code=0, body=None, error=self._CERT_WAIT_MSG,
            )
        url = self._endpoint("/v1/ping")
        headers = self._common_headers()
        try:
            with httpx.Client(
                verify=self._verify, timeout=self.cfg.request_timeout_s
            ) as client:
                r = client.post(url, headers=headers, content=b"")
        except httpx.HTTPError as e:
            return PingResult(
                ok=False, status_code=0, body=None, error=self._describe_error(e),
            )

        body = self._json_object(r)
        if r.status_code == 200:
            if body is not None and body.get("ok"):
                return PingResult(ok=True, status_code=200, body=body, error=None)
            # The status is fine; it is the payload that is wrong.
            error = "receiver answered 200 without ok=true in the JSON body"
        else:
            error = f"unexpected status {r.status_code}"
        return PingResult(
            ok=False, status_code=r.status_code, body=body, error=error,
        )

    # ---- ship ----------------------------------------------------------

    def ship_tarball(
        self,
        episode_id: str,
        tarball_path: Path,
        sha256_hex: str,
    ) -> ShipResult:
        """PUT one episode tarball to the receiver and classify the outcome.

        Args:
            episode_id: Episode identifier; used in the URL and sent as
                ``X-Episode-Id``.
            tarball_path: Local file streamed as the request body.
            sha256_hex: Digest of the tarball, sent as ``X-Content-SHA256``
                and echoed in the result when the receiver answers 201,
                200 or 409.

        Returns:
            A ``ShipResult`` whose ``status`` is "stored" (201),
            "already-present" (200), "conflict" (409), "transient"
            (transport error, cert not ready, 5xx, 408, 429) or "fatal"
            (anything else).

        Raises:
            OSError: if the tarball cannot be stat'ed or opened; local
                file errors are not converted into a result.
        """
        if not self._try_build_verify():
            return ShipResult(
                status="transient", status_code=0,
                sha256=None, body=None, error=self._CERT_WAIT_MSG,
            )
        url = self._endpoint(
            f"/v1/episodes/{self.cfg.host_id}/{episode_id}.tar.zst"
        )
        size = tarball_path.stat().st_size
        headers = self._common_headers() | {
            "Content-Type": "application/zstd",
            "Content-Length": str(size),
            "X-Content-SHA256": sha256_hex,
            "X-Episode-Id": episode_id,
        }

        try:
            with httpx.Client(
                verify=self._verify, timeout=self.cfg.request_timeout_s
            ) as client:
                with tarball_path.open("rb") as body:
                    # httpx streams from a file-like object via `content=`.
                    r = client.put(url, headers=headers, content=body)
        except httpx.HTTPError as e:
            return ShipResult(
                status="transient", status_code=0,
                sha256=None, body=None, error=self._describe_error(e),
            )

        status, error = self._classify_ship_status(r.status_code)
        # Only outcomes where the receiver took a position on this
        # episode's content carry the digest back to the caller.
        acknowledged = status in ("stored", "already-present", "conflict")
        return ShipResult(
            status=status,
            status_code=r.status_code,
            sha256=sha256_hex if acknowledged else None,
            body=self._json_object(r),
            error=error,
        )

    # ---- helpers -------------------------------------------------------

    @classmethod
    def _classify_ship_status(cls, status_code: int) -> tuple[str, str | None]:
        """Map an upload response code to ``(ShipResult.status, error text)``."""
        if status_code == 201:
            return "stored", None
        if status_code == 200:
            return "already-present", None
        if status_code == 409:
            return "conflict", "receiver already has a different sha256 for this id"
        if 500 <= status_code < 600:
            return "transient", f"server error {status_code}"
        if status_code in cls._RETRYABLE_CLIENT_CODES:
            # Timeout / throttling: the request itself is fine, so the
            # episode must stay queued instead of being dropped as fatal.
            return "transient", f"retryable client error {status_code}"
        if 400 <= status_code < 500:
            # Other 4xx: caller-side bug — don't retry.
            return "fatal", f"client error {status_code}"
        # 1xx / unhandled 2xx / 3xx: not part of the receiver contract.
        return "fatal", f"unexpected status {status_code}"

    @staticmethod
    def _json_object(response: httpx.Response) -> dict[str, Any] | None:
        """Best-effort parse of a response body; None unless it is a JSON object."""
        try:
            parsed = response.json()
        except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
            return None
        return parsed if isinstance(parsed, dict) else None

    @staticmethod
    def _describe_error(exc: Exception) -> str:
        """Return the exception message, or its type name when the message is empty."""
        return str(exc) or type(exc).__name__

    def _endpoint(self, path: str) -> str:
        """Join the receiver base URL and *path* without doubling the slash."""
        return f"{self.cfg.receiver.url.rstrip('/')}{path}"

    def _common_headers(self) -> dict[str, str]:
        """Headers sent on every request; adds bearer auth when a token is set."""
        h: dict[str, str] = {
            "X-Lab-Host": self.cfg.host_id,
            "X-Schema-Version": str(SCHEMA_VERSION),
        }
        if self.cfg.receiver.bearer_token:
            h["Authorization"] = f"Bearer {self.cfg.receiver.bearer_token}"
        return h
|
|
|
|
|
|
def hash_file(path: Path, *, chunk_size: int = 1024 * 1024) -> str:
    """Return the lowercase hex SHA-256 digest of the file at *path*.

    The file is read in ``chunk_size``-byte pieces so memory use stays
    bounded regardless of file size.

    Args:
        path: File to hash; opened in binary mode.
        chunk_size: Bytes per read; defaults to 1 MiB.

    Raises:
        ValueError: if ``chunk_size`` is not positive (a zero-byte read
            would end the loop immediately and hash nothing).
        OSError: if the file cannot be opened or read.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    digest = hashlib.sha256()
    with path.open("rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
|