CIS490/shipper/transport.py
max 7c9f9582ca Lab-host shipper + receiver /v1/ping + install scripts
Implements the deployment loop end-to-end on the CIS490 side:

shipper/
  config.py      ShipperConfig (host_id, paths, receiver endpoint, mTLS)
  transport.py   httpx-based PUT + ping with mTLS + bearer support
  queue.py       scan data/episodes/, tar+zstd via system zstd, ship,
                 retire to data/shipped/. Idempotent across crashes per
                 the state machine in docs/transport.md.
  __main__.py    CLI: --ping (smoke test), --once (one pass), or daemon

receiver/app.py: new POST /v1/ping that requires the same auth as PUT
  /v1/episodes but writes nothing. Used by `cis490-shipper --ping`
  during lab-host bring-up to verify the WG/Caddy/mTLS path before
  shipping any real bytes.

etc/
  cis490-shipper.service       systemd unit for the lab-host shipper
  cis490-orchestrator.service  systemd unit for the lab-host queue
                               (kept disabled by default until queue
                               mode lands)
  lab-host.toml.example        config template

scripts/
  install-lab-host.sh   idempotent installer; verifies prereqs,
                        creates cis490 service user, syncs repo to
                        /opt/cis490, builds venv, drops systemd units
                        and config template
  install-receiver.sh   same, for the receiver role on the central WG
                        node (Pi5 in our setup)

tests/test_shipper.py  11 end-to-end tests against a real Uvicorn
                       server hosting the receiver app. Exercises
                       ping, tar+ship, idempotent re-ship, 409
                       conflict, transient (receiver down), tarball
                       round-trip via system zstd.

AGENTS.md  guidance for AI agents working on this and sibling repos.
           Headline: when you hit an issue you can't fully fix in
           scope, file a Forgejo issue rather than leaving a TODO.

51/51 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 23:41:32 -05:00

203 lines
6.3 KiB
Python

"""HTTP transport for the lab-host shipper.

Two operations against the receiver:

    POST /v1/ping                              — smoke test
    PUT  /v1/episodes/<host>/<episode>.tar.zst — episode upload

Auth is mTLS (client cert from wg-pki) when configured. A bearer token
is supported as a stand-in during early bring-up before the cert is
issued; production runs should set both.

The transport returns small dataclasses rather than raising — the
caller (shipper queue) decides whether to retry, move to shipped/, or
alert. This keeps the retry policy in one place.
"""
from __future__ import annotations
import hashlib
import logging
import ssl
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import httpx
from .config import ReceiverEndpoint, ShipperConfig
# Module-level logger; the name is a fixed string rather than derived from __name__.
log = logging.getLogger("cis490.shipper.transport")
# Version number advertised to the receiver in the X-Schema-Version header
# that accompanies every request built by this module.
SCHEMA_VERSION = 1
@dataclass(frozen=True)
class PingResult:
    """Immutable outcome of one ``POST /v1/ping`` round-trip."""

    # True only when the receiver answered HTTP 200 with a JSON object
    # whose "ok" entry is truthy.
    ok: bool
    # HTTP status of the reply; 0 means the request failed before any
    # reply was received.
    status_code: int
    # Decoded JSON reply, or None when there was no reply or it could not
    # be decoded.
    body: dict[str, Any] | None
    # Human-readable failure description; None on success.
    error: str | None
@dataclass(frozen=True)
class ShipResult:
    """Immutable outcome of one episode-tarball upload attempt."""

    # One of:
    #   "stored"          — receiver accepted the upload (HTTP 201)
    #   "already-present" — receiver reported it already has it (HTTP 200)
    #   "conflict"        — receiver holds a different sha256 for this id (HTTP 409)
    #   "transient"       — no reply, or a 5xx reply
    #   "fatal"           — anything else; not worth retrying
    status: str  # "stored" | "already-present" | "conflict" | "transient" | "fatal"
    # HTTP status of the reply; 0 means no reply was received.
    status_code: int
    # Digest that was sent, echoed back for stored / already-present /
    # conflict results; None for transient and fatal results.
    sha256: str | None
    # Decoded JSON reply, or None when absent or undecodable.
    body: dict[str, Any] | None
    # Human-readable failure description; None when the upload succeeded.
    error: str | None
def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
    """Build the ``verify=`` value httpx should use for this receiver.

    Args:
        rcv: Receiver endpoint settings. Reads ``url``, ``ca_bundle``,
            ``verify_tls``, ``client_cert`` and ``client_key``.

    Returns:
        ``False`` when the URL is not ``https://`` (there is no TLS to
        configure). Otherwise an ``ssl.SSLContext`` that trusts
        ``rcv.ca_bundle`` (or the platform defaults when no bundle is
        set) and presents the client certificate for mTLS when both
        ``client_cert`` and ``client_key`` are configured.

    Raises:
        Whatever ``ssl`` raises when the CA bundle or the cert/key files
        cannot be loaded; a broken PKI setup should fail at start-up.
    """
    if not rcv.url.lower().startswith("https://"):
        return False

    cafile = str(rcv.ca_bundle) if rcv.ca_bundle else None
    ctx = ssl.create_default_context(cafile=cafile)

    if not rcv.verify_tls:
        # Dev-only path; production lab-hosts should always pin the
        # wg-pki CA. Logged loudly so it doesn't slip through.
        log.warning("TLS verification disabled — dev-only configuration")
        # Order matters: ssl rejects CERT_NONE while hostname checking
        # is still enabled.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    if rcv.client_cert and rcv.client_key:
        ctx.load_cert_chain(str(rcv.client_cert), str(rcv.client_key))
    elif rcv.client_cert or rcv.client_key:
        # A half-configured pair would otherwise be dropped silently and
        # only surface later as an opaque handshake/auth failure.
        log.warning(
            "client_cert and client_key must both be set for mTLS — "
            "client certificate not loaded"
        )
    return ctx
class ShipperTransport:
    """Blocking HTTP client for the receiver's ping and episode-upload endpoints.

    Public methods report their outcome as a result dataclass
    (``PingResult`` / ``ShipResult``) instead of raising for transport,
    HTTP, or local tarball-read failures, so the caller owns the retry
    policy. A fresh ``httpx.Client`` is opened for each request.
    """

    def __init__(self, cfg: ShipperConfig) -> None:
        """Keep *cfg* and build the TLS settings reused by every request."""
        self.cfg = cfg
        self._verify = _build_ssl_context(cfg.receiver)

    # ---- ping ----------------------------------------------------------
    def ping(self) -> PingResult:
        """POST an empty body to ``/v1/ping`` and report the receiver's answer.

        Returns:
            ``ok=True`` only for an HTTP 200 whose JSON body is an object
            with a truthy ``"ok"`` entry. An ``httpx.HTTPError`` yields
            ``status_code=0`` with the exception text in ``error``; any
            other reply yields ``ok=False`` with the reply's status.
        """
        url = self._endpoint("v1/ping")
        headers = self._common_headers()
        try:
            with httpx.Client(
                verify=self._verify, timeout=self.cfg.request_timeout_s
            ) as c:
                r = c.post(url, headers=headers, content=b"")
        except httpx.HTTPError as e:
            return PingResult(ok=False, status_code=0, body=None, error=str(e))

        body = self._json_body(r)
        if r.status_code == 200 and isinstance(body, dict) and body.get("ok"):
            return PingResult(ok=True, status_code=200, body=body, error=None)
        return PingResult(
            ok=False,
            status_code=r.status_code,
            body=body,
            error=f"unexpected status {r.status_code}",
        )

    # ---- ship ----------------------------------------------------------
    def ship_tarball(
        self,
        episode_id: str,
        tarball_path: Path,
        sha256_hex: str,
    ) -> ShipResult:
        """PUT one episode tarball to ``/v1/episodes/<host>/<episode>.tar.zst``.

        Args:
            episode_id: Episode identifier; used in the URL and sent as
                ``X-Episode-Id``.
            tarball_path: Local file whose bytes are streamed as the body.
            sha256_hex: Digest sent as ``X-Content-SHA256`` and echoed in
                the result when the receiver answers 201, 200 or 409.

        Returns:
            A ``ShipResult`` whose ``status`` is ``"stored"`` (201),
            ``"already-present"`` (200), ``"conflict"`` (409),
            ``"transient"`` (5xx or ``httpx.HTTPError``) or ``"fatal"``
            (any other status, or the tarball could not be read).
        """
        url = self._endpoint(
            f"v1/episodes/{self.cfg.host_id}/{episode_id}.tar.zst"
        )

        # Local file problems are kept apart from network problems: they
        # are reported as a result (matching the rest of this class)
        # and must not be mistaken for a receiver outage.
        try:
            size = tarball_path.stat().st_size
            payload = tarball_path.open("rb")
        except OSError as e:
            return ShipResult(
                status="fatal",
                status_code=0,
                sha256=None,
                body=None,
                error=f"cannot read tarball {tarball_path}: {e}",
            )

        with payload:
            headers = self._common_headers() | {
                "Content-Type": "application/zstd",
                "Content-Length": str(size),
                "X-Content-SHA256": sha256_hex,
                "X-Episode-Id": episode_id,
            }
            try:
                with httpx.Client(
                    verify=self._verify, timeout=self.cfg.request_timeout_s
                ) as c:
                    # httpx streams from a file-like object via the
                    # `content=` kwarg.
                    r = c.put(url, headers=headers, content=payload)
            except httpx.HTTPError as e:
                return ShipResult(
                    status="transient",
                    status_code=0,
                    sha256=None,
                    body=None,
                    error=str(e),
                )

        body_json = self._json_body(r)
        code = r.status_code
        if code == 201:
            return ShipResult(
                status="stored",
                status_code=201,
                sha256=sha256_hex,
                body=body_json,
                error=None,
            )
        if code == 200:
            return ShipResult(
                status="already-present",
                status_code=200,
                sha256=sha256_hex,
                body=body_json,
                error=None,
            )
        if code == 409:
            return ShipResult(
                status="conflict",
                status_code=409,
                sha256=sha256_hex,
                body=body_json,
                error="receiver already has a different sha256 for this id",
            )
        if 500 <= code < 600:
            return ShipResult(
                status="transient",
                status_code=code,
                sha256=None,
                body=body_json,
                error=f"server error {code}",
            )
        # 4xx other than 409: caller-side bug — don't retry. Anything
        # else that lands here (e.g. an unfollowed redirect) is equally
        # not retryable, but is not labelled a client error.
        label = "client error" if 400 <= code < 500 else "unexpected status"
        return ShipResult(
            status="fatal",
            status_code=code,
            sha256=None,
            body=body_json,
            error=f"{label} {code}",
        )

    # ---- helpers -------------------------------------------------------
    def _endpoint(self, path: str) -> str:
        """Join *path* onto the receiver base URL with exactly one slash.

        A base URL configured with a trailing slash would otherwise
        produce ``//v1/...``.
        """
        base = self.cfg.receiver.url.rstrip("/")
        return f"{base}/{path.lstrip('/')}"

    @staticmethod
    def _json_body(response: httpx.Response) -> dict[str, Any] | None:
        """Return the decoded JSON reply, or None when it is not valid JSON.

        The body is informational only, so an undecodable reply is not
        an error; the HTTP status decides the outcome.
        """
        try:
            return response.json()
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError.
            return None

    def _common_headers(self) -> dict[str, str]:
        """Headers sent on every request: host id, schema version, and the
        bearer token when one is configured."""
        h: dict[str, str] = {
            "X-Lab-Host": self.cfg.host_id,
            "X-Schema-Version": str(SCHEMA_VERSION),
        }
        if self.cfg.receiver.bearer_token:
            h["Authorization"] = f"Bearer {self.cfg.receiver.bearer_token}"
        return h
def hash_file(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is consumed in 1 MiB slices, so it is never held in memory
    in full.
    """
    slice_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while piece := stream.read(slice_size):
            digest.update(piece)
    return digest.hexdigest()