shipper: defer SSL context build until cert/CA paths exist (closes #11)

First-boot bring-up enables cis490-shipper before the Pi has issued the
mTLS leaf, so ssl.create_default_context(cafile=...) raised
FileNotFoundError out of __init__ and systemd crash-looped the unit
every RestartSec=5. Now the transport pre-flights the configured
ca_bundle / client_cert / client_key paths, raises a recoverable
_CertNotReadyError, and ping/ship_tarball retry the build on each
request — daemon self-heals once the cert lands without a restart.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
max 2026-04-30 16:13:59 -05:00
parent 95ac56a382
commit 86a088c204
2 changed files with 78 additions and 1 deletions

View file

@ -50,6 +50,15 @@ class ShipResult:
error: str | None
class _CertNotReadyError(Exception):
    """Signal that a configured cert/CA file is not on disk yet.

    This is the recoverable "not ready" condition seen during first-boot
    bring-up, where install-lab-host.sh enables the shipper before the Pi
    has issued the mTLS leaf. The SSL-context builder raises it when one
    of the configured ca_bundle / client_cert / client_key paths does not
    exist. The transport catches it, logs a single warning, and attempts
    the build again on every request, so the daemon recovers on its own
    once the files land.
    """
def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
"""Build an SSL context honoring the wg-pki CA bundle + client cert.
@ -57,6 +66,16 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
we use a context so we can attach the client cert for mTLS."""
if not rcv.url.lower().startswith("https://"):
return False
# Pre-flight check the configured paths so we raise a recoverable
# error here instead of letting ssl raise FileNotFoundError deep
# inside create_default_context / load_cert_chain.
for label, path in (
("ca_bundle", rcv.ca_bundle),
("client_cert", rcv.client_cert),
("client_key", rcv.client_key),
):
if path and not Path(path).exists():
raise _CertNotReadyError(f"{label} path missing: {path}")
ctx = ssl.create_default_context(
cafile=str(rcv.ca_bundle) if rcv.ca_bundle else None,
)
@ -74,11 +93,41 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
class ShipperTransport:
def __init__(self, cfg: ShipperConfig) -> None:
self.cfg = cfg
self._verify = _build_ssl_context(cfg.receiver)
self._verify: ssl.SSLContext | bool | None = None
self._cert_warned = False
# Try once at construction; if certs aren't on disk yet, defer.
# Each request will retry the build until it succeeds, so
# systemd doesn't crash-loop the unit during first-boot.
self._try_build_verify()
def _try_build_verify(self) -> bool:
"""(Re)build the SSL context. Returns True if the transport is
ready to make requests; False if certs aren't on disk yet."""
if self._verify is not None:
return True # already built (False for http, context for https)
try:
self._verify = _build_ssl_context(self.cfg.receiver)
except _CertNotReadyError as e:
if not self._cert_warned:
log.warning(
"shipper waiting on mTLS material (%s); will retry each request",
e,
)
self._cert_warned = True
return False
if self._cert_warned:
log.info("mTLS material now on disk; shipper transport ready")
self._cert_warned = False
return True
# ---- ping ----------------------------------------------------------
def ping(self) -> PingResult:
if not self._try_build_verify():
return PingResult(
ok=False, status_code=0, body=None,
error="mTLS material not yet on disk; waiting for cert delivery",
)
url = f"{self.cfg.receiver.url}/v1/ping"
headers = self._common_headers()
try:
@ -110,6 +159,12 @@ class ShipperTransport:
tarball_path: Path,
sha256_hex: str,
) -> ShipResult:
if not self._try_build_verify():
return ShipResult(
status="transient", status_code=0,
sha256=None, body=None,
error="mTLS material not yet on disk; waiting for cert delivery",
)
url = (
f"{self.cfg.receiver.url}/v1/episodes/"
f"{self.cfg.host_id}/{episode_id}.tar.zst"

View file

@ -191,6 +191,28 @@ def test_ping_fails_when_receiver_unreachable(tmp_path: Path) -> None:
assert res.error is not None
def test_transport_defers_when_ca_bundle_missing(tmp_path: Path) -> None:
    """Issue #11: a missing CA bundle must defer the transport, not crash it.

    First-boot bring-up enables the shipper before the Pi has issued the
    mTLS leaf. Construction must not raise, and ping must keep returning
    the "no request made" failure result for as long as the CA bundle is
    absent.
    """
    missing_ca = tmp_path / "not-yet" / "wg-ca.pem"
    assert not missing_ca.exists()  # precondition: nothing created it
    cfg = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=ReceiverEndpoint(
            url="https://collector.wg",
            ca_bundle=missing_ca,
        ),
    )
    # Construction MUST succeed even though the CA bundle is missing —
    # this is the bug fix: previously raised FileNotFoundError out of
    # ssl.create_default_context, crashing the systemd unit.
    transport = ShipperTransport(cfg)

    # Ping twice: the second call goes through the per-request retry
    # after the one-time warning has already been emitted, and must
    # report the same deferred result rather than raising.
    for _ in range(2):
        res = transport.ping()
        assert res.ok is False
        # No HTTP request was attempted, so there is no real status/body.
        assert res.status_code == 0
        assert res.body is None
        assert res.error is not None and "mTLS material" in res.error
# ---------------------------------------------------------------------------
# Tar + ship
# ---------------------------------------------------------------------------