From 86a088c204e96db11a4aa52a3e6f85f08fd2a1d1 Mon Sep 17 00:00:00 2001 From: max Date: Thu, 30 Apr 2026 16:13:59 -0500 Subject: [PATCH] shipper: defer SSL context build until cert/CA paths exist (closes #11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First-boot bring-up enables cis490-shipper before the Pi has issued the mTLS leaf, so ssl.create_default_context(cafile=...) raised FileNotFoundError out of __init__ and systemd crash-looped the unit every RestartSec=5. Now the transport pre-flights the configured ca_bundle / client_cert / client_key paths, raises a recoverable _CertNotReadyError, and ping/ship_tarball retry the build on each request — daemon self-heals once the cert lands without a restart. Co-Authored-By: Claude Opus 4.7 (1M context) --- shipper/transport.py | 57 ++++++++++++++++++++++++++++++++++++++++++- tests/test_shipper.py | 22 +++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/shipper/transport.py b/shipper/transport.py index 06fa8bd..bbb64ae 100644 --- a/shipper/transport.py +++ b/shipper/transport.py @@ -50,6 +50,15 @@ class ShipResult: error: str | None +class _CertNotReadyError(Exception): + """Configured cert/CA paths aren't on disk yet. + + Raised during first-boot bring-up: install-lab-host.sh enables the + shipper before the Pi has issued the mTLS leaf. The transport + catches this, logs once, and retries on each request so the daemon + self-heals when the cert lands.""" + + def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool: """Build an SSL context honoring the wg-pki CA bundle + client cert. @@ -57,6 +66,16 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool: we use a context so we can attach the client cert for mTLS.""" if not rcv.url.lower().startswith("https://"): return False + # Pre-flight check the configured paths so we raise a recoverable + # error here instead of letting ssl raise FileNotFoundError deep + # inside create_default_context / load_cert_chain. + for label, path in ( + ("ca_bundle", rcv.ca_bundle), + ("client_cert", rcv.client_cert), + ("client_key", rcv.client_key), + ): + if path and not Path(path).exists(): + raise _CertNotReadyError(f"{label} path missing: {path}") ctx = ssl.create_default_context( cafile=str(rcv.ca_bundle) if rcv.ca_bundle else None, ) @@ -74,11 +93,41 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool: class ShipperTransport: def __init__(self, cfg: ShipperConfig) -> None: self.cfg = cfg - self._verify = _build_ssl_context(cfg.receiver) + self._verify: ssl.SSLContext | bool | None = None + self._cert_warned = False + # Try once at construction; if certs aren't on disk yet, defer. + # Each request will retry the build until it succeeds, so + # systemd doesn't crash-loop the unit during first-boot. + self._try_build_verify() + + def _try_build_verify(self) -> bool: + """(Re)build the SSL context. Returns True if the transport is + ready to make requests; False if certs aren't on disk yet.""" + if self._verify is not None: + return True # already built (False for http, context for https) + try: + self._verify = _build_ssl_context(self.cfg.receiver) + except _CertNotReadyError as e: + if not self._cert_warned: + log.warning( + "shipper waiting on mTLS material (%s); will retry each request", + e, + ) + self._cert_warned = True + return False + if self._cert_warned: + log.info("mTLS material now on disk; shipper transport ready") + self._cert_warned = False + return True # ---- ping ---------------------------------------------------------- def ping(self) -> PingResult: + if not self._try_build_verify(): + return PingResult( + ok=False, status_code=0, body=None, + error="mTLS material not yet on disk; waiting for cert delivery", + ) url = f"{self.cfg.receiver.url}/v1/ping" headers = self._common_headers() try: @@ -110,6 +159,12 @@ class ShipperTransport: tarball_path: Path, sha256_hex: str, ) -> ShipResult: + if not self._try_build_verify(): + return ShipResult( + status="transient", status_code=0, + sha256=None, body=None, + error="mTLS material not yet on disk; waiting for cert delivery", + ) url = ( f"{self.cfg.receiver.url}/v1/episodes/" f"{self.cfg.host_id}/{episode_id}.tar.zst" diff --git a/tests/test_shipper.py b/tests/test_shipper.py index a0b959b..e4ab075 100644 --- a/tests/test_shipper.py +++ b/tests/test_shipper.py @@ -191,6 +191,28 @@ def test_ping_fails_when_receiver_unreachable(tmp_path: Path) -> None: assert res.error is not None +def test_transport_defers_when_ca_bundle_missing(tmp_path: Path) -> None: + """Issue #11: first-boot bring-up enables the shipper before the Pi + has issued the mTLS leaf. Construction must not crash; ping/ship + should return a transient error until the cert lands.""" + missing_ca = tmp_path / "not-yet" / "wg-ca.pem" + cfg = ShipperConfig( + host_id="lab1", + data_root=tmp_path / "lab-data", + receiver=ReceiverEndpoint( + url="https://collector.wg", + ca_bundle=missing_ca, + ), + ) + # Construction MUST succeed even though the CA bundle is missing — + # this is the bug fix: previously raised FileNotFoundError out of + # ssl.create_default_context, crashing the systemd unit. + transport = ShipperTransport(cfg) + res = transport.ping() + assert res.ok is False + assert res.error is not None and "mTLS material" in res.error + + # --------------------------------------------------------------------------- # Tar + ship # ---------------------------------------------------------------------------