shipper: defer SSL context build until cert/CA paths exist (closes #11)
First-boot bring-up enables cis490-shipper before the Pi has issued the mTLS leaf, so ssl.create_default_context(cafile=...) raised FileNotFoundError out of __init__ and systemd crash-looped the unit every 5 seconds (RestartSec=5).

Now the transport pre-flights the configured ca_bundle / client_cert / client_key paths, raises a recoverable _CertNotReadyError, and ping/ship_tarball retry the build on each request — the daemon self-heals once the cert lands, without a restart.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
95ac56a382
commit
86a088c204
2 changed files with 78 additions and 1 deletion
|
|
@ -50,6 +50,15 @@ class ShipResult:
|
|||
error: str | None
|
||||
|
||||
|
||||
class _CertNotReadyError(Exception):
|
||||
"""Configured cert/CA paths aren't on disk yet.
|
||||
|
||||
Raised during first-boot bring-up: install-lab-host.sh enables the
|
||||
shipper before the Pi has issued the mTLS leaf. The transport
|
||||
catches this, logs once, and retries on each request so the daemon
|
||||
self-heals when the cert lands."""
|
||||
|
||||
|
||||
def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
|
||||
"""Build an SSL context honoring the wg-pki CA bundle + client cert.
|
||||
|
||||
|
|
@ -57,6 +66,16 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
|
|||
we use a context so we can attach the client cert for mTLS."""
|
||||
if not rcv.url.lower().startswith("https://"):
|
||||
return False
|
||||
# Pre-flight check the configured paths so we raise a recoverable
|
||||
# error here instead of letting ssl raise FileNotFoundError deep
|
||||
# inside create_default_context / load_cert_chain.
|
||||
for label, path in (
|
||||
("ca_bundle", rcv.ca_bundle),
|
||||
("client_cert", rcv.client_cert),
|
||||
("client_key", rcv.client_key),
|
||||
):
|
||||
if path and not Path(path).exists():
|
||||
raise _CertNotReadyError(f"{label} path missing: {path}")
|
||||
ctx = ssl.create_default_context(
|
||||
cafile=str(rcv.ca_bundle) if rcv.ca_bundle else None,
|
||||
)
|
||||
|
|
@ -74,11 +93,41 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
|
|||
class ShipperTransport:
|
||||
def __init__(self, cfg: ShipperConfig) -> None:
|
||||
self.cfg = cfg
|
||||
self._verify = _build_ssl_context(cfg.receiver)
|
||||
self._verify: ssl.SSLContext | bool | None = None
|
||||
self._cert_warned = False
|
||||
# Try once at construction; if certs aren't on disk yet, defer.
|
||||
# Each request will retry the build until it succeeds, so
|
||||
# systemd doesn't crash-loop the unit during first-boot.
|
||||
self._try_build_verify()
|
||||
|
||||
def _try_build_verify(self) -> bool:
    """Build the SSL verify setting if it has not been built yet.

    Returns True when ``self._verify`` is populated and the transport
    can make requests, and False when ``_build_ssl_context`` raised
    ``_CertNotReadyError`` because a configured path is still missing.
    """
    # Anything other than None means a previous call succeeded:
    # False for plain http, an SSL context for https.
    if self._verify is not None:
        return True

    try:
        built = _build_ssl_context(self.cfg.receiver)
    except _CertNotReadyError as not_ready:
        # Warn only on the first miss so per-request retries do not
        # flood the log while the cert is still pending.
        if not self._cert_warned:
            log.warning(
                "shipper waiting on mTLS material (%s); will retry each request",
                not_ready,
            )
            self._cert_warned = True
        return False
    else:
        self._verify = built

    # Announce recovery only if we previously warned about the wait.
    if self._cert_warned:
        log.info("mTLS material now on disk; shipper transport ready")
        self._cert_warned = False
    return True
|
||||
|
||||
# ---- ping ----------------------------------------------------------
|
||||
|
||||
def ping(self) -> PingResult:
|
||||
if not self._try_build_verify():
|
||||
return PingResult(
|
||||
ok=False, status_code=0, body=None,
|
||||
error="mTLS material not yet on disk; waiting for cert delivery",
|
||||
)
|
||||
url = f"{self.cfg.receiver.url}/v1/ping"
|
||||
headers = self._common_headers()
|
||||
try:
|
||||
|
|
@ -110,6 +159,12 @@ class ShipperTransport:
|
|||
tarball_path: Path,
|
||||
sha256_hex: str,
|
||||
) -> ShipResult:
|
||||
if not self._try_build_verify():
|
||||
return ShipResult(
|
||||
status="transient", status_code=0,
|
||||
sha256=None, body=None,
|
||||
error="mTLS material not yet on disk; waiting for cert delivery",
|
||||
)
|
||||
url = (
|
||||
f"{self.cfg.receiver.url}/v1/episodes/"
|
||||
f"{self.cfg.host_id}/{episode_id}.tar.zst"
|
||||
|
|
|
|||
|
|
@ -191,6 +191,28 @@ def test_ping_fails_when_receiver_unreachable(tmp_path: Path) -> None:
|
|||
assert res.error is not None
|
||||
|
||||
|
||||
def test_transport_defers_when_ca_bundle_missing(tmp_path: Path) -> None:
    """Regression test for issue #11.

    First-boot bring-up enables the shipper before the Pi has issued
    the mTLS leaf. Building the transport must not crash, and ping
    must report an error mentioning the missing mTLS material.
    """
    # Point the CA bundle at a path that does not exist under tmp_path.
    absent_bundle = tmp_path / "not-yet" / "wg-ca.pem"
    endpoint = ReceiverEndpoint(
        url="https://collector.wg",
        ca_bundle=absent_bundle,
    )
    config = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=endpoint,
    )

    # The fix under test: construction used to raise FileNotFoundError
    # out of ssl.create_default_context, crashing the systemd unit.
    # It MUST now succeed even though the CA bundle is missing.
    shipper = ShipperTransport(config)

    outcome = shipper.ping()
    assert outcome.ok is False
    assert outcome.error is not None and "mTLS material" in outcome.error
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tar + ship
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue