diff --git a/pyproject.toml b/pyproject.toml index 9c1d0d9..a1cb62c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ dependencies = [ "starlette>=0.36", "uvicorn[standard]>=0.27", "msgpack>=1.0", # MSF RPC wire format for the Tier-3 exploit driver + "pycdlib>=1.14", # build NoCloud cidata ISOs in pure Python ] [dependency-groups] @@ -17,7 +18,6 @@ dev = [ "matplotlib>=3.8", "tornado>=6", # required by matplotlib's WebAgg interactive backend "paramiko>=3", # SSH client for in-guest control on images that support it - "pycdlib>=1.14", # build NoCloud cidata ISOs in pure Python ] [tool.uv] diff --git a/scripts/install-lab-host.sh b/scripts/install-lab-host.sh index dd96ffa..3b65ec2 100755 --- a/scripts/install-lab-host.sh +++ b/scripts/install-lab-host.sh @@ -199,6 +199,7 @@ if [[ -f "$ALPINE_IMG" && ! -f "$CIDATA_ISO" ]]; then log "WARN: cidata build failed; run tools/build_cidata.py manually" fi # Symlink the canonical paths the launchers look at, when missing. +install -d -o "$SERVICE_USER" -g "$SERVICE_USER" -m 0755 "$INSTALL_ROOT/vm/images" ln -sf "$ALPINE_IMG" "$INSTALL_ROOT/vm/images/alpine-baseline.qcow2" 2>/dev/null || true ln -sf "$CIDATA_ISO" "$INSTALL_ROOT/vm/images/cidata.iso" 2>/dev/null || true diff --git a/shipper/transport.py b/shipper/transport.py index 06fa8bd..bbb64ae 100644 --- a/shipper/transport.py +++ b/shipper/transport.py @@ -50,6 +50,15 @@ class ShipResult: error: str | None +class _CertNotReadyError(Exception): + """Configured cert/CA paths aren't on disk yet. + + Raised during first-boot bring-up: install-lab-host.sh enables the + shipper before the Pi has issued the mTLS leaf. The transport + catches this, logs once, and retries on each request so the daemon + self-heals when the cert lands.""" + + def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool: """Build an SSL context honoring the wg-pki CA bundle + client cert. @@ -57,6 +66,16 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool: we use a context so we can attach the client cert for mTLS.""" if not rcv.url.lower().startswith("https://"): return False + # Pre-flight check the configured paths so we raise a recoverable + # error here instead of letting ssl raise FileNotFoundError deep + # inside create_default_context / load_cert_chain. + for label, path in ( + ("ca_bundle", rcv.ca_bundle), + ("client_cert", rcv.client_cert), + ("client_key", rcv.client_key), + ): + if path and not Path(path).exists(): + raise _CertNotReadyError(f"{label} path missing: {path}") ctx = ssl.create_default_context( cafile=str(rcv.ca_bundle) if rcv.ca_bundle else None, ) @@ -74,11 +93,41 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool: class ShipperTransport: def __init__(self, cfg: ShipperConfig) -> None: self.cfg = cfg - self._verify = _build_ssl_context(cfg.receiver) + self._verify: ssl.SSLContext | bool | None = None + self._cert_warned = False + # Try once at construction; if certs aren't on disk yet, defer. + # Each request will retry the build until it succeeds, so + # systemd doesn't crash-loop the unit during first-boot. + self._try_build_verify() + + def _try_build_verify(self) -> bool: + """(Re)build the SSL context. Returns True if the transport is + ready to make requests; False if certs aren't on disk yet.""" + if self._verify is not None: + return True # already built (False for http, context for https) + try: + self._verify = _build_ssl_context(self.cfg.receiver) + except _CertNotReadyError as e: + if not self._cert_warned: + log.warning( + "shipper waiting on mTLS material (%s); will retry each request", + e, + ) + self._cert_warned = True + return False + if self._cert_warned: + log.info("mTLS material now on disk; shipper transport ready") + self._cert_warned = False + return True # ---- ping ---------------------------------------------------------- def ping(self) -> PingResult: + if not self._try_build_verify(): + return PingResult( + ok=False, status_code=0, body=None, + error="mTLS material not yet on disk; waiting for cert delivery", + ) url = f"{self.cfg.receiver.url}/v1/ping" headers = self._common_headers() try: @@ -110,6 +159,12 @@ class ShipperTransport: tarball_path: Path, sha256_hex: str, ) -> ShipResult: + if not self._try_build_verify(): + return ShipResult( + status="transient", status_code=0, + sha256=None, body=None, + error="mTLS material not yet on disk; waiting for cert delivery", + ) url = ( f"{self.cfg.receiver.url}/v1/episodes/" f"{self.cfg.host_id}/{episode_id}.tar.zst" diff --git a/tests/test_shipper.py b/tests/test_shipper.py index a0b959b..e4ab075 100644 --- a/tests/test_shipper.py +++ b/tests/test_shipper.py @@ -191,6 +191,28 @@ def test_ping_fails_when_receiver_unreachable(tmp_path: Path) -> None: assert res.error is not None +def test_transport_defers_when_ca_bundle_missing(tmp_path: Path) -> None: + """Issue #11: first-boot bring-up enables the shipper before the Pi + has issued the mTLS leaf. Construction must not crash; ping/ship + should return a transient error until the cert lands.""" + missing_ca = tmp_path / "not-yet" / "wg-ca.pem" + cfg = ShipperConfig( + host_id="lab1", + data_root=tmp_path / "lab-data", + receiver=ReceiverEndpoint( + url="https://collector.wg", + ca_bundle=missing_ca, + ), + ) + # Construction MUST succeed even though the CA bundle is missing — + # this is the bug fix: previously raised FileNotFoundError out of + # ssl.create_default_context, crashing the systemd unit. + transport = ShipperTransport(cfg) + res = transport.ping() + assert res.ok is False + assert res.error is not None and "mTLS material" in res.error + + # --------------------------------------------------------------------------- # Tar + ship # --------------------------------------------------------------------------- diff --git a/tools/cis490_doctor.py b/tools/cis490_doctor.py index f58ef46..8bfbb41 100644 --- a/tools/cis490_doctor.py +++ b/tools/cis490_doctor.py @@ -102,9 +102,9 @@ _JSON_MODE = False # --------------------------------------------------------------------------- -def _run(cmd: list[str], *, timeout: float = 5.0) -> tuple[int, str, str]: +def _run(cmd: list[str], *, timeout: float = 5.0, cwd: str | None = None) -> tuple[int, str, str]: try: - p = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + p = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, cwd=cwd) return p.returncode, p.stdout.strip(), p.stderr.strip() except (FileNotFoundError, subprocess.TimeoutExpired) as e: return -1, "", str(e) @@ -558,7 +558,7 @@ def check_end_to_end(report: Report) -> None: rc, out, err = _run([ "/opt/cis490/.venv/bin/python", "-m", "shipper", "--config", cfg, "--ping", - ], timeout=15.0) + ], timeout=15.0, cwd="/opt/cis490") if rc == 0 and '"ok": true' in out: report.add(Check("e2e: cis490-shipper --ping", "ok", detail="200 OK")) @@ -588,6 +588,8 @@ def main(argv: list[str] | None = None) -> int: _JSON_MODE = args.json repo_root = Path(__file__).resolve().parent.parent + if str(repo_root) not in sys.path: + sys.path.insert(0, str(repo_root)) if not _JSON_MODE: print(f"{_ANSI_BOLD}cis490-doctor{_ANSI_RESET} role={args.role} repo={repo_root}\n")