Merge commit '86a088c' into Dev_REL1_043026

This commit is contained in:
Elliott Kolden 2026-04-30 15:16:41 -06:00
commit 7c35bf7d49
5 changed files with 85 additions and 5 deletions

View file

@ -7,6 +7,7 @@ dependencies = [
"starlette>=0.36",
"uvicorn[standard]>=0.27",
"msgpack>=1.0", # MSF RPC wire format for the Tier-3 exploit driver
"pycdlib>=1.14", # build NoCloud cidata ISOs in pure Python
]
[dependency-groups]
@ -17,7 +18,6 @@ dev = [
"matplotlib>=3.8",
"tornado>=6", # required by matplotlib's WebAgg interactive backend
"paramiko>=3", # SSH client for in-guest control on images that support it
"pycdlib>=1.14", # build NoCloud cidata ISOs in pure Python
]
[tool.uv]

View file

@ -199,6 +199,7 @@ if [[ -f "$ALPINE_IMG" && ! -f "$CIDATA_ISO" ]]; then
log "WARN: cidata build failed; run tools/build_cidata.py manually"
fi
# Symlink the canonical paths the launchers look at, when missing.
install -d -o "$SERVICE_USER" -g "$SERVICE_USER" -m 0755 "$INSTALL_ROOT/vm/images"
ln -sf "$ALPINE_IMG" "$INSTALL_ROOT/vm/images/alpine-baseline.qcow2" 2>/dev/null || true
ln -sf "$CIDATA_ISO" "$INSTALL_ROOT/vm/images/cidata.iso" 2>/dev/null || true

View file

@ -50,6 +50,15 @@ class ShipResult:
error: str | None
class _CertNotReadyError(Exception):
    """Signal that a configured cert/CA path does not exist on disk yet.

    This happens during first-boot bring-up: install-lab-host.sh enables
    the shipper before the Pi has issued the mTLS leaf. The transport
    catches the error, logs it once, and retries on every request, so
    the daemon recovers on its own as soon as the cert lands.
    """
def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
"""Build an SSL context honoring the wg-pki CA bundle + client cert.
@ -57,6 +66,16 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
we use a context so we can attach the client cert for mTLS."""
if not rcv.url.lower().startswith("https://"):
return False
# Pre-flight check the configured paths so we raise a recoverable
# error here instead of letting ssl raise FileNotFoundError deep
# inside create_default_context / load_cert_chain.
for label, path in (
("ca_bundle", rcv.ca_bundle),
("client_cert", rcv.client_cert),
("client_key", rcv.client_key),
):
if path and not Path(path).exists():
raise _CertNotReadyError(f"{label} path missing: {path}")
ctx = ssl.create_default_context(
cafile=str(rcv.ca_bundle) if rcv.ca_bundle else None,
)
@ -74,11 +93,41 @@ def _build_ssl_context(rcv: ReceiverEndpoint) -> ssl.SSLContext | bool:
class ShipperTransport:
def __init__(self, cfg: ShipperConfig) -> None:
    """Store the config and make a first, non-fatal attempt at TLS setup.

    Construction must never raise just because the mTLS files are not on
    disk yet, so the SSL context is NOT built eagerly here. An eager
    ``_build_ssl_context`` call would raise out of the constructor and
    defeat the deferral below.
    """
    self.cfg = cfg
    # None  -> not built yet (certs may still be missing)
    # False -> plain-http receiver
    # ssl.SSLContext -> https receiver
    self._verify: ssl.SSLContext | bool | None = None
    # Lets the "waiting on mTLS material" warning be logged only once.
    self._cert_warned = False
    # Try once at construction; if certs aren't on disk yet, defer.
    # Each request will retry the build until it succeeds, so
    # systemd doesn't crash-loop the unit during first-boot.
    self._try_build_verify()
def _try_build_verify(self) -> bool:
    """Build the SSL context if it has not been built yet.

    Returns True when the transport is ready to make requests and False
    while the cert material is still missing from disk. The "waiting"
    warning is logged once per outage, and a recovery message is logged
    once when the build finally succeeds.
    """
    # A non-None value means the build already happened: False for an
    # http receiver, an SSLContext for https.
    if self._verify is not None:
        return True

    try:
        built = _build_ssl_context(self.cfg.receiver)
    except _CertNotReadyError as not_ready:
        already_warned = self._cert_warned
        if not already_warned:
            log.warning(
                "shipper waiting on mTLS material (%s); will retry each request",
                not_ready,
            )
            self._cert_warned = True
        return False
    else:
        self._verify = built

    # Only announce recovery if we previously complained.
    if self._cert_warned:
        log.info("mTLS material now on disk; shipper transport ready")
        self._cert_warned = False
    return True
# ---- ping ----------------------------------------------------------
def ping(self) -> PingResult:
if not self._try_build_verify():
return PingResult(
ok=False, status_code=0, body=None,
error="mTLS material not yet on disk; waiting for cert delivery",
)
url = f"{self.cfg.receiver.url}/v1/ping"
headers = self._common_headers()
try:
@ -110,6 +159,12 @@ class ShipperTransport:
tarball_path: Path,
sha256_hex: str,
) -> ShipResult:
if not self._try_build_verify():
return ShipResult(
status="transient", status_code=0,
sha256=None, body=None,
error="mTLS material not yet on disk; waiting for cert delivery",
)
url = (
f"{self.cfg.receiver.url}/v1/episodes/"
f"{self.cfg.host_id}/{episode_id}.tar.zst"

View file

@ -191,6 +191,28 @@ def test_ping_fails_when_receiver_unreachable(tmp_path: Path) -> None:
assert res.error is not None
def test_transport_defers_when_ca_bundle_missing(tmp_path: Path) -> None:
    """Issue #11: the shipper is enabled before the Pi issues the mTLS leaf.

    Building the transport with a CA bundle path that does not exist must
    not crash, and ping must report a transient error until the cert
    lands.
    """
    absent_ca = tmp_path / "not-yet" / "wg-ca.pem"
    data_dir = tmp_path / "lab-data"
    endpoint = ReceiverEndpoint(
        url="https://collector.wg",
        ca_bundle=absent_ca,
    )
    cfg = ShipperConfig(
        host_id="lab1",
        data_root=data_dir,
        receiver=endpoint,
    )

    # The constructor has to tolerate the missing CA bundle. Before the
    # fix it propagated FileNotFoundError from ssl.create_default_context
    # and took the systemd unit down with it.
    transport = ShipperTransport(cfg)

    ping_result = transport.ping()
    assert ping_result.ok is False
    assert ping_result.error is not None
    assert "mTLS material" in ping_result.error
# ---------------------------------------------------------------------------
# Tar + ship
# ---------------------------------------------------------------------------

View file

@ -102,9 +102,9 @@ _JSON_MODE = False
# ---------------------------------------------------------------------------
def _run(cmd: list[str], *, timeout: float = 5.0) -> tuple[int, str, str]:
def _run(cmd: list[str], *, timeout: float = 5.0, cwd: str | None = None) -> tuple[int, str, str]:
try:
p = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
p = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, cwd=cwd)
return p.returncode, p.stdout.strip(), p.stderr.strip()
except (FileNotFoundError, subprocess.TimeoutExpired) as e:
return -1, "", str(e)
@ -558,7 +558,7 @@ def check_end_to_end(report: Report) -> None:
rc, out, err = _run([
"/opt/cis490/.venv/bin/python", "-m", "shipper",
"--config", cfg, "--ping",
], timeout=15.0)
], timeout=15.0, cwd="/opt/cis490")
if rc == 0 and '"ok": true' in out:
report.add(Check("e2e: cis490-shipper --ping", "ok",
detail="200 OK"))
@ -588,6 +588,8 @@ def main(argv: list[str] | None = None) -> int:
_JSON_MODE = args.json
repo_root = Path(__file__).resolve().parent.parent
if str(repo_root) not in sys.path:
sys.path.insert(0, str(repo_root))
if not _JSON_MODE:
print(f"{_ANSI_BOLD}cis490-doctor{_ANSI_RESET} role={args.role} repo={repo_root}\n")