"""End-to-end shipper tests.

These run a real Uvicorn server bound to 127.0.0.1 on a free port,
hosting the actual receiver Starlette app over an EpisodeStore on a temp
dir. The shipper then talks to that server with its real `httpx.Client`
— same code path as production.

This catches things the receiver-side ASGI tests can't (HTTP framing,
header handling, sync httpx behaviour, content-length quirks).
"""

from __future__ import annotations

import contextlib
import json
import os
import socket
import subprocess
import tarfile
import threading
import time
from collections.abc import Iterator
from pathlib import Path

import httpx
import pytest
import uvicorn

from receiver.app import make_app
from receiver.store import EpisodeStore
from shipper.config import ReceiverEndpoint, ShipperConfig
from shipper.queue import ShipperQueue
from shipper.transport import ShipResult, ShipperTransport


# ---------------------------------------------------------------------------
# Live-receiver fixture
# ---------------------------------------------------------------------------


def _free_port() -> int:
    """Return a TCP port on 127.0.0.1 that was free at the time of the call.

    The socket is closed before returning, so callers either bind the
    port right away or deliberately leave it unbound to get a
    connection-refused target.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]


class _ServerThread(threading.Thread):
    """Daemon thread that runs a Uvicorn server for `app` on 127.0.0.1:`port`."""

    def __init__(self, app, port: int) -> None:
        super().__init__(daemon=True)
        cfg = uvicorn.Config(
            app,
            host="127.0.0.1",
            port=port,
            log_level="error",
            lifespan="off",
            access_log=False,
        )
        self.server = uvicorn.Server(cfg)

    def run(self) -> None:
        self.server.run()

    def stop(self) -> None:
        """Ask the server loop to exit; callers should `join()` afterwards."""
        self.server.should_exit = True


def _wait_for_port(port: int, timeout_s: float = 5.0) -> None:
    """Poll the receiver's health endpoint until it answers 200.

    Raises:
        TimeoutError: if no 200 response arrives within `timeout_s`.
    """
    deadline = time.monotonic() + timeout_s
    health_url = f"http://127.0.0.1:{port}/v1/health"
    # One client for the whole poll loop instead of one per attempt.
    with httpx.Client(timeout=0.5) as client:
        while time.monotonic() < deadline:
            try:
                if client.get(health_url).status_code == 200:
                    return
            except httpx.HTTPError:
                # Server not accepting connections yet; retry until the deadline.
                pass
            time.sleep(0.05)
    raise TimeoutError(f"receiver on 127.0.0.1:{port} did not come up")


@contextlib.contextmanager
def _running_receiver(
    store: EpisodeStore, *, bearer_token: str | None
) -> Iterator[tuple[str, EpisodeStore]]:
    """Serve the receiver app over `store` for the duration of the block.

    Yields a `(base_url, store)` pair. The server thread is asked to stop
    and joined on exit, even if startup polling or the test body fails.
    """
    app = make_app(
        store=store, max_episode_bytes=10_000_000, bearer_token=bearer_token
    )
    port = _free_port()
    server = _ServerThread(app, port)
    server.start()
    try:
        _wait_for_port(port)
        yield f"http://127.0.0.1:{port}", store
    finally:
        server.stop()
        server.join(timeout=2)


@pytest.fixture
def store(tmp_path: Path) -> EpisodeStore:
    """Receiver-side EpisodeStore rooted under the test's temp dir."""
    return EpisodeStore(
        store_root=tmp_path / "rcv-episodes",
        incoming_root=tmp_path / "rcv-incoming",
        index_path=tmp_path / "rcv-index.jsonl",
    )


@pytest.fixture
def receiver(store: EpisodeStore) -> Iterator[tuple[str, EpisodeStore]]:
    """Live receiver with no bearer token configured."""
    with _running_receiver(store, bearer_token=None) as live:
        yield live


@pytest.fixture
def receiver_with_bearer(store: EpisodeStore) -> Iterator[tuple[str, EpisodeStore]]:
    """Live receiver configured with bearer token ``s3cret``."""
    with _running_receiver(store, bearer_token="s3cret") as live:
        yield live


class _NoTransport:
    """Transport stub for tests in which shipping must never be attempted."""

    def ship_tarball(self, *a, **kw):
        raise AssertionError("not used")


def _make_shipper(
    tmp_path: Path,
    receiver_url: str,
    *,
    host_id: str = "lab1",
    bearer: str | None = None,
) -> tuple[ShipperConfig, ShipperTransport, ShipperQueue]:
    """Build a shipper config, transport and queue pointed at `receiver_url`."""
    data_root = tmp_path / "lab-data"
    cfg = ShipperConfig(
        host_id=host_id,
        data_root=data_root,
        receiver=ReceiverEndpoint(url=receiver_url, bearer_token=bearer),
        scan_interval_s=0.05,
    )
    transport = ShipperTransport(cfg)
    queue = ShipperQueue(cfg, transport)
    return cfg, transport, queue


def _make_episode(
    cfg: ShipperConfig, episode_id: str, *, content: bytes = b"data"
) -> Path:
    """Create a finished episode dir (including `done.marker`) and return it.

    `content` becomes the bytes of `meta.json`, which lets tests vary the
    episode's payload while keeping the same id.
    """
    ep = cfg.episodes_dir / episode_id
    ep.mkdir(parents=True, exist_ok=True)
    (ep / "meta.json").write_bytes(content)
    (ep / "events.jsonl").write_text("{}\n")
    (ep / "labels.jsonl").write_text("{}\n")
    (ep / "telemetry-proc.jsonl").write_text("{}\n")
    (ep / "done.marker").touch()
    return ep


# ---------------------------------------------------------------------------
# Ping
# ---------------------------------------------------------------------------


def test_ping_returns_ok_against_running_receiver(tmp_path: Path, receiver) -> None:
    url, _ = receiver
    _, transport, _ = _make_shipper(tmp_path, url)
    res = transport.ping()
    assert res.ok is True
    assert res.status_code == 200
    assert res.body is not None
    assert res.body["ok"] is True
    assert res.body["host_id"] == "lab1"
    assert res.body["schema_version"] == 1


def test_ping_writes_nothing_to_index(tmp_path: Path, receiver) -> None:
    url, store = receiver
    _, transport, _ = _make_shipper(tmp_path, url)
    transport.ping()
    transport.ping()
    transport.ping()
    assert store.index_path.read_text() == ""


def test_ping_fails_with_wrong_bearer(tmp_path: Path, receiver_with_bearer) -> None:
    url, _ = receiver_with_bearer
    _, transport, _ = _make_shipper(tmp_path, url, bearer="WRONG")
    res = transport.ping()
    assert res.ok is False
    assert res.status_code == 401


def test_ping_succeeds_with_right_bearer(tmp_path: Path, receiver_with_bearer) -> None:
    url, _ = receiver_with_bearer
    _, transport, _ = _make_shipper(tmp_path, url, bearer="s3cret")
    res = transport.ping()
    assert res.ok is True
    assert res.status_code == 200


def test_ping_fails_when_receiver_unreachable(tmp_path: Path) -> None:
    # Pick a free port and don't bind it — connect must fail.
    port = _free_port()
    _, transport, _ = _make_shipper(tmp_path, f"http://127.0.0.1:{port}")
    res = transport.ping()
    assert res.ok is False
    assert res.status_code == 0
    assert res.error is not None


def test_transport_defers_when_ca_bundle_missing(tmp_path: Path) -> None:
    """Issue #11: first-boot bring-up enables the shipper before the Pi
    has issued the mTLS leaf. Construction must not crash; ping/ship
    should return a transient error until the cert lands."""
    missing_ca = tmp_path / "not-yet" / "wg-ca.pem"
    cfg = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=ReceiverEndpoint(
            url="https://collector.wg",
            ca_bundle=missing_ca,
        ),
    )
    # Construction MUST succeed even though the CA bundle is missing —
    # this is the bug fix: previously raised FileNotFoundError out of
    # ssl.create_default_context, crashing the systemd unit.
    transport = ShipperTransport(cfg)
    res = transport.ping()
    assert res.ok is False
    assert res.error is not None and "mTLS material" in res.error


# ---------------------------------------------------------------------------
# Tar + ship
# ---------------------------------------------------------------------------


def test_run_once_ships_one_done_episode(tmp_path: Path, receiver) -> None:
    url, store = receiver
    cfg, _, queue = _make_shipper(tmp_path, url)
    _make_episode(cfg, "01EPISODE")

    result = queue.run_once()
    assert result.scanned == 1
    assert result.shipped == 1
    assert result.transient_failures == 0

    # Episode dir moved to shipped/.
    assert not (cfg.episodes_dir / "01EPISODE").exists()
    assert (cfg.shipped_dir / "01EPISODE").exists()
    # Outbox tarball cleaned up.
    assert list(cfg.outbox_dir.iterdir()) == []

    # Receiver stored it and indexed it.
    assert store.final_path("lab1", "01EPISODE").exists()
    rows = [json.loads(line) for line in store.index_path.read_text().splitlines()]
    assert len(rows) == 1
    assert rows[0]["host_id"] == "lab1"
    assert rows[0]["episode_id"] == "01EPISODE"


def test_run_once_skips_episodes_without_done_marker(tmp_path: Path, receiver) -> None:
    url, store = receiver
    cfg, _, queue = _make_shipper(tmp_path, url)
    ep = cfg.episodes_dir / "01PARTIAL"
    ep.mkdir(parents=True)
    (ep / "meta.json").write_text("{}")
    # Note: NO done.marker.

    result = queue.run_once()
    assert result.scanned == 0
    assert result.shipped == 0
    assert ep.exists()  # untouched
    assert store.index_path.read_text() == ""


def test_run_once_idempotent_re_ship_returns_already_present(
    tmp_path: Path, receiver
) -> None:
    """If a prior run shipped an episode but crashed before retiring it,
    the next run must re-ship the same bytes successfully (200) and
    retire the dir, not flag it as a conflict."""
    url, store = receiver
    cfg, _, queue = _make_shipper(tmp_path, url)
    _make_episode(cfg, "01REPLAY", content=b"same-bytes")

    queue.run_once()
    assert (cfg.shipped_dir / "01REPLAY").exists()

    # Simulate a crash: move it back as if retire never happened.
    (cfg.shipped_dir / "01REPLAY").rename(cfg.episodes_dir / "01REPLAY")

    result = queue.run_once()
    assert result.scanned == 1
    assert result.shipped == 1
    assert (cfg.shipped_dir / "01REPLAY").exists()

    # Index didn't double up.
    rows = store.index_path.read_text().splitlines()
    assert len(rows) == 1


def test_run_once_handles_409_conflict(tmp_path: Path, receiver) -> None:
    """If the same episode_id was previously shipped with *different*
    bytes, the receiver returns 409 and the shipper must NOT retire the
    local dir — operator triage required."""
    url, _ = receiver
    cfg, _, queue = _make_shipper(tmp_path, url)
    _make_episode(cfg, "01CONFLICT", content=b"first")

    result = queue.run_once()
    assert result.shipped == 1

    # Simulate a re-do with different content but the same id (e.g., a
    # botched re-run on the lab host).
    (cfg.shipped_dir / "01CONFLICT").rename(cfg.episodes_dir / "01CONFLICT")
    (cfg.episodes_dir / "01CONFLICT" / "meta.json").write_bytes(b"tampered")

    result = queue.run_once()
    assert result.scanned == 1
    assert result.shipped == 0
    assert result.conflicts == 1
    # Local dir survives — operator can decide what to do.
    assert (cfg.episodes_dir / "01CONFLICT").exists()


def test_run_once_handles_transient_when_receiver_is_down(tmp_path: Path) -> None:
    port = _free_port()
    cfg, _, queue = _make_shipper(tmp_path, f"http://127.0.0.1:{port}")
    _make_episode(cfg, "01DOWN")

    result = queue.run_once()
    assert result.scanned == 1
    assert result.shipped == 0
    assert result.transient_failures == 1
    # Episode dir + tarball both stay in place for the next pass.
    assert (cfg.episodes_dir / "01DOWN").exists()
    assert (cfg.outbox_dir / "01DOWN.tar.zst").exists()


def test_quarantine_cleanup_removes_old_entries(tmp_path: Path) -> None:
    """Without an upper bound, quarantine/ grows forever. The cleanup
    pass walks it once per cleanup_interval and drops anything past
    keep_days — bounded by directory size since it just statx()s each
    entry's mtime.

    We run with cleanup_interval_s=0 so the gate fires on every pass,
    and use `os.utime` to age a fixture entry past the cutoff without
    sleeping for real time."""
    cfg = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
        scan_interval_s=0.05,
        quarantine_keep_days=7,
        quarantine_cleanup_interval_s=0.0,  # always run on every pass
    )
    queue = ShipperQueue(cfg, _NoTransport())

    old = cfg.quarantine_dir / "01OLD"
    old.mkdir()
    (old / "meta.json").write_text("{}")
    new = cfg.quarantine_dir / "01NEW"
    new.mkdir()
    (new / "meta.json").write_text("{}")

    # Backdate the OLD entry by 8 days. The directory's own mtime
    # is what cleanup checks.
    eight_days_ago = time.time() - (8 * 86400)
    os.utime(old, (eight_days_ago, eight_days_ago))

    queue._maybe_cleanup_quarantine()

    assert not old.exists(), "8-day-old entry should be cleaned up"
    assert new.exists(), "fresh entry should survive"


def test_quarantine_cleanup_disabled_when_keep_days_zero(tmp_path: Path) -> None:
    cfg = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
        scan_interval_s=0.05,
        quarantine_keep_days=0,  # disabled
        quarantine_cleanup_interval_s=0.0,
    )
    queue = ShipperQueue(cfg, _NoTransport())

    old = cfg.quarantine_dir / "01OLD"
    old.mkdir()
    eight_days_ago = time.time() - (8 * 86400)
    os.utime(old, (eight_days_ago, eight_days_ago))

    queue._maybe_cleanup_quarantine()

    assert old.exists(), "cleanup must be a no-op when keep_days=0"


def test_quarantine_cleanup_respects_interval_gate(tmp_path: Path) -> None:
    """The interval gate prevents the 5s scan tick from statx()-ing the
    whole quarantine tree on every pass."""
    cfg = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
        scan_interval_s=0.05,
        quarantine_keep_days=7,
        quarantine_cleanup_interval_s=3600.0,
    )
    queue = ShipperQueue(cfg, _NoTransport())

    # First pass: gate's 0.0 sentinel means we sweep.
    queue._maybe_cleanup_quarantine()
    first_at = queue._last_quarantine_cleanup_at
    assert first_at > 0

    # Stage an old entry AFTER the first sweep. The gate should
    # block the next sweep until cleanup_interval_s has elapsed.
    old = cfg.quarantine_dir / "01OLD"
    old.mkdir()
    os.utime(old, (time.time() - 8 * 86400,) * 2)

    queue._maybe_cleanup_quarantine()

    assert old.exists(), "gate should defer the next sweep"
    assert queue._last_quarantine_cleanup_at == first_at


def test_run_forever_calls_heartbeat(tmp_path: Path) -> None:
    """The heartbeat callback fires once per completed pass. In
    production this is wired to sd_notify(WATCHDOG=1) so systemd's
    WatchdogSec catches a hung scan loop."""
    cfg = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
        scan_interval_s=0.05,
    )
    queue = ShipperQueue(cfg, _NoTransport())

    beats = []
    stop = threading.Event()

    def _heartbeat() -> None:
        beats.append(time.monotonic())
        if len(beats) >= 3:
            stop.set()

    queue.run_forever(stop_check=stop.is_set, heartbeat=_heartbeat)
    assert len(beats) >= 3


def test_run_forever_survives_heartbeat_exception(tmp_path: Path) -> None:
    """A broken heartbeat (e.g. NOTIFY_SOCKET vanished) must not take
    down the daemon — the loss of watchdog is tolerable; the loss of
    the ship loop is not."""
    cfg = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
        scan_interval_s=0.05,
    )
    queue = ShipperQueue(cfg, _NoTransport())

    pass_count = [0]

    def _stop() -> bool:
        return pass_count[0] >= 3

    def _broken_heartbeat() -> None:
        pass_count[0] += 1
        raise RuntimeError("simulated NOTIFY_SOCKET failure")

    # Should NOT raise.
    queue.run_forever(stop_check=_stop, heartbeat=_broken_heartbeat)
    assert pass_count[0] >= 3


def test_run_once_sweeps_orphaned_outbox_tarball(tmp_path: Path, receiver) -> None:
    """A tarball in outbox/ with no matching episode dir should get
    cleaned up at the start of the next scan.

    The lifecycle invariant is
    `outbox/{id}.tar.zst exists ⇒ episodes/{id}/ exists`; a violation
    means external interference (operator rm-ed the episode, OS crash,
    leftover from older buggy code) and we'd otherwise carry dead bytes
    forever."""
    url, _ = receiver
    cfg, _, queue = _make_shipper(tmp_path, url)

    # Stage an orphan: a tarball in outbox/ with no corresponding
    # episodes/01ORPHAN/ directory.
    cfg.outbox_dir.mkdir(parents=True, exist_ok=True)
    orphan = cfg.outbox_dir / "01ORPHAN.tar.zst"
    orphan.write_bytes(b"\x28\xb5\x2f\xfd")  # zstd magic, not a real tarball
    # Also a partial — same orphan rule applies.
    partial = cfg.outbox_dir / "01PARTIAL.tar.zst.partial"
    partial.write_bytes(b"x")

    # And a non-orphan: tarball backed by an actual episode dir.
    _make_episode(cfg, "01LIVE")

    queue.run_once()

    assert not orphan.exists(), "orphan tarball must be swept"
    assert not partial.exists(), "orphan partial must be swept"
    # 01LIVE got shipped+retired in the same pass; both its tarball
    # and its episode dir are gone (moved to shipped/).
    assert not (cfg.outbox_dir / "01LIVE.tar.zst").exists()
    assert not (cfg.episodes_dir / "01LIVE").exists()
    assert (cfg.shipped_dir / "01LIVE").exists()


def test_run_once_quarantines_fatal_episode(tmp_path: Path) -> None:
    """A 4xx-other-than-409 (e.g. 400 missing-commit) means re-shipping
    won't succeed. The shipper must move the episode out of the live
    queue so the next scan doesn't burn a PUT on the same dir, AND drop
    the outbox tarball so disk doesn't fill up with stale .zst.

    Regression: pre-fix queue.py left fatal episodes in episodes/ on
    every pass, so 4465+ pre-stamp episodes on k-gamingcom kept
    fatal-looping at ~1 PUT/sec for 5+ hours after the receiver gate
    went live."""

    class _Always400Transport:
        """Stub transport that always rejects with a fatal 400.

        Mirrors transport.py's own behaviour for 4xx-not-409."""

        def __init__(self) -> None:
            self.calls = 0

        def ship_tarball(self, episode_id, tarball_path, sha256_hex, commit=None):
            self.calls += 1
            return ShipResult(
                status="fatal",
                status_code=400,
                sha256=None,
                body={
                    "error": "missing X-Cis490-Code-Commit header",
                    "remediation": "pull and reinstall",
                },
                error="client error 400",
            )

    cfg = ShipperConfig(
        host_id="lab1",
        data_root=tmp_path / "lab-data",
        receiver=ReceiverEndpoint(url="http://127.0.0.1:1"),
        scan_interval_s=0.05,
    )
    queue = ShipperQueue(cfg, _Always400Transport())
    _make_episode(cfg, "01PRESTAMP")

    result = queue.run_once()
    assert result.scanned == 1
    assert result.fatal == 1
    assert result.shipped == 0

    # Episode dir is OUT of episodes/ and IN quarantine/.
    assert not (cfg.episodes_dir / "01PRESTAMP").exists()
    assert (cfg.quarantine_dir / "01PRESTAMP").exists()
    assert (cfg.quarantine_dir / "01PRESTAMP" / "meta.json").exists()

    # The reason file carries enough context for triage.
    reason = json.loads(
        (cfg.quarantine_dir / "01PRESTAMP" / "quarantine_reason.json").read_text()
    )
    assert reason["status_code"] == 400
    assert reason["error"] == "client error 400"
    assert reason["body"]["error"] == "missing X-Cis490-Code-Commit header"
    assert "quarantined_at_wall" in reason

    # Outbox is empty — no stale tarball.
    assert list(cfg.outbox_dir.iterdir()) == []

    # Critically: a second pass is a no-op. The fix would be useless if
    # quarantined episodes leaked back in.
    result2 = queue.run_once()
    assert result2.scanned == 0
    assert result2.fatal == 0


def test_tarball_round_trips_episode_dir(tmp_path: Path, receiver) -> None:
    """The receiver-side tarball must extract back to the original
    episode dir layout (modulo file order). Verifies the tar+zstd pipe
    is intact."""
    url, _ = receiver
    cfg, _, queue = _make_shipper(tmp_path, url)
    ep = _make_episode(cfg, "01ROUND", content=b"meta-bytes")
    expected_files = sorted(p.name for p in ep.iterdir())

    queue.run_once()

    # The receiver stored it; pull the bytes back, decompress + untar.
    rcv_path = next((tmp_path / "rcv-episodes" / "lab1").glob("01ROUND.tar.zst"))
    decompressed = tmp_path / "01ROUND.tar"
    subprocess.check_call(
        ["zstd", "-q", "-d", "-o", str(decompressed), str(rcv_path)],
    )
    extract_dir = tmp_path / "extracted"
    extract_dir.mkdir()
    # Pass an explicit extraction filter where the interpreter supports
    # one: extractall() without it raises a DeprecationWarning on
    # 3.12/3.13. Older interpreters lack the keyword, hence the guard.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    with tarfile.open(decompressed) as tf:
        tf.extractall(extract_dir, **extract_kwargs)

    got_files = sorted(p.name for p in (extract_dir / "01ROUND").iterdir())
    assert got_files == expected_files