CIS490/shipper/queue.py
max 7c9f9582ca Lab-host shipper + receiver /v1/ping + install scripts
Implements the deployment loop end-to-end on the CIS490 side:

shipper/
  config.py      ShipperConfig (host_id, paths, receiver endpoint, mTLS)
  transport.py   httpx-based PUT + ping with mTLS + bearer support
  queue.py       scan data/episodes/, tar+zstd via system zstd, ship,
                 retire to data/shipped/. Idempotent across crashes per
                 the state machine in docs/transport.md.
  __main__.py    CLI: --ping (smoke test), --once (one pass), or daemon

receiver/app.py: new POST /v1/ping that requires the same auth as PUT
  /v1/episodes but writes nothing. Used by `cis490-shipper --ping`
  during lab-host bring-up to verify the WG/Caddy/mTLS path before
  shipping any real bytes.

etc/
  cis490-shipper.service       systemd unit for the lab-host shipper
  cis490-orchestrator.service  systemd unit for the lab-host queue
                               (kept disabled by default until queue
                               mode lands)
  lab-host.toml.example        config template

scripts/
  install-lab-host.sh   idempotent installer; verifies prereqs,
                        creates cis490 service user, syncs repo to
                        /opt/cis490, builds venv, drops systemd units
                        and config template
  install-receiver.sh   same, for the receiver role on the central WG
                        node (Pi5 in our setup)

tests/test_shipper.py  11 end-to-end tests against a real Uvicorn
                       server hosting the receiver app. Exercises
                       ping, tar+ship, idempotent re-ship, 409
                       conflict, transient (receiver down), tarball
                       round-trip via system zstd.

AGENTS.md  guidance for AI agents working on this and sibling repos.
           Headline: when you hit an issue you can't fully fix in
           scope, file a Forgejo issue rather than leaving a TODO.

51/51 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 23:41:32 -05:00

195 lines
6.7 KiB
Python

"""Shipper episode queue — scan, compress, ship, retire.
State machine, mirroring docs/transport.md:

    data/episodes/<id>/done.marker
            |
            v
    tar+zstd → data/outbox/<id>.tar.zst.partial
            |
            v
    rename → data/outbox/<id>.tar.zst
            |
            v
    PUT to receiver
            |
            +-- 200/201 → mv data/episodes/<id> → data/shipped/<id>
            |             rm data/outbox/<id>.tar.zst
            |
            +-- 409     → leave files in place (the local + remote tarball
            |             differ; manual triage)
            |
            +-- 5xx/net → leave outbox tarball; retry on next pass
            |
            +-- 4xx     → log + skip (caller-side bug, doesn't self-heal)

Idempotent on every pass. A crash mid-tar leaves only a ``.partial``
which the next pass overwrites. A crash mid-PUT leaves the tarball in
``outbox/`` and the next pass re-ships it; the receiver responds 200
on a matching sha256, 409 on a divergent one.
"""
from __future__ import annotations
import logging
import shutil
import subprocess
import tarfile
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path
from .config import ShipperConfig
from .transport import ShipperTransport, ShipResult, hash_file
log = logging.getLogger("cis490.shipper.queue")
@dataclass(frozen=True)
class PassResult:
    """Immutable per-pass counters returned by ``ShipperQueue.run_once``."""

    # Episode directories that had a ``done.marker`` when the pass started.
    scanned: int
    # Episodes shipped and retired this pass (ship status "stored" or
    # "already-present").
    shipped: int
    # Episodes that hit a retryable failure this pass (e.g. the tar step
    # raised, or the transport reported "transient").
    transient_failures: int
    # Episodes for which the transport reported "conflict"; their files are
    # left in place.
    conflicts: int
    # Episodes whose ship status was none of the above.
    fatal: int
class ShipperQueue:
    """Scan finished episodes, compress them, ship them, and retire them.

    A pass looks under ``cfg.episodes_dir`` for directories that contain a
    ``done.marker``, builds ``<id>.tar.zst`` in ``cfg.outbox_dir``, hands it
    to ``transport.ship_tarball`` and, on success, moves the episode
    directory into ``cfg.shipped_dir``.
    """

    def __init__(self, cfg: ShipperConfig, transport: ShipperTransport) -> None:
        """Keep the collaborators and create the three working directories."""
        self.cfg = cfg
        self.transport = transport
        for directory in (cfg.episodes_dir, cfg.outbox_dir, cfg.shipped_dir):
            directory.mkdir(parents=True, exist_ok=True)

    # ---- main entry point ---------------------------------------------
    def run_once(self) -> PassResult:
        """Run one scan pass and return its counters for logging / tests.

        Each ready episode is handled independently: a failure to tar or to
        retire one episode is logged and counted as transient, and the pass
        carries on with the next episode.
        """
        ready = self._ready_episodes()
        shipped = 0
        transient = 0
        conflicts = 0
        fatal = 0
        for ep_dir in ready:
            episode_id = ep_dir.name
            try:
                tarball, sha = self._tar_episode(ep_dir)
            except Exception:
                log.exception("tar failed for %s", episode_id)
                transient += 1
                continue
            res = self.transport.ship_tarball(episode_id, tarball, sha)
            log.info(
                "ship %s -> %s (%d) %s",
                episode_id, res.status, res.status_code, res.error or "",
            )
            if res.status in ("stored", "already-present"):
                try:
                    self._retire(ep_dir, tarball)
                except OSError:
                    # Episodes are visited in sorted order, so letting this
                    # escape would make the same episode abort every pass
                    # and starve everything queued behind it. The episode
                    # stays in place and is offered again on the next pass.
                    log.exception("retire failed for %s", episode_id)
                    transient += 1
                else:
                    shipped += 1
            elif res.status == "conflict":
                # Keep the tarball + episode dir in place. Operator must
                # decide whether to drop our copy or fix the remote one.
                conflicts += 1
            elif res.status == "transient":
                transient += 1
            else:  # fatal
                fatal += 1
        return PassResult(
            scanned=len(ready),
            shipped=shipped,
            transient_failures=transient,
            conflicts=conflicts,
            fatal=fatal,
        )

    def run_forever(self, *, stop_check=lambda: False) -> None:
        """Call :meth:`run_once` repeatedly until ``stop_check()`` is true.

        Between passes the loop waits ``cfg.scan_interval_s`` seconds,
        polling ``stop_check`` at most every half second so a stop request
        is honoured promptly. An exception from a pass is logged and the
        loop keeps going.
        """
        poll_s = 0.5
        while not stop_check():
            try:
                self.run_once()
            except Exception:
                log.exception("scan pass crashed; sleeping anyway")
            # Coarse sleep: we don't need precise scheduling and we
            # don't want a tight loop on errors.
            deadline = time.monotonic() + self.cfg.scan_interval_s
            while True:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                if stop_check():
                    return
                # Clip the nap so short intervals are not overshot.
                time.sleep(min(poll_s, remaining))

    # ---- internals -----------------------------------------------------
    def _ready_episodes(self) -> list[Path]:
        """Return, sorted, the episode directories that hold a ``done.marker``."""
        root = self.cfg.episodes_dir
        if not root.exists():
            return []
        return [
            ep
            for ep in sorted(root.iterdir())
            if ep.is_dir() and (ep / "done.marker").exists()
        ]

    def _tar_episode(self, ep_dir: Path) -> tuple[Path, str]:
        """Tar+zstd the episode dir into outbox.

        Idempotent — overwrites any prior partial. The archive is written
        to ``<id>.tar.zst.partial`` and renamed to ``<id>.tar.zst`` only
        after compression and hashing succeeded.

        Returns:
            ``(tarball_path, sha256_hex)``.

        Raises:
            RuntimeError: the ``zstd`` binary is not on PATH, or it exited
                with a non-zero status.
        """
        episode_id = ep_dir.name
        outbox = self.cfg.outbox_dir
        partial = outbox / f"{episode_id}.tar.zst.partial"
        final = outbox / f"{episode_id}.tar.zst"
        partial.unlink(missing_ok=True)
        # There is deliberately no in-process fallback: surface the missing
        # binary rather than silently producing a non-zstd tarball.
        if not _which_zstd():
            raise RuntimeError(
                "the `zstd` binary is required on the lab host. "
                "Install it via your package manager."
            )
        # We use the system `zstd` for streaming compression: pipe a tar
        # stream into `zstd -T0 -19` so the whole tar is never buffered in
        # memory and the python-zstandard dependency is not needed.
        try:
            with partial.open("wb") as zout:
                proc = subprocess.Popen(
                    ["zstd", "-q", "-T0", "-19", "--stdout"],
                    stdin=subprocess.PIPE, stdout=zout,
                )
                assert proc.stdin is not None
                try:
                    with tarfile.open(fileobj=proc.stdin, mode="w|") as tf:
                        tf.add(ep_dir, arcname=episode_id, recursive=True)
                    proc.stdin.close()
                    rc = proc.wait()
                except BaseException:
                    # Tarring failed (or we were interrupted) while zstd may
                    # still be running: stop it and reap it so no child
                    # process or pipe is leaked.
                    proc.kill()
                    try:
                        proc.stdin.close()
                    except OSError:
                        # Flushing into a dead pipe fails; the pipe is
                        # closed regardless, which is all we need here.
                        pass
                    proc.wait()
                    raise
            if rc != 0:
                raise RuntimeError(f"zstd exited {rc}")
            sha = hash_file(partial)
            partial.replace(final)
        except BaseException:
            # Never leave a half-written partial behind on failure.
            partial.unlink(missing_ok=True)
            raise
        return final, sha

    def _retire(self, ep_dir: Path, tarball: Path) -> None:
        """Move episode dir → shipped/, drop the tarball."""
        target = self.cfg.shipped_dir / ep_dir.name
        if target.exists():
            # Belt-and-suspenders: re-shipping an already-retired
            # episode shouldn't happen (the dir was moved), but if it
            # does, prefer the existing copy and just clean up.
            shutil.rmtree(ep_dir, ignore_errors=True)
        else:
            ep_dir.replace(target)
        tarball.unlink(missing_ok=True)
def _which_zstd() -> bool:
return shutil.which("zstd") is not None