Implements the deployment loop end-to-end on the CIS490 side:
shipper/
config.py ShipperConfig (host_id, paths, receiver endpoint, mTLS)
transport.py httpx-based PUT + ping with mTLS + bearer support
queue.py scan data/episodes/, tar+zstd via system zstd, ship,
retire to data/shipped/. Idempotent across crashes per
the state machine in docs/transport.md.
__main__.py CLI: --ping (smoke test), --once (one pass), or daemon
receiver/app.py: new POST /v1/ping that requires the same auth as PUT
/v1/episodes but writes nothing. Used by `cis490-shipper --ping`
during lab-host bring-up to verify the WG/Caddy/mTLS path before
shipping any real bytes.
etc/
cis490-shipper.service systemd unit for the lab-host shipper
cis490-orchestrator.service systemd unit for the lab-host queue
(kept disabled by default until queue
mode lands)
lab-host.toml.example config template
scripts/
install-lab-host.sh idempotent installer; verifies prereqs,
creates cis490 service user, syncs repo to
/opt/cis490, builds venv, drops systemd units
and config template
install-receiver.sh same, for the receiver role on the central WG
node (Pi5 in our setup)
tests/test_shipper.py 11 end-to-end tests against a real Uvicorn
server hosting the receiver app. Exercises
ping, tar+ship, idempotent re-ship, 409
conflict, transient (receiver down), tarball
round-trip via system zstd.
AGENTS.md guidance for AI agents working on this and sibling repos.
Headline: when you hit an issue you can't fully fix in
scope, file a Forgejo issue rather than leaving a TODO.
51/51 tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
195 lines
6.7 KiB
Python
195 lines
6.7 KiB
Python
"""Shipper episode queue — scan, compress, ship, retire.
|
|
|
|
State machine, mirroring docs/transport.md:
|
|
|
|
data/episodes/<id>/done.marker
|
|
|
|
|
v
|
|
tar+zstd → data/outbox/<id>.tar.zst.partial
|
|
|
|
|
v
|
|
rename → data/outbox/<id>.tar.zst
|
|
|
|
|
v
|
|
PUT to receiver
|
|
|
|
|
+-- 200/201 → mv data/episodes/<id> → data/shipped/<id>
|
|
| rm data/outbox/<id>.tar.zst
|
|
|
|
|
+-- 409 → leave files in place (the local + remote tarball
|
|
| differ; manual triage)
|
|
|
|
|
+-- 5xx/net → leave outbox tarball; retry on next pass
|
|
|
|
|
+-- 4xx → log + skip (caller-side bug, doesn't self-heal)
|
|
|
|
Idempotent on every pass. A crash mid-tar leaves only a ``.partial``
|
|
which the next pass overwrites. A crash mid-PUT leaves the tarball in
|
|
``outbox/`` and the next pass re-ships it; the receiver responds 200
|
|
on a matching sha256, 409 on a divergent one.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import shutil
|
|
import subprocess
|
|
import tarfile
|
|
import tempfile
|
|
import time
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
from .config import ShipperConfig
|
|
from .transport import ShipperTransport, ShipResult, hash_file
|
|
|
|
|
|
# Module-level logger shared by everything in the shipper queue.
log = logging.getLogger("cis490.shipper.queue")
|
|
|
|
|
|
@dataclass(frozen=True)
class PassResult:
    """Immutable tally of what a single ``ShipperQueue.run_once`` pass did."""

    # Episodes that were ready (had a ``done.marker``) when the pass began.
    scanned: int
    # Episodes the receiver reported as "stored" or "already-present".
    shipped: int
    # Episodes whose tar step raised, or whose ship result was "transient".
    transient_failures: int
    # Episodes whose ship result was "conflict".
    conflicts: int
    # Episodes whose ship result was anything else.
    fatal: int
|
|
|
|
|
|
class ShipperQueue:
    """Scan finished episodes, compress them, ship them, and retire them.

    The queue works out of three directories taken from ``cfg``:
    ``episodes_dir`` (input), ``outbox_dir`` (tarballs awaiting upload) and
    ``shipped_dir`` (episodes that were accepted by the receiver).
    """

    def __init__(self, cfg: ShipperConfig, transport: ShipperTransport) -> None:
        """Store the collaborators and make sure the working dirs exist."""
        self.cfg = cfg
        self.transport = transport
        cfg.episodes_dir.mkdir(parents=True, exist_ok=True)
        cfg.outbox_dir.mkdir(parents=True, exist_ok=True)
        cfg.shipped_dir.mkdir(parents=True, exist_ok=True)

    # ---- main entry point ---------------------------------------------

    def run_once(self) -> PassResult:
        """One scan pass. Returns counts for logging / tests."""
        ready = self._ready_episodes()
        scanned = len(ready)
        shipped = 0
        transient = 0
        conflicts = 0
        fatal = 0

        for ep_dir in ready:
            episode_id = ep_dir.name
            try:
                tarball, sha = self._tar_episode(ep_dir)
            except Exception:
                # A failed tar is retried on the next pass, so it is
                # counted with the transient failures.
                log.exception("tar failed for %s", episode_id)
                transient += 1
                continue

            res = self.transport.ship_tarball(episode_id, tarball, sha)
            log.info(
                "ship %s -> %s (%d) %s",
                episode_id, res.status, res.status_code, res.error or "",
            )

            if res.status in ("stored", "already-present"):
                self._retire(ep_dir, tarball)
                shipped += 1
            elif res.status == "conflict":
                conflicts += 1
                # Keep the tarball + episode dir in place. Operator must
                # decide whether to drop our copy or fix the remote one.
            elif res.status == "transient":
                transient += 1
            else:  # fatal
                fatal += 1

        return PassResult(
            scanned=scanned,
            shipped=shipped,
            transient_failures=transient,
            conflicts=conflicts,
            fatal=fatal,
        )

    def run_forever(self, *, stop_check=lambda: False) -> None:
        """Run scan passes until ``stop_check()`` returns true.

        ``stop_check`` is polled before every pass and roughly every half
        second while waiting ``cfg.scan_interval_s`` between passes. An
        exception escaping a pass is logged and does not stop the loop.
        """
        while not stop_check():
            try:
                self.run_once()
            except Exception:
                log.exception("scan pass crashed; sleeping anyway")
            # Coarse sleep: we don't need precise scheduling and we
            # don't want a tight loop on errors.
            t0 = time.monotonic()
            while time.monotonic() - t0 < self.cfg.scan_interval_s:
                if stop_check():
                    return
                time.sleep(0.5)

    # ---- internals -----------------------------------------------------

    def _ready_episodes(self) -> list[Path]:
        """Return the episode dirs containing ``done.marker``, sorted by path."""
        episodes_dir = self.cfg.episodes_dir
        if not episodes_dir.exists():
            return []
        return [
            ep
            for ep in sorted(episodes_dir.iterdir())
            if ep.is_dir() and (ep / "done.marker").exists()
        ]

    def _tar_episode(self, ep_dir: Path) -> tuple[Path, str]:
        """Tar+zstd the episode dir into outbox. Idempotent — overwrites
        any prior partial. Returns ``(tarball_path, sha256_hex)``.

        The archive is written to ``<id>.tar.zst.partial`` and renamed to
        ``<id>.tar.zst`` only once it is complete and hashed.

        Raises:
            RuntimeError: the ``zstd`` binary is not on PATH, or it exited
                with a non-zero status.
            Exception: whatever ``tarfile`` / the pipe raises while
                streaming. The partial file is removed before re-raising.
        """
        episode_id = ep_dir.name
        outbox = self.cfg.outbox_dir
        partial = outbox / f"{episode_id}.tar.zst.partial"
        final = outbox / f"{episode_id}.tar.zst"

        partial.unlink(missing_ok=True)

        # There is deliberately no in-process fallback (gzip/zlib would not
        # be zstd): surface the missing binary rather than silently
        # producing a non-zstd tarball.
        if not _which_zstd():
            raise RuntimeError(
                "the `zstd` binary is required on the lab host. "
                "Install it via your package manager."
            )

        # We use the system `zstd` for streaming compression: pipe a tar
        # stream into `zstd -T0 -19` so the whole tar is never buffered in
        # memory and the python-zstandard dependency isn't needed.
        try:
            with partial.open("wb") as zout:
                # Popen as a context manager closes stdin and waits for the
                # child on exit, including when tarfile raises mid-stream,
                # so a failed pass never leaves a zstd process behind.
                with subprocess.Popen(
                    ["zstd", "-q", "-T0", "-19", "--stdout"],
                    stdin=subprocess.PIPE, stdout=zout,
                ) as proc:
                    assert proc.stdin is not None  # narrowing for type checkers
                    with tarfile.open(fileobj=proc.stdin, mode="w|") as tf:
                        tf.add(ep_dir, arcname=episode_id, recursive=True)
                rc = proc.returncode
            if rc != 0:
                raise RuntimeError(f"zstd exited {rc}")
        except BaseException:
            # Never leave a truncated archive lying around in the outbox.
            partial.unlink(missing_ok=True)
            raise

        sha = hash_file(partial)
        partial.replace(final)
        return final, sha

    def _retire(self, ep_dir: Path, tarball: Path) -> None:
        """Move episode dir → shipped/, drop the tarball."""
        target = self.cfg.shipped_dir / ep_dir.name
        if target.exists():
            # Belt-and-suspenders: re-shipping an already-retired
            # episode shouldn't happen (the dir was moved), but if it
            # does, prefer the existing copy and just clean up.
            shutil.rmtree(ep_dir, ignore_errors=True)
        else:
            ep_dir.replace(target)
        tarball.unlink(missing_ok=True)
|
|
|
|
|
|
def _which_zstd() -> bool:
|
|
return shutil.which("zstd") is not None
|