robustness: gate falls back to local git, queue sweeps stale tarballs

Two follow-ups from the post-cutover diagnosis:

1. version_gate: forgejo → local git fallback. If forgejo refresh
   returns empty AND a local repo path is configured, retry against
   `git log` from the local checkout. The receiver service runs on
   the same Pi as forgejo, so a simultaneous restart used to leave
   the gate's cache empty and reject every PUT with not-in-window.
   Auto-detects /opt/cis490/.git when the operator hasn't set
   local_repo_path explicitly — that path is always present on a
   production receiver and ProtectSystem=strict still allows reads.
   Logs `source=git-fallback` so this isn't silent.

2. shipper/queue: sweep orphaned outbox tarballs. The lifecycle
   invariant is `outbox/<id>.tar.zst exists ⇒ episodes/<id>/ exists`
   — broken historically by the now-fixed fatal-loop, by operator
   `rm` of an episode dir, or by an OS crash between rename(2) and
   the post-ship cleanup. Without sweeping, dead bytes pile up
   forever. New _sweep_outbox runs at the start of every scan,
   bounded by the file count in outbox/.

Tests cover: fallback fires when forgejo unreachable + repo_path set;
no fallback when repo_path None (opt-in); orphan tarball + partial
get swept on the next pass; live tarballs untouched.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
max 2026-05-01 11:49:38 -05:00
parent f294e97875
commit 5cebe7096a
5 changed files with 138 additions and 3 deletions

View file

@ -3,6 +3,7 @@ from __future__ import annotations
import argparse
import logging
import os
from pathlib import Path
import uvicorn
@ -34,8 +35,16 @@ def main() -> None:
)
version_gate = None
if cfg.version_gate_enabled:
# Auto-detect /opt/cis490/.git as a fallback so a forgejo blip
# at startup doesn't reject every PUT with not-in-window. The
# receiver service has read access to /opt under
# ProtectSystem=strict, and that path is where the production
# install lands — so it's the natural local source of truth.
repo_path = cfg.version_gate_local_repo
if repo_path is None and Path("/opt/cis490/.git").is_dir():
repo_path = Path("/opt/cis490")
version_gate = VersionGate(
repo_path=cfg.version_gate_local_repo,
repo_path=repo_path,
window=cfg.version_gate_window,
forgejo_url=cfg.version_gate_forgejo_url,
repo_owner=cfg.version_gate_repo_owner,

View file

@ -84,8 +84,22 @@ class VersionGate:
# ---- backend dispatch -----------------------------------------------
def _refresh(self) -> None:
# Try forgejo first when configured. If it fails AND we also
# have a local repo path, fall back to git — this lets the
# receiver keep working through a forgejo blip (e.g. both
# services restarting on the same host) without flat-rejecting
# every PUT with not-in-window. The local repo on the
# receiver's own host is the closest thing to ground truth we
# have when the canonical source is unreachable.
source = "forgejo" if self.forgejo_url else "git"
if self.forgejo_url:
hashes, head = self._refresh_forgejo()
if not hashes and self.repo_path is not None:
log.info("forgejo refresh empty; falling back to local git "
"at %s", self.repo_path)
hashes, head = self._refresh_git()
if hashes:
source = "git-fallback"
else:
hashes, head = self._refresh_git()
if not hashes:
@ -98,8 +112,7 @@ class VersionGate:
self._cached_at = time.monotonic()
self._head = head
log.info("version-gate refreshed: %d valid hashes, head=%s, source=%s",
len(hashes), head[:12] if head else "?",
"forgejo" if self.forgejo_url else "git")
len(hashes), head[:12] if head else "?", source)
def _refresh_forgejo(self) -> tuple[set[str], str | None]:
"""GET /api/v1/repos/<owner>/<name>/commits?sha=<branch>&limit=<n>."""

View file

@ -70,6 +70,13 @@ class ShipperQueue:
def run_once(self) -> PassResult:
"""One scan pass. Returns counts for logging / tests."""
# Sweep stale outbox tarballs first. Normal lifecycle has the
# tarball deleted alongside retire / quarantine, but operator
# intervention (rm of an episode dir) or a crash between
# rename(2) and the post-ship cleanup can leave one orphaned.
# Bounded by the number of files in outbox/, so cheap to do
# every pass.
self._sweep_outbox()
ready = self._ready_episodes()
scanned = len(ready)
shipped = 0
@ -156,6 +163,34 @@ class ShipperQueue:
out.append(ep)
return out
def _sweep_outbox(self) -> None:
"""Delete tarballs in outbox/ that have no matching episode dir.
The invariant the queue maintains: ``outbox/<id>.tar.zst``
only exists while ``episodes/<id>/`` is also present.
retire+quarantine both delete the tarball when they move the
episode out, and tar overwrites any prior ``.partial`` on
each pass. So a stray tarball means an external actor (or an
OS crash) broke the invariant clean it up rather than
carrying dead bytes on disk forever."""
outbox = self.cfg.outbox_dir
if not outbox.exists():
return
for f in outbox.iterdir():
name = f.name
if name.endswith(".tar.zst"):
ep_id = name[: -len(".tar.zst")]
elif name.endswith(".tar.zst.partial"):
ep_id = name[: -len(".tar.zst.partial")]
else:
continue
if not (self.cfg.episodes_dir / ep_id).exists():
try:
f.unlink()
log.info("swept orphan tarball %s", name)
except OSError:
log.exception("failed to sweep orphan tarball %s", name)
def _read_episode_commit(self, ep_dir: Path) -> str | None:
"""Pull meta.json::code_version.commit so the shipper can send
it as X-Cis490-Code-Commit. Returns None if the file is

View file

@ -320,6 +320,38 @@ def test_run_once_handles_transient_when_receiver_is_down(tmp_path: Path) -> Non
assert (cfg.outbox_dir / "01DOWN.tar.zst").exists()
def test_run_once_sweeps_orphaned_outbox_tarball(tmp_path: Path, receiver) -> None:
"""A tarball in outbox/ with no matching episode dir should get
cleaned up at the start of the next scan. The lifecycle invariant
is `outbox/<id>.tar.zst exists episodes/<id>/ exists`; a
violation means external interference (operator rm-ed the
episode, OS crash, leftover from older buggy code) and we'd
otherwise carry dead bytes forever."""
url, _ = receiver
cfg, _, queue = _make_shipper(tmp_path, url)
# Stage an orphan: a tarball in outbox/ with no corresponding
# episodes/01ORPHAN/ directory.
cfg.outbox_dir.mkdir(parents=True, exist_ok=True)
orphan = cfg.outbox_dir / "01ORPHAN.tar.zst"
orphan.write_bytes(b"\x28\xb5\x2f\xfd") # zstd magic, not a real tarball
# Also a partial — same orphan rule applies.
partial = cfg.outbox_dir / "01PARTIAL.tar.zst.partial"
partial.write_bytes(b"x")
# And a non-orphan: tarball backed by an actual episode dir.
ep = _make_episode(cfg, "01LIVE")
queue.run_once()
assert not orphan.exists(), "orphan tarball must be swept"
assert not partial.exists(), "orphan partial must be swept"
# 01LIVE got shipped+retired in the same pass; both its tarball
# and its episode dir are gone (moved to shipped/).
assert (cfg.shipped_dir / "01LIVE").exists()
def test_run_once_quarantines_fatal_episode(tmp_path: Path) -> None:
"""A 4xx-other-than-409 (e.g. 400 missing-commit) means re-shipping
won't succeed. The shipper must move the episode out of the live

View file

@ -248,6 +248,52 @@ def test_e2e_receiver_returns_400_when_commit_header_missing(repo: Path, tmp_pat
assert "missing" in r.json()["error"].lower()
def test_forgejo_falls_back_to_local_git_when_unreachable(repo: Path) -> None:
"""If forgejo is unreachable AND a local repo path is configured,
the gate must fall back to git so the receiver doesn't reject
every PUT during a forgejo blip (e.g. the receiver and forgejo
restarting together on the same Pi). Without the fallback, the
cache stays empty and `check()` returns not-in-window for
everything breaking all lab hosts simultaneously even though
their commits are perfectly valid."""
# Pick a port nothing's bound to. urllib will fail-fast.
import socket as _s
with _s.socket(_s.AF_INET, _s.SOCK_STREAM) as sock:
sock.bind(("127.0.0.1", 0))
dead_port = sock.getsockname()[1]
g = VersionGate(
repo_path=repo,
forgejo_url=f"http://127.0.0.1:{dead_port}",
repo_owner="spectral", repo_name="CIS490", branch="main",
window=50, cache_ttl_s=0,
)
head = _commits(repo)[0]
# Forgejo is unreachable, but the local repo has the commit.
# check() must still return ok.
ok, reason = g.check(head)
assert ok, f"fallback didn't trigger: reason={reason}, head={head}"
assert g.valid_count() >= 1
def test_forgejo_no_fallback_without_local_repo(repo: Path) -> None:
"""Without a local repo configured, a forgejo blip is fatal — the
gate has nothing to fall back to and returns not-in-window. This
confirms the fallback is opt-in via repo_path, not a free side
effect of the forgejo backend."""
import socket as _s
with _s.socket(_s.AF_INET, _s.SOCK_STREAM) as sock:
sock.bind(("127.0.0.1", 0))
dead_port = sock.getsockname()[1]
g = VersionGate(
forgejo_url=f"http://127.0.0.1:{dead_port}",
repo_owner="spectral", repo_name="CIS490", branch="main",
window=50, cache_ttl_s=0,
)
ok, reason = g.check("a" * 40)
assert not ok and reason == "not-in-window"
assert g.valid_count() == 0
def test_missing_repo_keeps_prior_cache(repo: Path) -> None:
"""If the maintainer's clone disappears (or git fails), the gate
keeps its last-known allow-list better than locking out every