Why services weren't starting after the gate went live: 1. install-lab-host.sh self-copy. The receiver's 400 remediation tells the agent to `cd /opt/cis490 && git pull && sudo ./scripts/install-lab-host.sh`. That makes REPO_ROOT==INSTALL_ROOT and `cp -aT $REPO_ROOT $INSTALL_ROOT` errors with "are the same file"; `set -e` aborts before the systemd units install or anything restarts. Detect the same-dir case and skip the cp; chown still runs. 2. Services never restart. install-lab-host.sh and install-tier-3-4.sh both ended by *telling the operator* to restart, then exiting. The running shipper/orchestrator kept executing pre-gate code from the old module objects, so new `code_version` stamping never reached an episode. Both scripts now `systemctl restart` the units they own when those units are enabled. 3. Shipper queue fatal-loop. queue.py incremented `fatal++` but didn't move the episode out of `data/episodes/`. Next scan re-tarred and re-PUT the same dir, getting 400 again. With 4465+ pre-stamp episodes on k-gamingcom this burned ~1 PUT/sec for 5+ hours of receiver log. Fatal episodes now move to data/quarantine/<id>/ with a quarantine_reason.json beside them; the outbox tarball is deleted. 4. Pre-stamp backlog drain. tools/quarantine_unstamped.py is a one-shot that scans data/episodes/ and quarantines anything without a 40-char-hex code_version.commit. Wired into install-lab-host.sh step 9 so a re-install drains the queue automatically. Idempotent; safe to run while the shipper is active. Tests cover the queue's new fatal-quarantine path and every drain behaviour (kept/quarantined/dry-run/idempotent/missing-meta/collision). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
99 lines
3.5 KiB
Python
99 lines
3.5 KiB
Python
"""Lab-host shipper config — loaded from /etc/cis490/lab-host.toml."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import tomllib
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
|
|
@dataclass(frozen=True)
class ReceiverEndpoint:
    """Connection settings for the receiver endpoint used by the shipper.

    Instances are immutable. ``bearer_token`` is deliberately excluded from
    the generated ``repr`` so that logging or printing a config object never
    writes the secret out; it still takes part in equality and hashing.
    """

    url: str  # e.g. "https://collector.wg"
    # Optional TLS material. NOTE(review): presumably consumed by the
    # shipper's HTTP client (CA bundle / client certificate pair) — confirm
    # against the code that builds the request.
    ca_bundle: Path | None = None
    client_cert: Path | None = None
    client_key: Path | None = None
    # Secret: kept out of repr() to avoid leaking it into logs/tracebacks.
    bearer_token: str | None = field(default=None, repr=False)
    verify_tls: bool = True
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class ShipperConfig:
|
|
host_id: str
|
|
data_root: Path # Lab-host data root; episodes/, outbox/, shipped/ live here.
|
|
receiver: ReceiverEndpoint
|
|
# Daemon mode: how often to scan for new done.marker files.
|
|
scan_interval_s: float = 5.0
|
|
# PUT timeout per episode. Tarballs are bounded by max_episode_bytes;
|
|
# at WG speeds this is well under 60s for a typical episode.
|
|
request_timeout_s: float = 60.0
|
|
# Backoff schedule on transient (5xx / network) failures, in seconds,
|
|
# capped at the last entry. The shipper's scan loop will pick the
|
|
# episode up again on the next pass regardless.
|
|
backoff_seconds: tuple[float, ...] = (1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 60.0, 120.0, 300.0)
|
|
# Local retention before pruning data/shipped/.
|
|
keep_local_for_days: int = 7
|
|
|
|
@property
|
|
def episodes_dir(self) -> Path:
|
|
return self.data_root / "episodes"
|
|
|
|
@property
|
|
def outbox_dir(self) -> Path:
|
|
return self.data_root / "outbox"
|
|
|
|
@property
|
|
def shipped_dir(self) -> Path:
|
|
return self.data_root / "shipped"
|
|
|
|
@property
|
|
def quarantine_dir(self) -> Path:
|
|
# Episodes the receiver has refused permanently (4xx other than
|
|
# 409 — typically 400 missing-commit or 412 not-in-window). They
|
|
# don't belong in shipped/ (we have nothing to compare against)
|
|
# and re-shipping them would just re-burn the queue.
|
|
return self.data_root / "quarantine"
|
|
|
|
@classmethod
|
|
def load(cls, path: str | Path) -> "ShipperConfig":
|
|
with open(path, "rb") as f:
|
|
data = tomllib.load(f)
|
|
|
|
host_id = data.get("host_id")
|
|
if not isinstance(host_id, str) or not host_id:
|
|
raise ValueError("lab-host config: host_id (string) required at top level")
|
|
|
|
paths = data.get("paths", {})
|
|
data_root = Path(paths.get("data_root", "/var/lib/cis490/data")).resolve()
|
|
|
|
rcv = data.get("receiver", {})
|
|
url = rcv.get("url")
|
|
if not isinstance(url, str) or not url:
|
|
raise ValueError("lab-host config: receiver.url required")
|
|
|
|
receiver = ReceiverEndpoint(
|
|
url=url.rstrip("/"),
|
|
ca_bundle=_optional_path(rcv.get("ca_bundle")),
|
|
client_cert=_optional_path(rcv.get("client_cert")),
|
|
client_key=_optional_path(rcv.get("client_key")),
|
|
bearer_token=rcv.get("bearer_token"),
|
|
verify_tls=bool(rcv.get("verify_tls", True)),
|
|
)
|
|
|
|
retention = data.get("retention", {})
|
|
return cls(
|
|
host_id=host_id,
|
|
data_root=data_root,
|
|
receiver=receiver,
|
|
scan_interval_s=float(data.get("shipper", {}).get("scan_interval_s", 5.0)),
|
|
request_timeout_s=float(data.get("shipper", {}).get("request_timeout_s", 60.0)),
|
|
keep_local_for_days=int(retention.get("keep_local_for_days", 7)),
|
|
)
|
|
|
|
|
|
def _optional_path(v: object) -> Path | None:
|
|
if v in (None, ""):
|
|
return None
|
|
if isinstance(v, str):
|
|
return Path(v).expanduser()
|
|
raise TypeError(f"expected path string, got {type(v).__name__}")
|