CIS490/tests/test_quarantine_unstamped.py
max eda6164897 fix: lab-host install loop after commit-gate cutover
Why services weren't starting after the gate went live:

1. install-lab-host.sh self-copy. The receiver's 400 remediation tells
   the agent to `cd /opt/cis490 && git pull && sudo
   ./scripts/install-lab-host.sh`. That makes REPO_ROOT==INSTALL_ROOT
   and `cp -aT $REPO_ROOT $INSTALL_ROOT` errors with "are the same
   file"; `set -e` aborts before the systemd units install or anything
   restarts. Detect the same-dir case and skip the cp; chown still
   runs.

2. Services never restart. install-lab-host.sh and install-tier-3-4.sh
   both ended by *telling the operator* to restart, then exiting. The
   running shipper/orchestrator kept executing pre-gate code from the
   old module objects, so new `code_version` stamping never reached an
   episode. Both scripts now `systemctl restart` the units they own
   when those units are enabled.

3. Shipper queue fatal-loop. queue.py incremented `fatal++` but didn't
   move the episode out of `data/episodes/`. Next scan re-tarred and
   re-PUT the same dir, getting 400 again. With 4465+ pre-stamp
   episodes on k-gamingcom this burned ~1 PUT/sec for 5+ hours of
   receiver log. Fatal episodes now move to data/quarantine/<id>/ with
   a quarantine_reason.json beside them; the outbox tarball is
   deleted.

4. Pre-stamp backlog drain. tools/quarantine_unstamped.py is a
   one-shot that scans data/episodes/ and quarantines anything without
   a 40-char-hex code_version.commit. Wired into install-lab-host.sh
   step 9 so a re-install drains the queue automatically. Idempotent;
   safe to run while the shipper is active.

Tests cover the queue's new fatal-quarantine path and every drain
behaviour (kept/quarantined/dry-run/idempotent/missing-meta/collision).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 11:36:21 -05:00

149 lines
5.5 KiB
Python

"""Tests for tools/quarantine_unstamped.py.
This is the one-shot drain that we run on each lab host once after the
commit-gate goes live. The behaviour we care about:
- episodes WITH a 40-char-hex code_version.commit stay put
- episodes WITHOUT that field move to quarantine/
- episodes lacking done.marker (still being written) are untouched
- a quarantine_reason.json is written into quarantine/<id>/ next to the moved files
- re-running is a no-op (idempotent — pre-stamp episodes are gone,
valid ones aren't touched)
"""
from __future__ import annotations
import importlib.util
import json
import sys
from pathlib import Path
import pytest
# This file sits one directory below the repository root, so the root is the
# second ancestor of the resolved path.
REPO_ROOT = Path(__file__).resolve().parents[1]

# The script under test is loaded straight from its file path under tools/
# instead of being imported by package name.
spec = importlib.util.spec_from_file_location(
    name="quarantine_unstamped",
    location=REPO_ROOT / "tools" / "quarantine_unstamped.py",
)
qu = importlib.util.module_from_spec(spec)
# Registered in sys.modules before execution, so the module is already
# importable by name while its own top-level code runs.
sys.modules["quarantine_unstamped"] = qu
spec.loader.exec_module(qu)
def _ep(root: Path, name: str, *, meta: dict | None, done: bool = True) -> Path:
    """Stage a fake episode under <root>/episodes/<name>/.

    Args:
        root: Data root; the episode is created in its ``episodes`` subdirectory.
        name: Name of the episode directory to create.
        meta: Serialized as JSON into ``meta.json``. ``None`` leaves the
            file out entirely.
        done: When true, an empty ``done.marker`` file is created.

    Returns:
        Path of the newly created episode directory.

    Raises:
        FileExistsError: If the episode directory already exists.
    """
    d = root / "episodes" / name
    # No exist_ok: staging the same episode twice in one test is an error.
    d.mkdir(parents=True)
    if meta is not None:
        # Explicit encoding keeps the fixture independent of the host locale.
        (d / "meta.json").write_text(json.dumps(meta), encoding="utf-8")
    if done:
        (d / "done.marker").touch()
    return d
def test_drain_moves_unstamped_to_quarantine(tmp_path: Path) -> None:
    """An episode whose meta.json has no code_version ends up in quarantine/."""
    _ep(tmp_path, "01OLD", meta={"host_id": "lab1"})  # meta has no code_version

    summary = qu.drain(tmp_path)

    assert summary.scanned == 1
    assert summary.quarantined == 1
    assert summary.kept_stamped == 0

    live_dir = tmp_path / "episodes" / "01OLD"
    quarantined_dir = tmp_path / "quarantine" / "01OLD"
    assert not live_dir.exists()
    assert (quarantined_dir / "meta.json").exists()

    # The reason file lives inside the quarantined episode directory.
    reason_text = (quarantined_dir / "quarantine_reason.json").read_text()
    reason = json.loads(reason_text)
    assert reason["status_code"] == 400
    assert "pre-stamp" in reason["error"]
def test_drain_keeps_stamped_episode(tmp_path: Path) -> None:
    """An episode carrying a 40-char-hex commit is left in episodes/.

    Such an episode is expected to pass the receiver gate, so it belongs
    in the live queue for the shipper.
    """
    stamp = {"commit": "a" * 40, "branch": "main", "dirty": False}
    _ep(tmp_path, "01NEW", meta={"code_version": stamp})

    summary = qu.drain(tmp_path)

    assert summary.scanned == 1
    assert summary.quarantined == 0
    assert summary.kept_stamped == 1

    still_live = (tmp_path / "episodes" / "01NEW").exists()
    was_quarantined = (tmp_path / "quarantine" / "01NEW").exists()
    assert still_live
    assert not was_quarantined
def test_drain_rejects_short_commit(tmp_path: Path) -> None:
    """A commit shorter than 40 characters does not count as stamped.

    Per the original author's note, the receiver would reject such a value
    with a 400 as badly formatted anyway.
    """
    truncated_stamp = {"commit": "abc123"}
    _ep(tmp_path, "01SHORT", meta={"code_version": truncated_stamp})

    summary = qu.drain(tmp_path)

    assert summary.quarantined == 1
def test_drain_rejects_non_hex_commit(tmp_path: Path) -> None:
    """A commit of the right length but with non-hex characters is quarantined."""
    # Forty characters long, but "z" is not a hexadecimal digit.
    _ep(tmp_path, "01BADHEX", meta={"code_version": {"commit": "z" * 40}})

    summary = qu.drain(tmp_path)

    assert summary.quarantined == 1
def test_drain_skips_in_progress_episode(tmp_path: Path) -> None:
    """An episode directory without done.marker is counted but left in place.

    A missing marker is taken to mean the orchestrator is still writing
    the directory; the drain only deals with finished, queued episodes.
    """
    in_progress = _ep(tmp_path, "01PARTIAL", meta=None, done=False)

    summary = qu.drain(tmp_path)

    assert summary.scanned == 1
    assert summary.skipped_no_marker == 1
    assert summary.quarantined == 0
    # _ep returns <tmp_path>/episodes/01PARTIAL, the directory that must survive.
    assert in_progress.exists()
def test_drain_handles_missing_meta_json(tmp_path: Path) -> None:
    """A finished episode with no meta.json is quarantined rather than kept.

    The original author's reasoning: such an episode is corrupt and would
    fail the gate as well.
    """
    _ep(tmp_path, "01NOMETA", meta=None, done=True)

    summary = qu.drain(tmp_path)

    assert summary.quarantined == 1
    reason_file = tmp_path / "quarantine" / "01NOMETA" / "quarantine_reason.json"
    assert reason_file.exists()
def test_drain_is_idempotent(tmp_path: Path) -> None:
    """A second drain over the same data root quarantines nothing further."""
    _ep(tmp_path, "01OLD", meta={"host_id": "lab1"})
    _ep(tmp_path, "01NEW", meta={"code_version": {"commit": "a" * 40}})

    # First pass: its result is irrelevant here, only its side effects matter.
    qu.drain(tmp_path)
    second_pass = qu.drain(tmp_path)

    # Second pass: only the still-live stamped episode is scanned.
    assert second_pass.scanned == 1
    assert second_pass.kept_stamped == 1
    assert second_pass.quarantined == 0
def test_drain_missing_data_root_is_noop(tmp_path: Path) -> None:
    """Draining a data root that does not exist reports zero work and no error.

    This covers first boot, when episodes/ may not have been created yet.
    """
    absent_root = tmp_path / "does-not-exist"

    summary = qu.drain(absent_root)

    assert summary.scanned == 0
    assert summary.quarantined == 0
def test_drain_dry_run_moves_nothing(tmp_path: Path, capsys: pytest.CaptureFixture) -> None:
    """With dry_run=True the drain reports what it would do but moves nothing."""
    _ep(tmp_path, "01OLD", meta={"host_id": "lab1"})

    summary = qu.drain(tmp_path, dry_run=True)

    assert summary.quarantined == 1  # counted as if quarantined

    # But the episode is still in episodes/ — nothing actually moved.
    live_dir = tmp_path / "episodes" / "01OLD"
    quarantined_dir = tmp_path / "quarantine" / "01OLD"
    assert live_dir.exists()
    assert not quarantined_dir.exists()

    # The planned action is announced on stdout.
    captured = capsys.readouterr()
    assert "would-quarantine 01OLD" in captured.out
def test_drain_collision_keeps_quarantine_copy(tmp_path: Path) -> None:
    """A live episode whose id is already in quarantine/ is dropped.

    This is the state left behind when an earlier drain already put the
    same id into quarantine. The expected outcome is that the live copy is
    silently removed and the earlier quarantine entry stays as it was
    (matching the queue's _quarantine path semantics, per the original
    author's note).
    """
    _ep(tmp_path, "01DUP", meta={"host_id": "lab1"})
    # Pre-existing quarantine entry from a previous run:
    earlier_entry = tmp_path / "quarantine" / "01DUP"
    earlier_entry.mkdir(parents=True)
    (earlier_entry / "meta.json").write_text("{}")

    res = qu.drain(tmp_path)

    assert res.quarantined == 1
    assert not (tmp_path / "episodes" / "01DUP").exists()
    assert (earlier_entry / "meta.json").exists()
    # The live meta.json held {"host_id": "lab1"}; reading back "{}" shows
    # the earlier quarantine copy was kept and not overwritten by the live
    # one. An existence check alone cannot tell the two apart.
    assert (earlier_entry / "meta.json").read_text() == "{}"