Three robustness items off the future-work list:
1. Shipper sd_notify watchdog. Type=notify + WatchdogSec=180. The
daemon sends READY=1 after queue construction and WATCHDOG=1 once
per scan pass via a heartbeat callback wired into run_forever.
Restart=on-failure only catches process death — silent stalls
(deadlock, hung tar subprocess, blocked I/O past timeout) used to
leave a hung daemon running while the data backlog kept growing. Now
systemd kills and restarts the daemon if no WATCHDOG=1 arrives within 180s.
Verified end-to-end against systemd via `systemd-run --transient
--property=Type=notify --property=WatchdogSec=10`: unit transitions
to active on READY=1; SIGSTOP'ing the process triggers
`Watchdog timeout (limit 10s)! Killing process N with SIGABRT` at
exactly t+10s, then unit goes failed → restart cycle.
2. Quarantine cleanup. Without an upper bound, data/quarantine/ grew
forever as fatal episodes piled up. New ShipperConfig fields:
quarantine_keep_days = 30 # opt-out: 0 disables
quarantine_cleanup_interval_s = 3600 # gate so 5s tick doesn't
# statx() the whole tree
Cleanup runs at the start of run_once() but is gated to at most once
per quarantine_cleanup_interval_s (hourly by default). Removed entries are logged.
3. Doctor surfaces shipping errors. Tails 10 minutes of cis490-shipper
journal and surfaces 412/400/transient patterns as red/yellow rows
with the canonical fix command. An on-device agent running
cis490_doctor.py now sees one line ("12 ship(s) rejected as
out-of-window") instead of needing to grep the journal.
Tests: 200/200 (was 188). New coverage: heartbeat callback fires +
survives exceptions; quarantine cleanup respects keep_days, gate, and
opt-out; doctor parser correctly classifies 412/400/transient/clean/
empty/journalctl-denied; when both error classes are present together,
412 (the more actionable one) is prioritised.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
141 lines
4.4 KiB
Python
141 lines
4.4 KiB
Python
"""``cis490-shipper`` CLI entrypoint.
|
|
|
|
Modes:
|
|
|
|
--ping hit /v1/ping; exit 0 if 200/ok, non-zero otherwise.
|
|
No tarball flow; index.jsonl on the receiver is untouched.
|
|
--once one scan pass over data/episodes/, ship anything done, exit.
|
|
(default) long-running daemon; rescans every scan_interval_s.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import signal
|
|
import socket
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from .config import ShipperConfig
|
|
from .queue import ShipperQueue
|
|
from .transport import ShipperTransport
|
|
|
|
|
|
def _sd_notify(msg: str) -> None:
    """Deliver one sd_notify(3) message to systemd, best-effort.

    The destination is read from the ``NOTIFY_SOCKET`` environment
    variable. When it is unset or empty the call does nothing, so the
    same binary behaves under `--once`, manual invocation, or tests.

    The wire format is a single key=value line sent as one datagram on
    an AF_UNIX SOCK_DGRAM socket, which the stdlib handles directly —
    no libsystemd dependency is needed.
    """
    target = os.getenv("NOTIFY_SOCKET")
    if target is None or target == "":
        # Not supervised by systemd: nothing to notify.
        return

    # A leading '@' denotes an abstract socket; the kernel spells that
    # as a leading NUL byte in place of the '@'.
    address = ("\0" + target[1:]) if target[0] == "@" else target

    try:
        with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as notify_sock:
            notify_sock.sendto(msg.encode("ascii"), address)
    except OSError:
        # A failed notification is not fatal: the worst outcome is that
        # systemd's WatchdogSec fires and restarts us, which is exactly
        # what the watchdog is for.
        return
|
|
|
|
|
|
def _setup_logging(level: str) -> None:
    """Configure root logging through ``logging.basicConfig``.

    ``level`` is a case-insensitive level name such as ``"debug"`` or
    ``"INFO"``. Any name that does not resolve to an integer level
    constant on the ``logging`` module falls back to ``logging.INFO``.
    """
    resolved = getattr(logging, level.upper(), None)
    # getattr() happily returns non-level module attributes (e.g.
    # "BASIC_FORMAT" is a format string), which basicConfig() would
    # reject with an exception. Only integer constants are real levels.
    if not isinstance(resolved, int):
        resolved = logging.INFO
    logging.basicConfig(
        level=resolved,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the ``cis490-shipper`` command-line parser."""
    parser = argparse.ArgumentParser(prog="cis490-shipper")
    parser.add_argument(
        "--config",
        default="/etc/cis490/lab-host.toml",
        help="Path to lab-host config (TOML)",
    )
    parser.add_argument(
        "--ping",
        action="store_true",
        help="Hit /v1/ping on the receiver and exit",
    )
    parser.add_argument(
        "--once",
        action="store_true",
        help="One scan pass, then exit (default is long-running daemon)",
    )
    parser.add_argument("--log-level", default="INFO")
    return parser


def _run_ping(cfg: ShipperConfig, transport: ShipperTransport) -> int:
    """Ping the receiver, print a JSON summary, return 0 on ok else 1."""
    result = transport.ping()
    # Print structured one-liner for CI / test pipelines.
    print(json.dumps({
        "ok": result.ok,
        "status_code": result.status_code,
        "host_id": cfg.host_id,
        "receiver": cfg.receiver.url,
        "body": result.body,
        "error": result.error,
    }))
    return 0 if result.ok else 1


def _run_once(queue: ShipperQueue, log: logging.Logger) -> int:
    """Run a single scan pass, log its counters, return 1 if any fatal."""
    result = queue.run_once()
    log.info(
        "scan complete: scanned=%d shipped=%d transient=%d conflicts=%d fatal=%d",
        result.scanned, result.shipped, result.transient_failures,
        result.conflicts, result.fatal,
    )
    # Exit code reflects fatal-only; transient failures aren't an error
    # because the next pass / pod restart will retry.
    return 1 if result.fatal else 0


def _run_daemon(cfg: ShipperConfig, queue: ShipperQueue, log: logging.Logger) -> int:
    """Run the scan loop until SIGTERM/SIGINT, notifying systemd; return 0."""
    stopping = False

    def _stop(signum, frame):  # noqa: ARG001
        nonlocal stopping
        log.info("received signal %s; finishing pass and exiting", signum)
        stopping = True

    signal.signal(signal.SIGTERM, _stop)
    signal.signal(signal.SIGINT, _stop)

    log.info(
        "shipper starting: host_id=%s data_root=%s receiver=%s",
        cfg.host_id, cfg.data_root, cfg.receiver.url,
    )
    # Tell systemd we're ready to take work — gates Type=notify in
    # the unit file. The systemd unit's WatchdogSec= will then expect
    # WATCHDOG=1 messages at least every <WatchdogSec> seconds; a
    # missed one means stalled-mid-loop and triggers a kill+restart.
    _sd_notify("READY=1")
    queue.run_forever(
        stop_check=lambda: stopping,
        heartbeat=lambda: _sd_notify("WATCHDOG=1"),
    )
    _sd_notify("STOPPING=1")
    return 0


def main(argv: list[str] | None = None) -> int:
    """Entry point for ``cis490-shipper``; returns the process exit code.

    ``argv`` is the argument list to parse; ``None`` lets argparse read
    ``sys.argv``. Modes are checked in this order: ``--ping``, then
    ``--once``, then the default long-running daemon.

    Exit codes:
        2  the config could not be loaded
        0/1  ``--ping``: receiver answered ok / did not
        0/1  ``--once``: no fatal episodes / at least one fatal episode
        0  daemon mode stopped after SIGTERM or SIGINT
    """
    args = _build_arg_parser().parse_args(argv)

    _setup_logging(args.log_level)
    log = logging.getLogger("cis490.shipper")

    try:
        cfg = ShipperConfig.load(args.config)
    except (OSError, ValueError) as e:
        # OSError rather than just FileNotFoundError: an unreadable
        # config (permissions, path is a directory) is equally a config
        # error and should exit 2 with one log line, not a traceback.
        log.error("config error: %s", e)
        return 2

    transport = ShipperTransport(cfg)
    if args.ping:
        return _run_ping(cfg, transport)

    queue = ShipperQueue(cfg, transport)
    if args.once:
        return _run_once(queue, log)

    return _run_daemon(cfg, queue, log)
|
|
|
|
|
|
# Script entry: run the CLI and hand its return value to the interpreter
# as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|