# CIS490/shipper/__main__.py

"""``cis490-shipper`` CLI entrypoint.

Modes:

  --ping       hit /v1/ping; exit 0 if 200/ok, non-zero otherwise.
               No tarball flow; index.jsonl on the receiver is untouched.
  --once       one scan pass over data/episodes/, ship anything done, exit.
  (default)    long-running daemon; rescans every scan_interval_s.
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import signal
import socket
import sys
from pathlib import Path

from .config import ShipperConfig
from .queue import ShipperQueue
from .transport import ShipperTransport


def _sd_notify(msg: str) -> None:
    """Best-effort delivery of one sd_notify(3) message to systemd.

    The destination is read from the ``NOTIFY_SOCKET`` environment
    variable. When it is unset or empty (``--once``, manual runs,
    tests) the call does nothing, so the same code path works with or
    without a service manager.

    The wire format is a single ``KEY=value`` datagram on an AF_UNIX
    SOCK_DGRAM socket, which the stdlib can speak directly — no
    libsystemd binding is required.

    Args:
        msg: The notification line, e.g. ``"READY=1"``. It is encoded
            as ASCII before sending.
    """
    target = os.environ.get("NOTIFY_SOCKET", "")
    if target == "":
        return

    # A leading '@' denotes an abstract-namespace socket; the socket
    # layer spells that as a leading NUL byte instead.
    address = "\0" + target[1:] if target.startswith("@") else target

    try:
        notifier = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
        with notifier:
            notifier.sendto(msg.encode("ascii"), address)
    except OSError:
        # A lost notification is not fatal: the worst case is that
        # systemd's WatchdogSec expires and restarts the service,
        # which is exactly what the watchdog is there for.
        return


def _setup_logging(level: str) -> None:
    """Configure root logging through ``logging.basicConfig``.

    Args:
        level: Level name such as ``"debug"`` or ``"INFO"``; matched
            case-insensitively against the ``logging`` module's level
            constants. Any value that does not resolve to an integer
            level falls back to ``logging.INFO``.
    """
    resolved = getattr(logging, level.upper(), None)
    # The attribute lookup can also land on non-level module attributes
    # (e.g. ``logging.BASIC_FORMAT`` is a format string); handing one of
    # those to basicConfig raises ValueError, so only accept real ints.
    if not isinstance(resolved, int):
        resolved = logging.INFO
    logging.basicConfig(
        level=resolved,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(prog="cis490-shipper")
    parser.add_argument(
        "--config",
        default="/etc/cis490/lab-host.toml",
        help="Path to lab-host config (TOML)",
    )
    parser.add_argument(
        "--ping",
        action="store_true",
        help="Hit /v1/ping on the receiver and exit",
    )
    parser.add_argument(
        "--once",
        action="store_true",
        help="One scan pass, then exit (default is long-running daemon)",
    )
    parser.add_argument("--log-level", default="INFO")
    args = parser.parse_args(argv)

    _setup_logging(args.log_level)
    log = logging.getLogger("cis490.shipper")

    try:
        cfg = ShipperConfig.load(args.config)
    except (FileNotFoundError, ValueError) as e:
        log.error("config error: %s", e)
        return 2

    transport = ShipperTransport(cfg)

    if args.ping:
        result = transport.ping()
        # Print structured one-liner for CI / test pipelines.
        print(json.dumps({
            "ok": result.ok,
            "status_code": result.status_code,
            "host_id": cfg.host_id,
            "receiver": cfg.receiver.url,
            "body": result.body,
            "error": result.error,
        }))
        return 0 if result.ok else 1

    queue = ShipperQueue(cfg, transport)
    if args.once:
        result = queue.run_once()
        log.info(
            "scan complete: scanned=%d shipped=%d transient=%d conflicts=%d fatal=%d",
            result.scanned, result.shipped, result.transient_failures,
            result.conflicts, result.fatal,
        )
        # Exit code reflects fatal-only; transient failures aren't an error
        # because the next pass / pod restart will retry.
        return 1 if result.fatal else 0

    # Daemon mode
    stopping = False
    def _stop(signum, frame):  # noqa: ARG001
        nonlocal stopping
        log.info("received signal %s; finishing pass and exiting", signum)
        stopping = True
    signal.signal(signal.SIGTERM, _stop)
    signal.signal(signal.SIGINT, _stop)

    log.info(
        "shipper starting: host_id=%s data_root=%s receiver=%s",
        cfg.host_id, cfg.data_root, cfg.receiver.url,
    )
    # Tell systemd we're ready to take work — gates Type=notify in
    # the unit file. The systemd unit's WatchdogSec= will then expect
    # WATCHDOG=1 messages at least every <WatchdogSec> seconds; a
    # missed one means stalled-mid-loop and triggers a kill+restart.
    _sd_notify("READY=1")
    queue.run_forever(
        stop_check=lambda: stopping,
        heartbeat=lambda: _sd_notify("WATCHDOG=1"),
    )
    _sd_notify("STOPPING=1")
    return 0


if __name__ == "__main__":
    # Script entry: propagate main()'s return value as the exit status.
    exit_status = main()
    sys.exit(exit_status)