CIS490/scripts/fetch-lab-host-cert.sh

#!/usr/bin/env bash
# Fetch this lab-host's mTLS leaf cert from the Pi's bootstrap endpoint.
#
# Idempotent. Safe to run repeatedly:
#   - If certs are already on disk, exit 0 immediately (no-op).
#   - If host_id is unset / still REPLACE_ME, exit 0 — the operator
#     hasn't told us who we are yet, so there's nothing to fetch.
#   - If bootstrap.wg can't be reached, exit 0 — network blip; let the
#     timer retry.
#   - On a successful fetch, install certs into $ETC_ROOT/certs/
#     atomically and `systemctl try-restart cis490-shipper` so the
#     running daemon picks up the cert without waiting for its lazy
#     retry.
#
# Run by cis490-cert-fetch.timer (every 5 min) AND by
# install-lab-host.sh on every install. Also safe for an operator to
# invoke manually.
#
# Why this exists as its own script instead of an inline block in
# install-lab-host.sh: install-lab-host.sh does a LOT (cp, venv,
# Tier-3+4 deploy, queue drain, daemon restart) — re-running it
# every 5 min for the cert is overkill and disruptive. Lift just the
# cert step into a fast, idempotent oneshot.

# Strict mode: abort on command errors, on use of unset variables, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Deployment paths and the service account. Each can be overridden from
# the environment; an unset or empty value falls back to the default.
: "${INSTALL_ROOT:=/opt/cis490}"
: "${ETC_ROOT:=/etc/cis490}"
: "${SERVICE_USER:=cis490}"

# Emit one diagnostic line on stderr, tagged with this script's name.
# All arguments are joined into a single message.
log() {
    local message="$*"
    printf '%s %s\n' '[fetch-lab-host-cert]' "$message" 1>&2
}

# Root is required: the script creates and chowns files under
# $ETC_ROOT/certs. Exit status 2 signals this usage error. The message
# names the effective certs directory so it stays accurate when
# ETC_ROOT is overridden.
if (( EUID != 0 )); then
    log "must run as root (writes $ETC_ROOT/certs)"
    exit 2
fi

# Short-circuit when all three cert files already exist. Only presence
# is checked here — expiry and chain validity are deliberately NOT
# validated; the shipper's SSL context build reports a corrupt or
# expired cert in journalctl. Renewal logic would belong in a separate
# script.
have_all_certs=1
for cert_name in lab-host.pem lab-host.key wg-ca.pem; do
    if [[ ! -f "$ETC_ROOT/certs/$cert_name" ]]; then
        have_all_certs=0
        break
    fi
done
if (( have_all_certs )); then
    log "certs already present; nothing to do"
    exit 0
fi

# host_id not set yet? Wait for the operator.
if [[ ! -f "$ETC_ROOT/lab-host.toml" ]]; then
    log "no $ETC_ROOT/lab-host.toml yet; nothing to fetch"
    exit 0
fi

# Take the value of the first line of the form `host_id = "..."`.
# `sed -n` with the `p` flag prints ONLY when the substitution matched,
# so a line that is not a double-quoted string (unquoted, single-quoted)
# yields nothing instead of leaking the raw `host_id = ...` line into
# HOST_ID. An empty `""` value yields an empty HOST_ID. The `|| true`
# keeps a no-match / unreadable file from tripping errexit+pipefail.
HOST_ID="$(sed -nE 's/^host_id[[:space:]]*=[[:space:]]*"([^"]*)".*/\1/p' \
    "$ETC_ROOT/lab-host.toml" 2>/dev/null | head -n 1 || true)"
if [[ -z "$HOST_ID" || "$HOST_ID" == "REPLACE_ME" ]]; then
    log "host_id not set in $ETC_ROOT/lab-host.toml — operator must edit it first"
    exit 0
fi

# HOST_ID becomes a URL path segment and part of file names further
# down, so refuse values that would change the URL structure or walk
# out of a directory. Exit 0 like the other "operator must fix the
# config" cases so the timer unit is not marked failed.
host_id_unsafe_re='[/?#[:space:][:cntrl:]]'
if [[ "$HOST_ID" =~ $host_id_unsafe_re || "$HOST_ID" == "." || "$HOST_ID" == ".." ]]; then
    log "host_id in $ETC_ROOT/lab-host.toml is not usable as a URL path segment / file name — operator must fix it"
    exit 0
fi

# bootstrap.wg's TLS cert is verified against the Caddy root CA that
# ships inside the install tree. A missing bundle means the checkout is
# broken, which is a genuine failure (exit 1), not a retry-later case.
CA_BUNDLE="$INSTALL_ROOT/etc/caddy-root.crt"
if [[ ! -f "$CA_BUNDLE" ]]; then
    log "missing $CA_BUNDLE — install broken"
    exit 1
fi

# Ensure the certs directory exists: root-owned, service group, 0755.
install -d -o root -g "$SERVICE_USER" -m 0755 "$ETC_ROOT/certs"

# Download target for the bootstrap tarball. Created with mktemp rather
# than a predictable /tmp/...-$$ name: this script runs as root, and a
# guessable path in a world-writable directory lets a local user
# pre-plant a symlink that curl would then write through. A unique name
# per run also keeps concurrent runs (timer + manual operator) from
# stomping each other.
TAR="$(mktemp /tmp/cis490-bootstrap.XXXXXX)" \
    || { log "ERROR: could not create a temp file for the bootstrap tarball"; exit 1; }
# Remove the tarball on every exit path.
trap 'rm -f -- "$TAR"' EXIT

log "fetching leaf cert for host_id=$HOST_ID from https://bootstrap.wg/"
# -f: treat HTTP errors as failure; -sS: quiet but still print errors.
# The server cert is verified against the bundled CA only.
if ! curl -fsS --cacert "$CA_BUNDLE" \
        --connect-timeout 10 --max-time 60 \
        "https://bootstrap.wg/v1/cert/$HOST_ID" -o "$TAR"; then
    log "bootstrap.wg fetch failed — will retry on next timer tick"
    log "  if this persists, check:"
    log "  - /etc/hosts: 'getent hosts bootstrap.wg' should return 10.100.0.1"
    log "  - wg0: 'sudo wg show' should list the Pi as a peer"
    log "  - Pi-side: cis490-bootstrap.service active on 10.100.0.1"
    # exit 0 (not 1) so transient network blips don't pin the unit as
    # failed. The timer fires every few minutes, and a pile of failed
    # runs isn't what we want in journalctl.
    exit 0
fi

# Extract into a staging directory inside the certs dir (same
# filesystem, so each final mv is a rename), so a partial extract never
# leaves a mixed-version cert + key on disk.
STAGE="$(mktemp -d "$ETC_ROOT/certs/.stage.XXXXXX")"
# Supersedes the earlier EXIT trap: clean up the staging dir as well as
# the downloaded tarball.
trap 'rm -rf -- "$STAGE" "$TAR"' EXIT

if ! tar -C "$STAGE" -xf "$TAR"; then
    log "ERROR: tarball is malformed"
    exit 1
fi

# Validate the expected files are there before we install. Better to
# fail loudly than half-install. Symlinks are refused because the
# chown/chmod below would follow them and modify whatever they point at.
for f in "ca.crt" "$HOST_ID.pem" "$HOST_ID.key"; do
    [[ -f "$STAGE/$f" ]] || { log "ERROR: bootstrap tarball missing $f"; exit 1; }
    [[ ! -L "$STAGE/$f" ]] || { log "ERROR: bootstrap tarball entry $f is a symlink"; exit 1; }
done

# Set final ownership and modes while the files are still in the
# staging dir, THEN rename into place. Doing it after the mv would
# briefly expose the private key in the 0755 certs dir with whatever
# mode/owner the tarball carried.
chown root:"$SERVICE_USER" \
    "$STAGE/ca.crt" \
    "$STAGE/$HOST_ID.pem" \
    "$STAGE/$HOST_ID.key"
chmod 0644 "$STAGE/ca.crt" "$STAGE/$HOST_ID.pem"
chmod 0640 "$STAGE/$HOST_ID.key"

mv "$STAGE/ca.crt"           "$ETC_ROOT/certs/wg-ca.pem"
mv "$STAGE/$HOST_ID.pem"     "$ETC_ROOT/certs/lab-host.pem"
mv "$STAGE/$HOST_ID.key"     "$ETC_ROOT/certs/lab-host.key"

log "installed mTLS leaf for $HOST_ID"

# Have the shipper load the new cert right away, but only when its unit
# is enabled — a unit the operator deliberately left disabled must not
# be touched. A failed try-restart is logged as a warning and does not
# fail this script.
if systemctl is-enabled --quiet cis490-shipper 2>/dev/null; then
    log "restarting cis490-shipper to load new cert"
    if ! systemctl try-restart cis490-shipper; then
        log "WARN: cis490-shipper try-restart failed"
    fi
fi