install-lab-host: auto-install perf + tcpdump on Arch / Debian / RHEL

PIPELINE.md §4.4 requires every collector in the active set to actually
work end-to-end. On k-gamingcom (commit dac03d2 episode at 02:21Z) the
new perf_unavailable lifecycle event surfaced a concrete cause:
`reason: binary_not_on_path` — perf is enabled but the binary isn't
installed. Same story with tcpdump on k-gamingcom (pcap_unavailable
events with `error: tcpdump not found`).

The canonical install script is the right place to ensure the deps
are present. detect_os reads /etc/os-release; ensure_collector_packages
installs `perf` (Arch / RHEL) or `linux-perf` + `linux-tools-generic`
(Debian/Ubuntu) plus `tcpdump`. After the install attempt the script
re-checks `command -v` and dies loudly if either is still missing —
silent failure is forbidden per §1, so an install failure has to be
observable.

Idempotent (`--needed` / equivalent skips already-installed packages).
Operator owns full system upgrades; this only does targeted package
install. On unknown distros it logs a warning and dies on the follow-up
check, with a clear pointer to install perf/tcpdump by hand.

The next autoupdate tick on k-gamingcom should pull this and
self-install perf + tcpdump, after which rows_perf > 0 and pcap should
start producing bytes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Max Gorog 2026-05-03 21:26:28 -05:00
parent dac03d2eff
commit c41763bd28

View file

@ -28,6 +28,78 @@ SERVICE_USER="${SERVICE_USER:-cis490}"
# Emit one diagnostic line on stderr, prefixed with the script tag.
# All arguments are joined with single spaces into one message.
log() {
    printf '[install-lab-host] %s\n' "${*}" 1>&2
}
# Report a fatal error through log (message prefixed with "FATAL: ")
# and terminate the script with exit status 1.
die() {
    log "FATAL: ${*}"
    exit 1
}
# Detect the distro family from an os-release file.
#
# Arguments: $1 - optional path to the os-release file
#                 (default: /etc/os-release)
# Outputs:   exactly one of "arch" | "debian" | "rhel" | "other" on stdout;
#            "other" when the file does not exist or nothing matches.
#
# Used to install collector binaries (perf, tcpdump) that the collectors
# need but which aren't always pre-installed. Per PIPELINE.md §1, a host
# that can't host the canonical experiment must produce zero episodes —
# this function is the install-time path to actually installing the deps
# so the host CAN host the experiment, rather than the bandaid path of
# running without them.
#
# The body is a subshell ( ... ) on purpose: sourcing os-release defines
# variables such as NAME / VERSION / ID, and those must not leak into (or
# clobber anything in) the calling script when the function is invoked
# directly rather than through $(...).
detect_os() (
    os_release_file="${1:-/etc/os-release}"
    if [[ ! -f "$os_release_file" ]]; then
        echo other
        exit 0
    fi
    # Drop inherited values so a stale ID/ID_LIKE from the environment
    # cannot decide the result when the file omits those keys.
    unset ID ID_LIKE
    # shellcheck disable=SC1090,SC1091
    . "$os_release_file"
    # ID and ID_LIKE are joined with a space so a pattern can never match
    # across the boundary between the two values.
    case "${ID:-} ${ID_LIKE:-}" in
        *arch*) echo arch ;;
        *debian*|*ubuntu*) echo debian ;;
        *rhel*|*fedora*|*centos*) echo rhel ;;
        *) echo other ;;
    esac
)
# Install collector binaries (perf, tcpdump) that aren't already on PATH.
#
# Globals:   none written; calls log and detect_os.
# Arguments: none
# Returns:   0 always. Install failures are logged as WARN and NOT fatal
#            here: the caller re-runs `command -v` afterwards and dies
#            loudly with a clear message if a binary is still missing.
#
# Idempotent: returns immediately when both binaries are present, and only
# the missing ones are requested from the package manager. We do NOT do a
# full system upgrade (-Syu / dist-upgrade) — operator owns that.
ensure_collector_packages() {
    local need_perf=0 need_tcpdump=0
    command -v perf >/dev/null || need_perf=1
    command -v tcpdump >/dev/null || need_tcpdump=1
    if (( need_perf == 0 && need_tcpdump == 0 )); then
        return 0
    fi

    local os
    os=$(detect_os)
    log "missing collector binaries (perf=$need_perf tcpdump=$need_tcpdump); os=$os"

    local pkgs=() pkg
    case "$os" in
        arch)
            # Arch ships perf in the `perf` package (from extra); the same
            # package is used regardless of kernel flavor (linux,
            # linux-lts, linux-zen).
            if (( need_perf )); then pkgs+=(perf); fi
            if (( need_tcpdump )); then pkgs+=(tcpdump); fi
            log "pacman -S --needed --noconfirm ${pkgs[*]}"
            pacman -S --needed --noconfirm "${pkgs[@]}" \
                || log "WARN: pacman install failed; falling through to die-on-missing"
            ;;
        debian)
            # apt-get aborts the WHOLE transaction when any requested
            # package name is unknown, so each package gets its own
            # invocation: a missing perf candidate must not block tcpdump
            # or the other perf candidate.
            if (( need_tcpdump )); then
                log "apt-get install -y tcpdump"
                DEBIAN_FRONTEND=noninteractive apt-get install -y tcpdump \
                    || log "WARN: apt-get install failed; falling through to die-on-missing"
            fi
            if (( need_perf )); then
                # On Debian/Ubuntu perf comes from linux-perf or
                # linux-tools-generic depending on release; try them in
                # turn and stop as soon as perf shows up on PATH.
                for pkg in linux-perf linux-tools-generic; do
                    log "apt-get install -y $pkg"
                    if DEBIAN_FRONTEND=noninteractive apt-get install -y "$pkg" \
                        && command -v perf >/dev/null; then
                        break
                    fi
                    log "WARN: apt-get install $pkg did not provide perf"
                done
            fi
            ;;
        rhel)
            # Prefer dnf; older RHEL-family hosts only ship yum, which
            # accepts the same `install -y` syntax.
            local rpm_tool=dnf
            command -v dnf >/dev/null || rpm_tool=yum
            if (( need_perf )); then pkgs+=(perf); fi
            if (( need_tcpdump )); then pkgs+=(tcpdump); fi
            log "$rpm_tool install -y ${pkgs[*]}"
            "$rpm_tool" install -y "${pkgs[@]}" \
                || log "WARN: $rpm_tool install failed; falling through to die-on-missing"
            ;;
        *)
            log "WARN: unknown distro; cannot auto-install perf/tcpdump"
            log "      install them by hand for collectors §4.4 to pass"
            ;;
    esac
    return 0
}
# --- 1. prereqs --------------------------------------------------------
log "checking prereqs"
@ -36,9 +108,22 @@ if [[ $EUID -ne 0 ]]; then
fi
# Hard prerequisites the operator has to provide at OS install time; the
# script does not try to install these and aborts on the first one missing.
command -v systemctl >/dev/null || die "systemd not found"
command -v qemu-system-x86_64 >/dev/null || die "qemu-system-x86_64 not on PATH"
# Single zstd check with a distro-neutral hint. The earlier duplicate with
# the Debian-only "apt install zstd" hint ran first and made this message
# unreachable.
command -v zstd >/dev/null || die "zstd not on PATH (install via your package manager)"
[[ -e /dev/kvm ]] || die "/dev/kvm missing — KVM not available"
# Collector binaries (perf, tcpdump) are installed automatically when they
# are not on PATH. This runs only after the always-required checks (qemu,
# kvm, zstd, systemd), which the operator must satisfy themselves at OS
# install time.
ensure_collector_packages

# Verify the install attempt actually worked. Per §4.4 a collector that
# can't run must not ship silently, so a still-missing binary aborts the
# install where the operator will notice it.
if ! command -v perf >/dev/null; then
    die "perf not on PATH after install attempt — collector source 3 (oracle perf-stat) requires it. See ensure_collector_packages above for what was tried."
fi
if ! command -v tcpdump >/dev/null; then
    die "tcpdump not on PATH after install attempt — collector source 4 (bridge pcap + netflow) requires it. See ensure_collector_packages above for what was tried."
fi
# Prefer uv (lockfile-driven) when it is on PATH; otherwise USE_UV stays 0
# and the system pip fallback is used.
if command -v uv >/dev/null; then
    USE_UV=1
else
    USE_UV=0
fi