install-lab-host: auto-install perf + tcpdump on Arch / Debian / RHEL
PIPELINE.md §4.4 requires every collector in the active set to actually
work end-to-end. On k-gamingcom (commit dac03d2 episode at 02:21Z) the
new perf_unavailable lifecycle event surfaced a concrete cause:
`reason: binary_not_on_path` — perf is enabled but the binary isn't
installed. Same story with tcpdump on k-gamingcom (pcap_unavailable
events with `error: tcpdump not found`).
The canonical install script is the right place to ensure the deps
are present. detect_os reads /etc/os-release; ensure_collector_packages
installs `perf` (Arch / RHEL) or `linux-perf` + `linux-tools-generic`
(Debian/Ubuntu) plus `tcpdump`. After the install attempt the script
re-checks `command -v` and dies loudly if either is still missing —
silent failure is forbidden per §1, so an install failure has to be
observable.
Idempotent (`--needed` / equivalent skips already-installed packages).
The operator owns full system upgrades; this only does targeted package
installs. On unknown distros it logs a warning and dies on the follow-up
check, with a clear pointer to install perf/tcpdump by hand.
The next autoupdate tick on k-gamingcom should pull this and
self-install perf + tcpdump, after which rows_perf > 0 and pcap should
start producing bytes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
dac03d2eff
commit
c41763bd28
1 changed files with 86 additions and 1 deletions
|
|
@ -28,6 +28,78 @@ SERVICE_USER="${SERVICE_USER:-cis490}"
|
|||
# Write one diagnostic line, tagged with the script name, to stderr.
# All arguments are joined into a single message.
log() {
  printf '[install-lab-host] %s\n' "$*" 1>&2
}
|
||||
# Report a fatal error through log, then terminate the shell with status 1.
die() {
  log "FATAL: $*"
  exit 1
}
|
||||
|
||||
# Detect the distro family from an os-release file.
#
# Arguments: $1 - os-release file to read (optional; defaults to
#                 /etc/os-release)
# Outputs:   one of "arch" | "debian" | "rhel" | "other" on stdout
#
# Used to install collector binaries (perf, tcpdump) that the collectors
# need but which aren't always pre-installed. Per PIPELINE.md §1, a host
# that can't host the canonical experiment must produce zero episodes —
# this function is the install-time path to actually installing the deps
# so the host CAN host the experiment, rather than the bandaid path of
# running without them.
detect_os() {
  local os_release=${1:-/etc/os-release}
  [[ -f "$os_release" ]] || { echo other; return; }
  (
    # Subshell: the variables defined by os-release stay out of the
    # caller's shell. ID / ID_LIKE are cleared first so values inherited
    # from the environment cannot influence the match when the file
    # omits one of them (ID_LIKE is optional).
    unset ID ID_LIKE
    # shellcheck disable=SC1090,SC1091
    . "$os_release"
    # The space keeps ID and ID_LIKE from fusing into an accidental match.
    case "${ID:-} ${ID_LIKE:-}" in
      *arch*) echo arch ;;
      *debian*|*ubuntu*) echo debian ;;
      *rhel*|*fedora*|*centos*) echo rhel ;;
      *) echo other ;;
    esac
  )
}
|
||||
|
||||
# Install collector binaries (perf, tcpdump) that aren't already on PATH.
#
# Idempotent: returns immediately when both binaries resolve, and the
# package managers skip packages that are already installed. We do NOT
# do a full system upgrade (-Syu / dist-upgrade) — operator owns that.
#
# Globals:   none
# Arguments: none
# Outputs:   progress and warnings via log
# Returns:   0 always. A failed install is only logged; the script falls
#            through to the `command -v` checks that follow the call
#            site, which die loudly with a clear message.
ensure_collector_packages() {
  local need_perf=0 need_tcpdump=0
  command -v perf >/dev/null || need_perf=1
  command -v tcpdump >/dev/null || need_tcpdump=1
  if (( need_perf == 0 && need_tcpdump == 0 )); then
    return 0
  fi

  local os
  os=$(detect_os)
  log "missing collector binaries (perf=$need_perf tcpdump=$need_tcpdump); os=$os"

  local pkgs=() pkg
  case "$os" in
    arch)
      # Arch ships perf in the `perf` package (from extra). The
      # canonical kernel is `linux`; if the operator runs a
      # different kernel flavor (linux-lts, linux-zen) the perf
      # package is the same — perf is kernel-version-aware via
      # `uname -r` at runtime.
      if (( need_perf )); then pkgs+=(perf); fi
      if (( need_tcpdump )); then pkgs+=(tcpdump); fi
      log "pacman -S --needed --noconfirm ${pkgs[*]}"
      pacman -S --needed --noconfirm "${pkgs[@]}" \
        || log "WARN: pacman install failed; falling through to die-on-missing"
      ;;
    debian)
      # apt-get refuses the whole transaction when any named package
      # cannot be located, so every package gets its own invocation:
      # an unknown perf package name must not block tcpdump, and one
      # perf candidate must not block the other.
      if (( need_tcpdump )); then
        log "apt-get install -y tcpdump"
        DEBIAN_FRONTEND=noninteractive apt-get install -y tcpdump \
          || log "WARN: apt-get install tcpdump failed; falling through to die-on-missing"
      fi
      if (( need_perf )); then
        # On Debian/Ubuntu perf comes from linux-perf or
        # linux-tools-generic depending on release; try each in
        # turn and stop as soon as perf resolves on PATH.
        for pkg in linux-perf linux-tools-generic; do
          log "apt-get install -y $pkg"
          DEBIAN_FRONTEND=noninteractive apt-get install -y "$pkg" \
            || log "WARN: apt-get install $pkg failed"
          if command -v perf >/dev/null; then
            break
          fi
        done
      fi
      ;;
    rhel)
      # Prefer dnf; fall back to yum on releases that do not ship dnf.
      local pkg_mgr=dnf
      command -v dnf >/dev/null || pkg_mgr=yum
      if (( need_perf )); then pkgs+=(perf); fi
      if (( need_tcpdump )); then pkgs+=(tcpdump); fi
      log "$pkg_mgr install -y ${pkgs[*]}"
      "$pkg_mgr" install -y "${pkgs[@]}" \
        || log "WARN: $pkg_mgr install failed; falling through to die-on-missing"
      ;;
    *)
      log "WARN: unknown distro; cannot auto-install perf/tcpdump"
      log "      install them by hand for collectors §4.4 to pass"
      ;;
  esac
  return 0
}
|
||||
|
||||
# --- 1. prereqs --------------------------------------------------------
|
||||
# Progress marker on stderr before the prerequisite checks run.
log "checking prereqs"
|
||||
|
||||
|
|
@ -36,9 +108,22 @@ if [[ $EUID -ne 0 ]]; then
|
|||
fi
|
||||
# Always-required prerequisites. Each check aborts the install through
# die on the first missing item. The zstd hint is deliberately
# distro-neutral; the check is performed once.
command -v systemctl >/dev/null || die "systemd not found"
command -v qemu-system-x86_64 >/dev/null || die "qemu-system-x86_64 not on PATH"
command -v zstd >/dev/null || die "zstd not on PATH (install via your package manager)"
[[ -e /dev/kvm ]] || die "/dev/kvm missing — KVM not available"
|
||||
|
||||
# Auto-install collector binaries that aren't on PATH. Done after the
# always-required checks above (qemu, kvm, zstd, systemd) which the
# operator has to provide themselves at OS install time.
ensure_collector_packages

# Re-check perf/tcpdump after the install attempt and die loudly if
# still missing. Per §4.4 collectors that can't run shouldn't ship
# silently — the host fails install instead so operator notices.
command -v perf >/dev/null || die \
  "perf not on PATH after install attempt — collector source 3 (oracle perf-stat) requires it. See ensure_collector_packages above for what was tried."
command -v tcpdump >/dev/null || die \
  "tcpdump not on PATH after install attempt — collector source 4 (bridge pcap + netflow) requires it. See ensure_collector_packages above for what was tried."
|
||||
|
||||
# uv is preferred (lockfile-driven). Fall back to system pip if absent.
# USE_UV is 1 when a `uv` command resolves, 0 otherwise.
USE_UV=0
if command -v uv >/dev/null; then
  USE_UV=1
fi
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue