From c1c8e98180b70cf1c2c966df2f16c4a8fd18ccfd Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 8 May 2026 14:12:34 -0500 Subject: [PATCH] =?UTF-8?q?scripts/train-pi-cpu-models.sh=20=E2=80=94=20se?= =?UTF-8?q?quential=20Pi-side=20trainer=20chain?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pi has 4 cores; only KNN and tree-based models are realistic to train here without GPU. While Lambda runs the full 16-job manifest in parallel (~1.7h), this chain trains the CPU-friendly subset on the Pi (~30 min) so scenes 8 & 12 populate with multi-model numbers within minutes instead of waiting on Lambda's full cycle. Order: gbt-realistic, knn-realistic, knn-oracle, knn_semi-realistic, knn_semi-oracle. Skips models whose .ckpt.json already exists (idempotent restart). Each is a subprocess of training/trainer/run.py so XGBoost/numpy/sklearn don't fight each other for cores. Caller is expected to start gbt-oracle separately (it's the longest single training and we kicked it off before invoking this script). Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/train-pi-cpu-models.sh | 76 ++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100755 scripts/train-pi-cpu-models.sh diff --git a/scripts/train-pi-cpu-models.sh b/scripts/train-pi-cpu-models.sh new file mode 100755 index 0000000..7aac131 --- /dev/null +++ b/scripts/train-pi-cpu-models.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# Sequentially train the CPU-friendly subset of the manifest on the Pi. +# Lambda is training the full set in parallel (~1.7h); this fills the +# dashboard with multi-model numbers in ~30 min while we wait. +# +# Order: by job priority (GBT first, then KNN, then KNN-semi). Each +# model is a subprocess so XGBoost/numpy/sklearn don't fight each +# other for cores. After each finishes, we publish its model_metric +# + model_perf (sticky tick) so scenes 8 & 12 light up incrementally. 
#
# Skips any model whose checkpoint already exists (same idempotency
# rule the lambda bootstrap uses).
#
# NOTE(review): the header above says model_metric / model_perf are
# published after each job, but nothing in this script publishes
# anything — presumably training.trainer.run does it; confirm.
#
# Environment:
#   TRAIN_REPO  optional; repository root
#               (default: /home/max/.env/CIS490).
#
# Exit status:
#   0  every job was either trained or skipped
#   1  at least one training failed, or the environment is unusable
#      (missing interpreter, output directories cannot be created,
#      repository root not accessible)

# No `-e` on purpose: one failed training must not abort the chain.
# Failures are counted and reported through the final exit status.
set -uo pipefail

readonly REPO="${TRAIN_REPO:-/home/max/.env/CIS490}"
readonly VENV="$REPO/.venv-training/bin/python"
readonly DATA="$REPO/data/processed"
readonly ART="$REPO/artifacts"
readonly RPT="$REPO/reports/eval"
readonly LOGS="$REPO/data/logs"

# Fail once, clearly, instead of reporting every job as a training
# failure when the interpreter itself is missing.
if [[ ! -x "$VENV" ]]; then
  echo "[FAIL] python interpreter not found or not executable: $VENV" >&2
  exit 1
fi

if ! mkdir -p "$ART" "$RPT" "$LOGS"; then
  echo "[FAIL] cannot create output directories under $REPO" >&2
  exit 1
fi

# `python -m` puts the current directory at the front of sys.path, so
# the module lookup depends on where the script was started from.
# Presumably the `training` package lives at $REPO/training — confirm.
if ! cd "$REPO"; then
  echo "[FAIL] cannot cd to $REPO" >&2
  exit 1
fi

# (model, mode, label) tuples — the CPU-friendly slice
readonly JOBS=(
  "gbt:realistic:gbt_realistic"
  "knn:realistic:knn_realistic"
  "knn:oracle:knn_oracle"
  "knn_semi:realistic:knn_semi_realistic"
  "knn_semi:oracle:knn_semi_oracle"
)
# gbt:oracle is presumed already running or done — caller starts it
# explicitly before invoking this script.

n_done=0
n_skipped=0
n_failed=0

#######################################
# Train a single job unless its checkpoint file already exists.
# Globals:
#   VENV, DATA, ART, RPT, LOGS (read)
#   n_done, n_skipped, n_failed (incremented)
# Arguments:
#   $1 - one "model:mode:label" entry from JOBS
# Outputs:
#   Progress lines on stdout; the trainer's stdout+stderr are written
#   to $LOGS/pi_<label>.log (overwritten on every attempt).
# Returns:
#   0 always; the outcome is recorded in the counters.
#######################################
run_job() {
  local model mode label
  IFS=':' read -r model mode label <<<"$1"

  local ckpt="$ART/${label}.ckpt.json"
  local log="$LOGS/pi_${label}.log"

  if [[ -f "$ckpt" ]]; then
    echo "[skip] $label (already trained)"
    n_skipped=$((n_skipped + 1))
    return 0
  fi

  echo "[train] $label → $log"
  local started elapsed f1
  started=$(date +%s)
  if "$VENV" -m training.trainer.run \
    --model "$model" --mode "$mode" \
    --validation "$DATA/validation_v1.parquet" \
    --summary "$DATA/features_window_v1.parquet" \
    --tensors "$DATA/tensor_window_v1" \
    --schema "$DATA/feature_schema_v1.json" \
    --out-dir "$ART" \
    --reports-dir "$RPT" \
    --train-hosts elliott-thinkpad \
    >"$log" 2>&1; then
    elapsed=$(($(date +%s) - started))
    # Value from the last "TEST macro_f1 = <number>" line in the log;
    # stays empty (printed as "?") when the log has no such line.
    f1=$(grep -oE "TEST macro_f1 = [0-9.]+" "$log" \
      | tail -n 1 \
      | awk '{print $NF}')
    echo "[done] $label ${elapsed}s test_macro_f1=${f1:-?}"
    n_done=$((n_done + 1))
  else
    echo "[FAIL] $label — last 10 lines:"
    tail -n 10 "$log"
    n_failed=$((n_failed + 1))
  fi
  return 0
}

for entry in "${JOBS[@]}"; do
  run_job "$entry"
done

echo
echo "summary: done=$n_done skipped=$n_skipped failed=$n_failed"
echo "artifacts:"
# ls exits non-zero when the glob matches nothing; that is expected
# before the first successful training, hence the quiet fallback.
ls -lh "$ART"/*.ckpt.json 2>/dev/null || echo " (none yet)"

# Surface failures to the caller; without this the script's status
# would be that of the listing above, i.e. always 0.
if ((n_failed > 0)); then
  exit 1
fi