scripts/train-pi-cpu-models.sh — sequential Pi-side trainer chain

The Pi has 4 cores, so only KNN and tree-based models are realistic to train here without a GPU. While Lambda runs the full 16-job manifest in parallel (~1.7h), this chain trains the CPU-friendly subset on the Pi (~30 min) so scenes 8 & 12 populate with multi-model numbers within minutes instead of waiting on Lambda's full cycle. Order: gbt-realistic, knn-realistic, knn-oracle, knn_semi-realistic, knn_semi-oracle. Models whose .ckpt.json already exists are skipped, so the script can be restarted safely (idempotent). Each model runs in its own subprocess (training/trainer/run.py) so XGBoost/numpy/sklearn don't fight each other for cores. The caller is expected to start gbt-oracle separately (it is the longest single training run, and it was kicked off before this script was invoked).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 14:12:34 -05:00 · 2026-05-08 14:12:34 -05:00 · c1c8e98180
commit c1c8e98180
parent 05bccac29f
1 changed files with 76 additions and 0 deletions
--- a/scripts/train-pi-cpu-models.sh
+++ b/scripts/train-pi-cpu-models.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# Train the CPU-friendly slice of the manifest on the Pi, one job at a time.
+# Lambda trains the full set in parallel (~1.7h); this chain is meant to put
+# multi-model numbers on the dashboard in ~30 min while that runs.
+#
+# Jobs run in JOBS order (GBT, then KNN, then KNN-semi), each as its own
+# subprocess so XGBoost/numpy/sklearn don't fight each other for cores.
+# NOTE(review): this script has no publish step for model_metric/model_perf;
+# scenes 8 & 12 presumably update via the trainer or elsewhere — confirm.
+# -e is unset: a failed job is counted and the loop moves on; the exit status does not reflect n_failed.
+# A job is skipped when $ART/<label>.ckpt.json already exists (described as the
+# same idempotency rule the lambda bootstrap uses).
+set -uo pipefail
+
+REPO=/home/max/.env/CIS490
+VENV=$REPO/.venv-training/bin/python
+DATA=$REPO/data/processed
+ART=$REPO/artifacts
+RPT=$REPO/reports/eval
+LOGS=$REPO/data/logs
+
+mkdir -p "$ART" "$RPT" "$LOGS"
+
+# Colon-separated "model:mode:label" entries; label names the checkpoint and log.
+JOBS=(
+    "gbt:realistic:gbt_realistic"
+    "knn:realistic:knn_realistic"
+    "knn:oracle:knn_oracle"
+    "knn_semi:realistic:knn_semi_realistic"
+    "knn_semi:oracle:knn_semi_oracle"
+)
+# gbt:oracle is intentionally absent: the caller is expected to have started
+# it separately before invoking this script.
+
+n_done=0
+n_skipped=0
+n_failed=0
+
+for entry in "${JOBS[@]}"; do
+    IFS=':' read -r model mode label <<<"$entry"
+    ckpt="$ART/${label}.ckpt.json"
+    log="$LOGS/pi_${label}.log"
+
+    if [[ -f "$ckpt" ]]; then
+        echo "[skip] $label (already trained)"
+        n_skipped=$((n_skipped + 1))
+        continue
+    fi
+
+    echo "[train] $label  → $log"
+    started=$(date +%s)
+    if "$VENV" -m training.trainer.run \
+        --model "$model" --mode "$mode" \
+        --validation "$DATA/validation_v1.parquet" \
+        --summary "$DATA/features_window_v1.parquet" \
+        --tensors "$DATA/tensor_window_v1" \
+        --schema "$DATA/feature_schema_v1.json" \
+        --out-dir "$ART" \
+        --reports-dir "$RPT" \
+        --train-hosts elliott-thinkpad \
+        > "$log" 2>&1; then
+        elapsed=$(( $(date +%s) - started ))
+        f1=$(grep -oE "TEST macro_f1 = [0-9.]+" "$log" | tail -1 | awk '{print $NF}')
+        echo "[done] $label  ${elapsed}s  test_macro_f1=${f1:-?}"
+        n_done=$((n_done + 1))
+    else
+        echo "[FAIL] $label — last 10 lines:"
+        tail -10 "$log"
+        n_failed=$((n_failed + 1))
+    fi
+done
+
+echo
+echo "summary: done=$n_done  skipped=$n_skipped  failed=$n_failed"
+echo "artifacts:"
+ls -lh "$ART"/*.ckpt.json 2>/dev/null || echo "  (none yet)"