From c1c8e98180b70cf1c2c966df2f16c4a8fd18ccfd Mon Sep 17 00:00:00 2001
From: Max <mgorog@gmail.com>
Date: Fri, 8 May 2026 14:12:34 -0500
Subject: [PATCH] =?UTF-8?q?scripts/train-pi-cpu-models.sh=20=E2=80=94=20se?=
 =?UTF-8?q?quential=20Pi-side=20trainer=20chain?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pi has 4 cores; only KNN and tree-based models are realistic to train
here without GPU. While Lambda runs the full 16-job manifest in
parallel (~1.7h), this chain trains the CPU-friendly subset on the
Pi (~30 min) so scenes 8 & 12 populate with multi-model numbers
within minutes instead of waiting on Lambda's full cycle.

Order: gbt-realistic, knn-realistic, knn-oracle, knn_semi-realistic,
knn_semi-oracle. Skips models whose .ckpt.json already exists
(idempotent restart). Each model runs as its own subprocess of
training/trainer/run.py, one at a time, so XGBoost/numpy/sklearn
don't fight each other for cores.

Caller is expected to start gbt-oracle separately (it's the longest
single training and we kicked it off before invoking this script).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/train-pi-cpu-models.sh | 76 ++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100755 scripts/train-pi-cpu-models.sh

diff --git a/scripts/train-pi-cpu-models.sh b/scripts/train-pi-cpu-models.sh
new file mode 100755
index 0000000..7aac131
--- /dev/null
+++ b/scripts/train-pi-cpu-models.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# Sequentially train the CPU-friendly subset of the manifest on the Pi.
+# Lambda is training the full set in parallel (~1.7h); this fills the
+# dashboard with multi-model numbers in ~30 min while we wait.
+#
+# Jobs run one at a time in JOBS order (GBT, KNN, then KNN-semi), each
+# as its own trainer process so XGBoost/numpy/sklearn don't fight for
+# cores. A job whose checkpoint already exists is skipped (same
+# idempotency rule the lambda bootstrap uses).
+# NOTE(review): scenes 8 & 12 should light up per finished model, but no
+# model_metric/model_perf publish step is visible here — confirm it.
+# Env: CIS490_REPO overrides the repo root. Exit status: 1 if any job failed.
+set -uo pipefail  # no -e on purpose: a failed job is counted, not fatal
+
+REPO=${CIS490_REPO:-/home/max/.env/CIS490}
+VENV=$REPO/.venv-training/bin/python
+DATA=$REPO/data/processed
+ART=$REPO/artifacts
+RPT=$REPO/reports/eval
+LOGS=$REPO/data/logs
+# Assumes training/ sits at the $REPO root: lets `-m` resolve from any cwd.
+export PYTHONPATH="$REPO${PYTHONPATH:+:$PYTHONPATH}"
+
+mkdir -p "$ART" "$RPT" "$LOGS"
+
+# model:mode:label tuples — the CPU-friendly slice
+JOBS=(
+    "gbt:realistic:gbt_realistic"
+    "knn:realistic:knn_realistic"
+    "knn:oracle:knn_oracle"
+    "knn_semi:realistic:knn_semi_realistic"
+    "knn_semi:oracle:knn_semi_oracle"
+)
+# gbt:oracle is presumed already running or done — caller starts it
+# explicitly before invoking this script.
+
+n_done=0 n_skipped=0 n_failed=0
+
+for entry in "${JOBS[@]}"; do
+    IFS=':' read -r model mode label <<<"$entry"
+    ckpt="$ART/${label}.ckpt.json"
+    log="$LOGS/pi_${label}.log"
+
+    if [[ -f "$ckpt" ]]; then
+        echo "[skip] $label (already trained)"
+        n_skipped=$((n_skipped + 1))
+        continue
+    fi
+
+    echo "[train] $label  → $log"
+    started=$SECONDS
+    if "$VENV" -m training.trainer.run \
+        --model "$model" --mode "$mode" \
+        --validation "$DATA/validation_v1.parquet" \
+        --summary "$DATA/features_window_v1.parquet" \
+        --tensors "$DATA/tensor_window_v1" \
+        --schema "$DATA/feature_schema_v1.json" \
+        --out-dir "$ART" \
+        --reports-dir "$RPT" \
+        --train-hosts elliott-thinkpad \
+        > "$log" 2>&1; then
+        elapsed=$((SECONDS - started))
+        f1=$(grep -oE "TEST macro_f1 = [0-9.]+" "$log" | tail -n 1 | awk '{print $NF}')
+        echo "[done] $label  ${elapsed}s  test_macro_f1=${f1:-?}"
+        n_done=$((n_done + 1))
+    else
+        echo "[FAIL] $label — last 10 lines:"
+        tail -n 10 "$log"
+        n_failed=$((n_failed + 1))
+    fi
+done
+
+printf '\nsummary: done=%d  skipped=%d  failed=%d\n' "$n_done" "$n_skipped" "$n_failed"
+echo "artifacts:"
+ls -lh "$ART"/*.ckpt.json 2>/dev/null || echo "  (none yet)"
+(( n_failed == 0 )) || exit 1  # surface failures to the caller's exit check