#!/usr/bin/env bash
# Sequentially train the CPU-friendly subset of the manifest on the Pi.
# Lambda is training the full set in parallel (~1.7h); this fills the
# dashboard with multi-model numbers in ~30 min while we wait.
#
# Order: by job priority (GBT first, then KNN, then KNN-semi). Each
# model runs as its own subprocess, one at a time, so XGBoost/numpy/sklearn
# don't fight each other for cores.
#
# NOTE(review): this header used to say that model_metric + model_perf
# are published after each model (sticky tick) so scenes 8 & 12 light up
# incrementally. Nothing in this script publishes anything; presumably
# the trainer does that itself — confirm.
#
# Skips any model whose checkpoint already exists (same idempotency
# rule the lambda bootstrap uses).
#
# Environment (both optional):
#   CIS490_REPO         repo root; default /home/max/.env/CIS490
#   CIS490_TRAIN_HOSTS  value for --train-hosts; default elliott-thinkpad
#
# Exit status: 0 when no job failed, 1 when setup failed or at least one
# job failed.
#
# Deliberately no `set -e`: one failing job must not abort the rest.
set -uo pipefail

# model:mode:label tuples — the CPU-friendly slice, in run order.
# gbt:oracle is presumed already running or done — caller starts it
# explicitly before invoking this script.
readonly JOBS=(
  "gbt:realistic:gbt_realistic"
  "knn:realistic:knn_realistic"
  "knn:oracle:knn_oracle"
  "knn_semi:realistic:knn_semi_realistic"
  "knn_semi:oracle:knn_semi_oracle"
)

#######################################
# Print the value of the last "TEST macro_f1 = <number>" occurrence in a log.
# Arguments: $1 - path to the trainer log
# Outputs:   the number on stdout; nothing when the log has no such line
# Returns:   non-zero when nothing matched (pipefail propagates grep's status)
#######################################
extract_macro_f1() {
  grep -oE -- "TEST macro_f1 = [0-9.]+" "$1" | tail -n 1 | awk '{print $NF}'
}

#######################################
# Train every entry of JOBS that has no checkpoint yet, one after another.
# Globals:   JOBS (read), CIS490_REPO (read), CIS490_TRAIN_HOSTS (read)
# Outputs:   progress lines and a summary on stdout; setup errors on stderr;
#            each trainer's stdout+stderr goes to <repo>/data/logs/pi_<label>.log
# Returns:   0 when no job failed; 1 on setup failure or any failed job
#######################################
main() {
  local repo=${CIS490_REPO:-/home/max/.env/CIS490}
  local train_hosts=${CIS490_TRAIN_HOSTS:-elliott-thinkpad}
  local python=$repo/.venv-training/bin/python
  local data=$repo/data/processed
  local art=$repo/artifacts
  local rpt=$repo/reports/eval
  local logs=$repo/data/logs

  # Fail fast: without the interpreter or the output directories every job
  # would "fail" individually and bury the real cause.
  if [[ ! -x "$python" ]]; then
    printf 'error: training interpreter missing or not executable: %s\n' \
      "$python" >&2
    return 1
  fi
  if ! mkdir -p -- "$art" "$rpt" "$logs"; then
    printf 'error: cannot create output directories under %s\n' "$repo" >&2
    return 1
  fi

  local n_done=0 n_skipped=0 n_failed=0
  local entry model mode label ckpt log started elapsed f1

  for entry in "${JOBS[@]}"; do
    IFS=':' read -r model mode label <<<"$entry"
    ckpt=$art/${label}.ckpt.json
    log=$logs/pi_${label}.log

    if [[ -f "$ckpt" ]]; then
      printf '[skip] %s (already trained)\n' "$label"
      n_skipped=$((n_skipped + 1))
      continue
    fi

    printf '[train] %s → %s\n' "$label" "$log"
    started=$SECONDS
    # NOTE(review): `python -m` resolves training.trainer.run against the
    # current directory unless the package is installed in the venv, and
    # this script never cd's into the repo — confirm how it is invoked.
    if "$python" -m training.trainer.run \
      --model "$model" --mode "$mode" \
      --validation "$data/validation_v1.parquet" \
      --summary "$data/features_window_v1.parquet" \
      --tensors "$data/tensor_window_v1" \
      --schema "$data/feature_schema_v1.json" \
      --out-dir "$art" \
      --reports-dir "$rpt" \
      --train-hosts "$train_hosts" \
      >"$log" 2>&1; then
      elapsed=$((SECONDS - started))
      # A log without a score line is not an error; it is shown as "?".
      f1=$(extract_macro_f1 "$log") || f1=""
      printf '[done] %s %ss test_macro_f1=%s\n' "$label" "$elapsed" "${f1:-?}"
      n_done=$((n_done + 1))
    else
      printf '[FAIL] %s — last 10 lines:\n' "$label"
      tail -n 10 -- "$log"
      n_failed=$((n_failed + 1))
    fi
  done

  printf '\nsummary: done=%d skipped=%d failed=%d\n' \
    "$n_done" "$n_skipped" "$n_failed"
  printf 'artifacts:\n'
  # Informational listing only. stderr is silenced on purpose: with no
  # checkpoint present the glob stays literal and ls would complain.
  ls -lh -- "$art"/*.ckpt.json 2>/dev/null || printf ' (none yet)\n'

  # Surface failures to the caller; previously the script always exited 0.
  if ((n_failed > 0)); then
    return 1
  fi
  return 0
}

# main is the last command, so its return value is the script's exit status.
main "$@"