CIS490/scripts/train-pi-cpu-models.sh

#!/usr/bin/env bash
# Sequentially train the CPU-friendly subset of the manifest on the Pi.
# Lambda is training the full set in parallel (~1.7h); this fills the
# dashboard with multi-model numbers in ~30 min while we wait.
#
# Order: by job priority (GBT first, then KNN, then KNN-semi). Each
# model is a subprocess so XGBoost/numpy/sklearn don't fight each
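# other for cores. After each finishes, its model_metric + model_perf
# (sticky tick) should be published so scenes 8 & 12 light up
# incrementally. NOTE: nothing in this script performs that publish
# itself — presumably the trainer does; confirm.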
#
# Skips any model whose checkpoint already exists (same idempotency
# rule the lambda bootstrap uses).
# Shell options: -u turns a reference to an unset variable into an
# error; pipefail makes a pipeline fail when any stage fails. -e is not
# enabled here, so a failing command does not abort the script.
set -u
set -o pipefail

# Fixed locations, all rooted at the repo checkout.
REPO="/home/max/.env/CIS490"
VENV="${REPO}/.venv-training/bin/python"  # interpreter that runs the trainer
DATA="${REPO}/data/processed"             # trainer inputs
ART="${REPO}/artifacts"                   # trainer --out-dir; checkpoints are looked up here
RPT="${REPO}/reports/eval"                # trainer --reports-dir
LOGS="${REPO}/data/logs"                  # one log file per job

# Make sure every output directory exists (no-op when already present).
mkdir -p -- "${ART}" "${RPT}" "${LOGS}"

# Work list: the CPU-friendly slice of the manifest. Every element is a
# single "model:mode:label" string; array order is the order the jobs
# are run in.
JOBS=()
JOBS+=("gbt:realistic:gbt_realistic")
JOBS+=("knn:realistic:knn_realistic" "knn:oracle:knn_oracle")
JOBS+=("knn_semi:realistic:knn_semi_realistic" "knn_semi:oracle:knn_semi_oracle")
# gbt:oracle is intentionally not listed: it is presumed to be already
# running or finished, because the caller starts it explicitly before
# invoking this script.

# Outcome tallies, reported in the final summary.
n_done=0 n_skipped=0 n_failed=0

# Print the number from the LAST "TEST macro_f1 = <number>" match in a
# trainer log. Prints nothing when the log contains no such text.
# Arguments: $1 - path to the log file
extract_macro_f1() {
    grep -oE 'TEST macro_f1 = [0-9.]+' -- "$1" | tail -n 1 | awk '{print $NF}'
}

# Run the trainer once, sending its stdout and stderr to a log file
# (the log is truncated first).
# Globals:   VENV, DATA, ART, RPT (read)
# Arguments: $1 - model name, $2 - mode, $3 - log file path
# Returns:   the trainer's exit status
# NOTE(review): there is no `cd "$REPO"` before this; it assumes the
# `training` package is importable from the caller's working directory
# or is installed in the venv — confirm.
run_trainer() {
    local model=$1 mode=$2 log=$3
    "$VENV" -m training.trainer.run \
        --model "$model" --mode "$mode" \
        --validation "$DATA/validation_v1.parquet" \
        --summary "$DATA/features_window_v1.parquet" \
        --tensors "$DATA/tensor_window_v1" \
        --schema "$DATA/feature_schema_v1.json" \
        --out-dir "$ART" \
        --reports-dir "$RPT" \
        --train-hosts elliott-thinkpad \
        > "$log" 2>&1
}

# Walk the job list in order, one trainer process at a time. A failed
# job is counted and reported, and the loop moves on to the next one.
# NOTE(review): the header says metrics get published after each model;
# no publish step is visible in this loop — presumably the trainer does
# it; confirm.
for entry in "${JOBS[@]}"; do
    # Split "model:mode:label" into its three fields.
    IFS=':' read -r model mode label <<<"$entry"
    ckpt="$ART/${label}.ckpt.json"
    log="$LOGS/pi_${label}.log"

    # Idempotency: an existing checkpoint file means the job is skipped.
    # NOTE(review): assumes the trainer only leaves this file behind on a
    # successful run — a partial file would also cause a skip; confirm.
    if [[ -f "$ckpt" ]]; then
        echo "[skip] $label (already trained)"
        n_skipped=$((n_skipped + 1))
        continue
    fi

    echo "[train] $label  → $log"
    started=$(date +%s)
    if run_trainer "$model" "$mode" "$log"; then
        elapsed=$(( $(date +%s) - started ))
        f1=$(extract_macro_f1 "$log")
        # "?" is shown when no metric line was found in the log.
        echo "[done] $label  ${elapsed}s  test_macro_f1=${f1:-?}"
        n_done=$((n_done + 1))
    else
        echo "[FAIL] $label — last 10 lines:"
        # `-n 10` is the standard spelling; `-10` is the obsolescent form.
        tail -n 10 -- "$log"
        n_failed=$((n_failed + 1))
    fi
done

# Print the final tally and list the checkpoint files under $ART.
# Globals: n_done, n_skipped, n_failed, ART (read)
# Outputs: summary text on stdout
# Returns: 0 when no job failed, non-zero otherwise
print_summary() {
    echo
    echo "summary: done=$n_done  skipped=$n_skipped  failed=$n_failed"
    echo "artifacts:"
    # ls errors are silenced on purpose: an unmatched glob just means
    # there are no checkpoints to list yet.
    ls -lh "$ART"/*.ckpt.json 2>/dev/null || echo "  (none yet)"
    (( n_failed == 0 ))
}

# Propagate failures to the caller: previously the script always exited
# 0, because its last command was the `ls ... || echo` fallback.
print_summary || exit 1