scripts/train-pi-cpu-models.sh — sequential Pi-side trainer chain
The Pi has 4 cores; only KNN and tree-based models are realistic to train here without a GPU. While Lambda runs the full 16-job manifest in parallel (~1.7h), this chain trains the CPU-friendly subset on the Pi (~30 min) so scenes 8 & 12 populate with multi-model numbers within minutes instead of waiting on Lambda's full cycle. Order: gbt-realistic, knn-realistic, knn-oracle, knn_semi-realistic, knn_semi-oracle. Skips models whose .ckpt.json already exists (idempotent restart). Each model runs as its own subprocess of training/trainer/run.py so XGBoost/numpy/sklearn don't fight each other for cores. The caller is expected to start gbt-oracle separately (it's the longest single training run, and we kicked it off before invoking this script). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
05bccac29f
commit
c1c8e98180
1 changed files with 76 additions and 0 deletions
76
scripts/train-pi-cpu-models.sh
Executable file
76
scripts/train-pi-cpu-models.sh
Executable file
|
|
@ -0,0 +1,76 @@
|
|||
#!/usr/bin/env bash
# Sequentially train the CPU-friendly subset of the manifest on the Pi.
# Lambda is training the full set in parallel (~1.7h); this fills the
# dashboard with multi-model numbers in ~30 min while we wait.
#
# Order: by job priority (GBT first, then KNN, then KNN-semi). Each
# model is a subprocess so XGBoost/numpy/sklearn don't fight each
# other for cores.
#
# NOTE(review): this script does not publish model_metric / model_perf
# itself; if scenes 8 & 12 are expected to light up incrementally, that
# publishing has to happen inside training.trainer.run — confirm.
#
# Skips any model whose checkpoint already exists (same idempotency
# rule the lambda bootstrap uses).
#
# Environment overrides (defaults reproduce the original hard-coded values):
#   CIS490_REPO         repo root            (default /home/max/.env/CIS490)
#   CIS490_TRAIN_HOSTS  --train-hosts value  (default elliott-thinkpad)
#
# Exit status: 0 when no job failed, 1 when setup is broken or at least
# one training job failed.

# Deliberately no `-e`: one failed model must not abort the rest of the chain.
set -uo pipefail

readonly REPO=${CIS490_REPO:-/home/max/.env/CIS490}
readonly VENV=$REPO/.venv-training/bin/python
readonly DATA=$REPO/data/processed
readonly ART=$REPO/artifacts
readonly RPT=$REPO/reports/eval
readonly LOGS=$REPO/data/logs
readonly TRAIN_HOSTS=${CIS490_TRAIN_HOSTS:-elliott-thinkpad}

# Fail fast on a broken setup instead of reporting every job as [FAIL].
if [[ ! -x "$VENV" ]]; then
  echo "error: training interpreter not found or not executable: $VENV" >&2
  exit 1
fi

mkdir -p "$ART" "$RPT" "$LOGS" || exit 1

# `python -m training.trainer.run` resolves the package relative to the
# current directory, so run from the repo root regardless of where the
# script was launched. All other paths used below are absolute.
cd "$REPO" || exit 1

# (model, mode, label) tuples — the CPU-friendly slice
readonly -a JOBS=(
  "gbt:realistic:gbt_realistic"
  "knn:realistic:knn_realistic"
  "knn:oracle:knn_oracle"
  "knn_semi:realistic:knn_semi_realistic"
  "knn_semi:oracle:knn_semi_oracle"
)
# gbt:oracle is presumed already running or done — caller starts it
# explicitly before invoking this script.

n_done=0
n_skipped=0
n_failed=0

for entry in "${JOBS[@]}"; do
  IFS=':' read -r model mode label <<<"$entry"
  ckpt="$ART/${label}.ckpt.json"
  log="$LOGS/pi_${label}.log"

  # Idempotent restart: an existing checkpoint means this job is done.
  if [[ -f "$ckpt" ]]; then
    echo "[skip] $label (already trained)"
    n_skipped=$((n_skipped + 1))
    continue
  fi

  echo "[train] $label → $log"
  started=$(date +%s)
  if "$VENV" -m training.trainer.run \
    --model "$model" --mode "$mode" \
    --validation "$DATA/validation_v1.parquet" \
    --summary "$DATA/features_window_v1.parquet" \
    --tensors "$DATA/tensor_window_v1" \
    --schema "$DATA/feature_schema_v1.json" \
    --out-dir "$ART" \
    --reports-dir "$RPT" \
    --train-hosts "$TRAIN_HOSTS" \
    >"$log" 2>&1; then
    elapsed=$(($(date +%s) - started))
    # Last "TEST macro_f1 = <num>" line in the trainer log, if any; a
    # missing line is reported as "?" rather than treated as a failure.
    f1=$(grep -oE "TEST macro_f1 = [0-9.]+" "$log" | tail -n 1 | awk '{print $NF}')
    echo "[done] $label ${elapsed}s test_macro_f1=${f1:-?}"
    n_done=$((n_done + 1))
  else
    echo "[FAIL] $label — last 10 lines:"
    tail -n 10 "$log"
    n_failed=$((n_failed + 1))
  fi
done

echo
echo "summary: done=$n_done skipped=$n_skipped failed=$n_failed"
echo "artifacts:"
# stderr is silenced on purpose: an unmatched glob just means no checkpoints.
ls -lh "$ART"/*.ckpt.json 2>/dev/null || echo " (none yet)"

# Surface failures to the caller; previously the script always exited 0.
if ((n_failed > 0)); then
  exit 1
fi
|
||||
Loading…
Add table
Reference in a new issue