Pi has 4 cores; only KNN and tree-based models are realistic to train here without GPU. While Lambda runs the full 16-job manifest in parallel (~1.7h), this chain trains the CPU-friendly subset on the Pi (~30 min) so scenes 8 & 12 populate with multi-model numbers within minutes instead of waiting on Lambda's full cycle. Order: gbt-realistic, knn-realistic, knn-oracle, knn_semi-realistic, knn_semi-oracle. Skips models whose .ckpt.json already exists (idempotent restart). Each is a subprocess of training/trainer/run.py so XGBoost/numpy/sklearn don't fight each other for cores. Caller is expected to start gbt-oracle separately (it's the longest single training and we kicked it off before invoking this script). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
76 lines
2.4 KiB
Bash
Executable file
76 lines
2.4 KiB
Bash
Executable file
#!/usr/bin/env bash
# Sequentially train the CPU-friendly subset of the manifest on the Pi.
# Lambda is training the full set in parallel (~1.7h); this fills the
# dashboard with multi-model numbers in ~30 min while we wait.
#
# Order: by job priority (GBT first, then KNN, then KNN-semi). Each
# model is a subprocess so XGBoost/numpy/sklearn don't fight each
# other for cores.
#
# NOTE(review): the original header states that after each model
# finishes "we publish its model_metric + model_perf (sticky tick) so
# scenes 8 & 12 light up incrementally". No publish step is visible in
# this script; presumably the trainer does it — confirm.
#
# Skips any model whose checkpoint already exists (same idempotency
# rule the lambda bootstrap uses).
#
# Environment:
#   CIS490_REPO  optional repository root
#                (default: /home/max/.env/CIS490)

# -u: expanding an unset variable is an error.
# pipefail: a pipeline fails if any stage fails.
# -e is not enabled, so a failing command does not abort the script.
set -uo pipefail

# Repository root; every other path below is derived from it.
REPO="${CIS490_REPO:-/home/max/.env/CIS490}"
VENV="$REPO/.venv-training/bin/python"   # interpreter used to run the trainer
DATA="$REPO/data/processed"              # trainer input files
ART="$REPO/artifacts"                    # checkpoints (*.ckpt.json) are looked up here
RPT="$REPO/reports/eval"                 # passed to the trainer as --reports-dir
LOGS="$REPO/data/logs"                   # one pi_<label>.log per job

# Output directories must exist before the first job redirects its log.
mkdir -p "$ART" "$RPT" "$LOGS"

# (model, mode, label) tuples — the CPU-friendly slice.
# Each entry is "model:mode:label":
#   model, mode -> passed to the trainer as --model / --mode
#   label       -> names the checkpoint (<label>.ckpt.json) and the
#                  log file (pi_<label>.log)
# Entries are processed in the order listed.
JOBS=(
  "gbt:realistic:gbt_realistic"
  "knn:realistic:knn_realistic"
  "knn:oracle:knn_oracle"
  "knn_semi:realistic:knn_semi_realistic"
  "knn_semi:oracle:knn_semi_oracle"
)
# gbt:oracle is presumed already running or done — caller starts it
# explicitly before invoking this script.

# Per-run tallies, updated by process_job.
n_done=0
n_skipped=0
n_failed=0

#######################################
# Print the value from the last "TEST macro_f1 = <number>" line of a log.
# Arguments: $1 - path to the trainer log
# Outputs:   the number on stdout; nothing if no such line exists
#######################################
extract_test_macro_f1() {
  grep -oE "TEST macro_f1 = [0-9.]+" "$1" | tail -n 1 | awk '{print $NF}'
}

#######################################
# Run the trainer once, as a child process.
# Globals:   VENV, DATA, ART, RPT (read)
# Arguments: $1 - model, $2 - mode, $3 - log file (receives stdout+stderr)
# Returns:   the trainer's exit status (non-zero too if the log
#            cannot be opened)
# NOTE(review): `-m training.trainer.run` is resolved by Python relative
# to the caller's working directory unless the package is installed in
# the venv — confirm this script is always launched from the repo root.
#######################################
run_trainer() {
  local model=$1 mode=$2 log=$3
  # Array instead of backslash continuations: each argument stays one word.
  local -a args=(
    -m training.trainer.run
    --model "$model" --mode "$mode"
    --validation "$DATA/validation_v1.parquet"
    --summary "$DATA/features_window_v1.parquet"
    --tensors "$DATA/tensor_window_v1"
    --schema "$DATA/feature_schema_v1.json"
    --out-dir "$ART"
    --reports-dir "$RPT"
    --train-hosts elliott-thinkpad
  )
  "$VENV" "${args[@]}" > "$log" 2>&1
}

#######################################
# Handle one job entry: skip it if its checkpoint exists, otherwise
# train it and report the outcome.
# Globals:   ART, LOGS (read); n_done, n_skipped, n_failed (written)
# Arguments: $1 - "model:mode:label"
# Outputs:   progress lines on stdout; on failure, the log's last 10 lines
#######################################
process_job() {
  local entry=$1
  local model mode label ckpt log started elapsed f1

  IFS=':' read -r model mode label <<<"$entry"
  ckpt="$ART/${label}.ckpt.json"
  log="$LOGS/pi_${label}.log"

  # An existing checkpoint file means this job is not re-run.
  if [[ -f "$ckpt" ]]; then
    echo "[skip] $label (already trained)"
    n_skipped=$((n_skipped + 1))
    return 0
  fi

  echo "[train] $label → $log"
  started=$(date +%s)
  if run_trainer "$model" "$mode" "$log"; then
    elapsed=$(( $(date +%s) - started ))
    f1=$(extract_test_macro_f1 "$log")
    # "?" is shown when the log contains no TEST macro_f1 line.
    echo "[done] $label ${elapsed}s test_macro_f1=${f1:-?}"
    n_done=$((n_done + 1))
  else
    echo "[FAIL] $label — last 10 lines:"
    tail -n 10 "$log"
    n_failed=$((n_failed + 1))
  fi
}

# Jobs run strictly one after another; a failure is counted and the
# chain moves on to the next entry.
for entry in "${JOBS[@]}"; do
  process_job "$entry"
done

#######################################
# Print the run tallies and list the checkpoint files present.
# Globals:   n_done, n_skipped, n_failed, ART (read)
# Outputs:   summary on stdout
# Returns:   0 when no job failed, non-zero otherwise
#######################################
print_summary() {
  echo
  echo "summary: done=$n_done skipped=$n_skipped failed=$n_failed"
  echo "artifacts:"
  # ls fails when the glob matches nothing; show a placeholder instead.
  ls -lh "$ART"/*.ckpt.json 2>/dev/null || echo " (none yet)"
  (( n_failed == 0 ))
}

# Surface failures to the caller: previously the script always exited 0
# because its last command was the `ls … || echo` listing.
print_summary || exit 1