The previous version did `(cd repo && "${cmd[@]}")` to "cd into repo
for module imports." But PYTHONPATH was already set to $PWD/repo at
the top of the script — so the cd was redundant for imports AND
broke relative paths: the trainer expects to find
data/processed/validation_v1.parquet from $HOME/cis490, not from
$HOME/cis490/repo/.
Symptom: every training job failed immediately with
FileNotFoundError: data/processed/validation_v1.parquet
Drop the cd; PYTHONPATH already does the import work.
Found while running on the A100 today; trainer relaunched manually
in-place via a stand-in bootstrap2.sh; this commit makes the next
bundle clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
233 lines
9.1 KiB
Bash
Executable file
233 lines
9.1 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# Runs ON the Lambda instance after the bundle is extracted to ~/cis490.
|
|
# Installs Python deps, iterates the training manifest, runs each job,
|
|
# tars the resulting artifacts so run-on-lambda.sh can rsync them back.
|
|
#
|
|
# Inputs (cwd = ~/cis490):
|
|
# bootstrap.sh ← THIS FILE
|
|
# training_manifest.toml ← canonical job list
|
|
# BUNDLE_MANIFEST.json ← code commit + sanity stamps
|
|
# repo/ ← code snapshot
|
|
# data/processed/ ← pre-built parquet + tensor shards
|
|
#
|
|
# Outputs (cwd = ~/cis490):
|
|
# artifacts/ ← <model>_<mode>.{ckpt.json,pt,xgb.json}
|
|
# reports/eval/ ← per-model train.json + comparison_v2.md
|
|
# logs/<model>_<mode>.log ← per-job training log (full stdout/stderr)
|
|
#
|
|
# Idempotency: each iteration checks for an existing
|
|
# artifacts/<model>_<mode>.ckpt.json before training. Re-running picks
|
|
# up where it left off.
|
|
# Strict mode: abort on command failure, on unset variables, and on a
# failure anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Every relative path below is resolved from the extracted bundle root.
cd -- "${HOME}/cis490"

# Show which bundle is about to run.
printf '%s\n' "=== bundle manifest ==="
cat BUNDLE_MANIFEST.json
printf '\n'

# List the GPUs when the NVIDIA tooling is present; otherwise warn on
# stderr and carry on.
printf '%s\n' "=== gpu inventory ==="
if ! command -v nvidia-smi >/dev/null 2>&1; then
  printf '%s\n' "nvidia-smi not found — running without CUDA?" >&2
else
  nvidia-smi -L
  nvidia-smi --query-gpu=name,memory.total,memory.free,driver_version --format=csv
fi
printf '\n'
|
|
|
|
# ─────────────────────────────────────────────────────────────────────
|
|
# 1. Python venv with training deps
|
|
# ─────────────────────────────────────────────────────────────────────
|
|
|
|
# Create the virtualenv only when it has no usable interpreter yet, so a
# re-run reuses the existing one.
[[ -x .venv/bin/python ]] || {
  echo "=== creating .venv ==="
  python3 -m venv .venv
}
source .venv/bin/activate
python -m pip install -q --upgrade pip

echo "=== installing training deps ==="
# torch is pulled from PyTorch's own wheel index, pinned here to the
# cu121 (CUDA 12.1) build.
torch_index="https://download.pytorch.org/whl/cu121"
pip install -q torch --index-url "$torch_index"

# Remaining third-party packages, then the bundled code snapshot as an
# editable install.
py_deps=(xgboost numpy scipy pyarrow polars scikit-learn matplotlib zstandard)
pip install -q "${py_deps[@]}"
pip install -q -e ./repo

# Sanity check: both libraries import, and report whether torch sees CUDA.
python - <<'PY'
import torch, xgboost
print(f"torch {torch.__version__} cuda? {torch.cuda.is_available()} "
      f"device count={torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f" device 0: {torch.cuda.get_device_name(0)}")
print(f"xgboost {xgboost.__version__}")
PY
|
|
|
|
# ─────────────────────────────────────────────────────────────────────
|
|
# 2. Iterate the manifest, run trainer per job
|
|
# ─────────────────────────────────────────────────────────────────────
|
|
|
|
mkdir -p artifacts reports/eval logs
export PYTHONPATH="$PWD/repo"

#######################################
# Print one line per manifest job, highest priority first:
#   <model>\t<mode>\t<hyper flags>
# where <hyper flags> is the job's hyper table rendered as
# space-separated "--key value" pairs (underscores in keys become dashes).
# Outputs: job lines on stdout.
# Returns: the Python interpreter's exit status.
#######################################
render_jobs() {
  # Quoted delimiter: the Python source is passed through verbatim, with
  # no shell expansion of $, backticks or backslashes.
  python - <<'PY'
from pathlib import Path
import sys
sys.path.insert(0, "repo")
from training.fleet.manifest import load
m = load(Path("training_manifest.toml"))
for j in sorted(m.jobs, key=lambda x: -x.priority):
    # Compose hyper as --key value pairs
    hyper = " ".join(f"--{k.replace('_','-')} {v}" for k, v in j.hyper.items())
    print(f"{j.model}\t{j.mode}\t{hyper}")
PY
}

# Capture the output first so a crash in the renderer is detected here
# instead of being mistaken for an empty (or silently truncated) job list.
if ! jobs_tsv="$(render_jobs)"; then
  echo "failed to render jobs from training_manifest.toml" >&2
  exit 3
fi

JOBS=()
if [[ -n "$jobs_tsv" ]]; then
  mapfile -t JOBS <<<"$jobs_tsv"
fi

if [[ ${#JOBS[@]} -eq 0 ]]; then
  echo "no jobs in manifest!" >&2
  exit 3
fi
|
|
|
|
# Succeeds only when $1 is a decimal integer >= 1.
is_positive_int() {
  [[ "${1-}" =~ ^[1-9][0-9]*$ ]]
}

# Bounded-concurrency rolling launcher. Two trainings run on the same
# A100 simultaneously: at our model sizes (max ~250K params, max batch
# 512) each uses ~1 GiB VRAM, so a 40 GiB A100 is far from contention.
# If you change MAX_PARALLEL, recompute against the actual model sizes
# in training/models/ — don't blindly raise it.
MAX_PARALLEL="${PARALLEL:-2}"

# The launcher waits while (running jobs >= MAX_PARALLEL); a value of 0
# would make it wait forever and a non-number would break the comparison,
# so reject both up front.
if ! is_positive_int "$MAX_PARALLEL"; then
  echo "PARALLEL must be a positive integer (got '${MAX_PARALLEL}')" >&2
  exit 2
fi

echo "=== running ${#JOBS[@]} training jobs (parallelism=${MAX_PARALLEL}) ==="
|
|
|
|
declare -i n_done=0 n_skipped=0 n_failed=0
declare -a FAILED=()
# Map pid → job label / launch timestamp so we can report per-job timing.
declare -A PID_TO_LABEL=()
declare -A PID_TO_STARTED=()
# PIDs of trainers that have been launched and not yet reaped.
declare -a PIDS=()

#######################################
# Reap every finished child among the given pids.
#
# Call this directly, never inside $(...) or a pipeline: a subshell cannot
# `wait` on this shell's children (every job would report rc=127), its
# counter updates would be discarded, and its status output would be
# captured instead of shown.
#
# Globals:
#   PIDS            rewritten to the pids that are still running
#   PID_TO_LABEL    entries for reaped pids are removed
#   PID_TO_STARTED  entries for reaped pids are removed
#   n_done          incremented per job that exited 0
#   n_failed        incremented per job that exited non-zero
#   FAILED          labels of failed jobs are appended
# Arguments: the pids to check.
# Outputs:   one status line per finished job on stdout; for a failed job,
#            also the last 20 lines of its log.
#######################################
reap_finished() {
  local -a still_running=()
  local p label started rc elapsed
  for p in "$@"; do
    if kill -0 "$p" 2>/dev/null; then
      still_running+=("$p")
      continue
    fi
    label="${PID_TO_LABEL[$p]:-?}"
    started="${PID_TO_STARTED[$p]:-0}"
    # Collect the exit status without letting a non-zero one trip `set -e`.
    rc=0
    wait "$p" 2>/dev/null || rc=$?
    elapsed=$(( $(date +%s) - started ))
    if [[ $rc -eq 0 ]]; then
      echo " ✓ $label done in ${elapsed}s"
      n_done+=1
    else
      echo " ✗ $label FAILED (rc=$rc, ${elapsed}s) — last 20 lines:"
      # Best effort: a missing log must not abort the whole run.
      tail -n 20 "logs/${label}.log" || true
      FAILED+=("$label")
      n_failed+=1
    fi
    unset "PID_TO_LABEL[$p]" "PID_TO_STARTED[$p]"
  done
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u`
  # on older bash releases.
  PIDS=(${still_running[@]+"${still_running[@]}"})
}

for entry in "${JOBS[@]}"; do
  # Each entry is "<model>\t<mode>\t<hyper flags>".
  IFS=$'\t' read -r model mode hyper <<<"$entry"
  job_label="${model}_${mode}"
  ckpt="artifacts/${job_label}.ckpt.json"
  log="logs/${job_label}.log"

  # Idempotency: an existing checkpoint stamp means this job already ran.
  if [[ -f "$ckpt" ]]; then
    echo " skip $job_label (already present)"
    n_skipped+=1
    continue
  fi

  # Throttle: wait until there's room for one more.
  while [[ ${#PIDS[@]} -ge $MAX_PARALLEL ]]; do
    sleep 1
    reap_finished "${PIDS[@]}"
  done

  echo "── launch $job_label ────────────────────────────────────"
  started=$(date +%s)

  if [[ "$model" == "transformer_ssl" ]]; then
    cmd=(
      python -m training.trainer.run_ssl
      --mode "$mode"
      --validation data/processed/validation_v1.parquet
      --tensors data/processed/tensor_window_v1
      --out-dir artifacts
      --reports-dir reports/eval
    )
  else
    cmd=(
      python -m training.trainer.run
      --model "$model" --mode "$mode"
      --validation data/processed/validation_v1.parquet
      --summary data/processed/features_window_v1.parquet
      --tensors data/processed/tensor_window_v1
      --schema data/processed/feature_schema_v1.json
      --out-dir artifacts
      --reports-dir reports/eval
      --train-hosts elliott-thinkpad
    )
  fi
  if [[ -n "$hyper" ]]; then
    # Split the flag string on whitespace without pathname expansion, so
    # a value such as `*` is passed through literally.
    read -ra extra_args <<<"$hyper"
    cmd+=("${extra_args[@]}")
  fi

  # Launch in background. PYTHONPATH is set to $PWD/repo at the top
  # of this script so we DO NOT cd into repo/ — relative paths to
  # data/processed/* must resolve from $HOME/cis490, not from repo/.
  "${cmd[@]}" >"$log" 2>&1 &
  pid=$!
  PIDS+=("$pid")
  PID_TO_LABEL[$pid]="$job_label"
  PID_TO_STARTED[$pid]="$started"
done

# Drain remaining
while [[ ${#PIDS[@]} -gt 0 ]]; do
  sleep 1
  reap_finished "${PIDS[@]}"
done
|
|
|
|
# Per-run tally: how many jobs finished, were skipped, or failed.
printf '\n'
printf '%s\n' "=== training done ==="
printf ' done: %s\n' "$n_done"
printf ' skipped: %s\n' "$n_skipped"
printf ' failed: %s\n' "$n_failed"
# Name the failed jobs only when there is at least one.
if (( n_failed > 0 )); then
  printf ' failed jobs: %s\n' "${FAILED[*]}"
fi
|
|
|
|
# ─────────────────────────────────────────────────────────────────────
|
|
# 3. Eval suite (writes reports/eval/comparison_v2.md + per-model JSON)
|
|
# ─────────────────────────────────────────────────────────────────────
|
|
|
|
echo
echo "=== eval suite ==="
# The eval runs with repo/ as its working directory, so every bundle path
# must be given relative to repo/ — i.e. prefixed with ../ — including
# --validation. Output is copied to logs/eval.log, the file the error
# message below points at. A failing eval is reported but does not abort
# the script, so the run summary is still written.
(
  # Make the pipeline's status reflect the eval, not just tee.
  set -o pipefail
  cd repo && python -m training.eval_.run \
    --validation ../data/processed/validation_v1.parquet \
    --artifacts ../artifacts \
    --summary ../data/processed/features_window_v1.parquet \
    --tensors ../data/processed/tensor_window_v1 \
    --reports-dir ../reports/eval 2>&1 | tee ../logs/eval.log
) || echo "eval reported errors — see logs/eval.log"
|
|
|
|
# ─────────────────────────────────────────────────────────────────────
|
|
# 4. Stamp + summarize
|
|
# ─────────────────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Render the arguments as the comma-separated elements of a JSON string
# array (without the surrounding brackets). Backslashes and double quotes
# inside an element are escaped; no arguments produce empty output.
# Arguments: the strings to render.
# Outputs:   e.g. "a","b" on stdout, with no trailing newline.
#######################################
json_string_array() {
  local item sep=""
  local bs='\' dq='"'
  for item in "$@"; do
    # Escape backslashes first so the ones added for quotes stay single.
    item=${item//"$bs"/$bs$bs}
    item=${item//"$dq"/$bs$dq}
    printf '%s"%s"' "$sep" "$item"
    sep=","
  done
}

# Machine-readable stamp of this run: completion time (UTC), host name,
# job counters and the labels of failed jobs.
cat > artifacts/RUN_SUMMARY.json <<EOF
{
"started_via": "lambda-bootstrap.sh",
"completed_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"host_id": "$(hostname)",
"n_done": $n_done,
"n_skipped": $n_skipped,
"n_failed": $n_failed,
"failed_jobs": [$(json_string_array ${FAILED[@]+"${FAILED[@]}"})]
}
EOF

echo
echo "✓ bootstrap.sh complete. artifacts/ + reports/eval/ ready for rsync back."
|