#!/usr/bin/env bash
# Runs ON the Lambda instance after the bundle is extracted to ~/cis490.
# Installs Python deps, iterates the training manifest, runs each job,
# tars the resulting artifacts so run-on-lambda.sh can rsync them back.
#
# Inputs (cwd = ~/cis490):
#   bootstrap.sh             ← THIS FILE
#   training_manifest.toml   ← canonical job list
#   BUNDLE_MANIFEST.json     ← code commit + sanity stamps
#   repo/                    ← code snapshot
#   data/processed/          ← pre-built parquet + tensor shards
#
# Outputs (cwd = ~/cis490):
#   artifacts/               ← <model>_<mode>.{ckpt.json,pt,xgb.json}
#   reports/eval/            ← per-model train.json + comparison_v2.md
#   logs/<model>_<mode>.log  ← per-job training log (full stdout/stderr)
#   logs/eval.log            ← eval suite output (stdout + stderr)
#
# Optional environment:
#   TRAIN_HOSTS              ← value passed to --train-hosts
#                              (default: elliott-thinkpad)
#
# Idempotency: each iteration checks for an existing
# artifacts/<model>_<mode>.ckpt.json before training. Re-running picks
# up where it left off.

set -euo pipefail

cd "$HOME/cis490"

# Trainer and eval run with cwd=repo/, so every path handed to them must be
# absolute; cwd-relative paths would resolve inside repo/ instead of the
# bundle root where data/ and artifacts/ actually live.
readonly ROOT="$PWD"
readonly DATA_DIR="$ROOT/data/processed"
readonly ARTIFACTS_DIR="$ROOT/artifacts"
readonly REPORTS_DIR="$ROOT/reports/eval"
readonly LOGS_DIR="$ROOT/logs"
readonly TRAIN_HOSTS="${TRAIN_HOSTS:-elliott-thinkpad}"

echo "=== bundle manifest ==="
cat BUNDLE_MANIFEST.json
echo

echo "=== gpu inventory ==="
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi -L
  nvidia-smi --query-gpu=name,memory.total,memory.free,driver_version --format=csv
else
  echo "nvidia-smi not found — running without CUDA?" >&2
fi
echo

# ─────────────────────────────────────────────────────────────────────
# 1. Python venv with training deps
# ─────────────────────────────────────────────────────────────────────
if [[ ! -x .venv/bin/python ]]; then
  echo "=== creating .venv ==="
  python3 -m venv .venv
fi
# shellcheck disable=SC1091 — the venv is created at runtime
. .venv/bin/activate
python -m pip install -q --upgrade pip

echo "=== installing training deps ==="
# CUDA-enabled torch from PyTorch's index. Lambda's A100 supports cu121/cu124;
# default to whichever is the latest stable matching the host driver.
python -m pip install -q torch --index-url https://download.pytorch.org/whl/cu121
# tomli is only pulled in on Python < 3.11, where the stdlib tomllib used by
# the manifest renderer below does not exist.
python -m pip install -q xgboost numpy scipy pyarrow polars scikit-learn \
  matplotlib zstandard 'tomli; python_version < "3.11"'
python -m pip install -q -e ./repo

python - <<'PY'
import torch, xgboost
print(f"torch {torch.__version__} cuda? {torch.cuda.is_available()} "
      f"device count={torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"  device 0: {torch.cuda.get_device_name(0)}")
print(f"xgboost {xgboost.__version__}")
PY

# ─────────────────────────────────────────────────────────────────────
# 2. Iterate the manifest, run trainer per job
# ─────────────────────────────────────────────────────────────────────
mkdir -p "$ARTIFACTS_DIR" "$REPORTS_DIR" "$LOGS_DIR"
export PYTHONPATH="$ROOT/repo"

# Render manifest jobs to a list of tab-separated `<model> <mode> <hyper>`
# lines (one per job).
#
# NOTE(review): the body of this heredoc was lost in the copy under review.
# This reconstruction ASSUMES training_manifest.toml holds a `[[jobs]]` array
# of tables with `model`, `mode` and an optional `hyper` (string or list of
# CLI arguments). Confirm against the real manifest / version control before
# relying on it.
mapfile -t JOBS < <(
  python - "$ROOT/training_manifest.toml" <<'PY'
import sys

try:
    import tomllib
except ModuleNotFoundError:  # Python < 3.11
    try:
        import tomli as tomllib
    except ModuleNotFoundError:
        sys.exit("need Python >= 3.11 (tomllib) or the tomli package "
                 "to read the manifest")

with open(sys.argv[1], "rb") as fh:
    manifest = tomllib.load(fh)

for job in manifest.get("jobs", []):
    hyper = job.get("hyper", "")
    if isinstance(hyper, (list, tuple)):
        hyper = " ".join(str(item) for item in hyper)
    elif not isinstance(hyper, str):
        sys.exit(f"unsupported 'hyper' value in manifest job: {job!r}")
    print(f"{job['model']}\t{job['mode']}\t{hyper}")
PY
)

# A failure inside the process substitution does not trip `set -e`; an empty
# job list is the signal that rendering went wrong.
# NOTE(review): the original message text was lost; only `>&2; exit 3` is
# known from the reviewed copy.
if [[ ${#JOBS[@]} -eq 0 ]]; then
  echo "no jobs rendered from training_manifest.toml" >&2
  exit 3
fi

echo "=== running ${#JOBS[@]} training jobs ==="

declare -i n_done=0 n_skipped=0 n_failed=0
declare -a FAILED=()

for entry in "${JOBS[@]}"; do
  IFS=$'\t' read -r model mode hyper <<<"$entry"
  job_label="${model}_${mode}"
  ckpt="$ARTIFACTS_DIR/${job_label}.ckpt.json"
  log="$LOGS_DIR/${job_label}.log"

  # Idempotency: a checkpoint stamp means this job already finished.
  if [[ -f "$ckpt" ]]; then
    echo "  skip $job_label (already present)"
    n_skipped+=1
    continue
  fi

  echo
  echo "── $job_label ────────────────────────────────────"
  started=$(date +%s)

  if [[ "$model" == "transformer_ssl" ]]; then
    cmd=(
      python -m training.trainer.run_ssl
      --mode "$mode"
      --validation "$DATA_DIR/validation_v1.parquet"
      --tensors "$DATA_DIR/tensor_window_v1"
      --out-dir "$ARTIFACTS_DIR"
      --reports-dir "$REPORTS_DIR"
    )
  else
    cmd=(
      python -m training.trainer.run
      --model "$model"
      --mode "$mode"
      --validation "$DATA_DIR/validation_v1.parquet"
      --summary "$DATA_DIR/features_window_v1.parquet"
      --tensors "$DATA_DIR/tensor_window_v1"
      --schema "$DATA_DIR/feature_schema_v1.json"
      --out-dir "$ARTIFACTS_DIR"
      --reports-dir "$REPORTS_DIR"
      --train-hosts "$TRAIN_HOSTS"
    )
  fi

  # Tack on hyperparameters from the manifest. `read -a` splits on whitespace
  # like the unquoted expansion did, but without expanding glob characters.
  if [[ -n "$hyper" ]]; then
    extra_args=()
    read -r -a extra_args <<<"$hyper"
    cmd+=("${extra_args[@]}")
  fi

  # One failed job must not abort the remaining ones, so the status is
  # consumed by `if` rather than left to `set -e`.
  if (cd repo && "${cmd[@]}") >"$log" 2>&1; then
    elapsed=$(($(date +%s) - started))
    echo "  ✓ $job_label done in ${elapsed}s"
    n_done+=1
  else
    rc=$?
    elapsed=$(($(date +%s) - started))
    echo "  ✗ $job_label FAILED (rc=$rc, ${elapsed}s) — last 20 lines of log:"
    tail -20 "$log"
    FAILED+=("$job_label")
    n_failed+=1
  fi
done

echo
echo "=== training done ==="
echo "  done:    $n_done"
echo "  skipped: $n_skipped"
echo "  failed:  $n_failed"
if [[ $n_failed -gt 0 ]]; then
  echo "  failed jobs: ${FAILED[*]}"
fi

# ─────────────────────────────────────────────────────────────────────
# 3. Eval suite (writes reports/eval/comparison_v2.md + per-model JSON)
# ─────────────────────────────────────────────────────────────────────
echo
echo "=== eval suite ==="
# Best-effort: an eval failure is reported but does not abort the script.
# Output is tee'd so the log named in the failure message actually exists;
# `pipefail` makes the pipeline status reflect the eval run, not tee.
if ! (cd repo && python -m training.eval_.run \
  --validation "$DATA_DIR/validation_v1.parquet" \
  --artifacts "$ARTIFACTS_DIR" \
  --summary "$DATA_DIR/features_window_v1.parquet" \
  --tensors "$DATA_DIR/tensor_window_v1" \
  --reports-dir "$REPORTS_DIR") 2>&1 | tee "$LOGS_DIR/eval.log"; then
  echo "eval reported errors — see logs/eval.log"
fi

# ─────────────────────────────────────────────────────────────────────
# 4. Stamp + summarize
# ─────────────────────────────────────────────────────────────────────
# NOTE(review): the reviewed copy is cut off right after
# `cat > artifacts/RUN_SUMMARY.json <`. The JSON fields below are a
# reconstruction from the counters this script maintains, and whatever
# followed (e.g. the artifact tarball step mentioned in the header) is NOT
# reproduced here. Restore the original tail from version control.
failed_json="[]"
if [[ $n_failed -gt 0 ]]; then
  printf -v failed_json '"%s",' "${FAILED[@]}"
  failed_json="[${failed_json%,}]"
fi

cat >"$ARTIFACTS_DIR/RUN_SUMMARY.json" <<EOF
{
  "finished_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
  "n_done": $n_done,
  "n_skipped": $n_skipped,
  "n_failed": $n_failed,
  "failed_jobs": $failed_json
}
EOF