CIS490/scripts/lambda-bootstrap.sh

#!/usr/bin/env bash
# Runs ON the Lambda instance after the bundle is extracted to ~/cis490.
# Installs Python deps, iterates the training manifest, runs each job,
# then runs the eval suite and writes artifacts/RUN_SUMMARY.json so
# run-on-lambda.sh can rsync the results back.
#
# Inputs (cwd = ~/cis490):
#   bootstrap.sh                 ← THIS FILE
#   training_manifest.toml       ← canonical job list
#   BUNDLE_MANIFEST.json         ← code commit + sanity stamps
#   repo/                        ← code snapshot
#   data/processed/              ← pre-built parquet + tensor shards
#
# Outputs (cwd = ~/cis490):
#   artifacts/                   ← <model>_<mode>.{ckpt.json,pt,xgb.json}
#   reports/eval/                ← per-model train.json + comparison_v2.md
#   logs/<model>_<mode>.log      ← per-job training log (full stdout/stderr)
#
# Idempotency: each iteration checks for an existing
# artifacts/<model>_<mode>.ckpt.json before training. Re-running picks
# up where it left off.
# Strict mode: stop on any failing command, on use of an unset variable,
# and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Every relative path below is resolved against the extracted bundle.
cd "${HOME}/cis490"

# Echo the bundle stamp first so the run log records what is being trained.
printf '%s\n' "=== bundle manifest ==="
cat BUNDLE_MANIFEST.json
printf '\n'

# Report the visible GPUs; a missing nvidia-smi is only a warning.
printf '%s\n' "=== gpu inventory ==="
if ! command -v nvidia-smi >/dev/null 2>&1; then
    printf '%s\n' "nvidia-smi not found — running without CUDA?" >&2
else
    nvidia-smi -L
    nvidia-smi --query-gpu=name,memory.total,memory.free,driver_version --format=csv
fi
printf '\n'

# ─────────────────────────────────────────────────────────────────────
# 1. Python venv with training deps
# ─────────────────────────────────────────────────────────────────────

# Reuse an existing venv so a re-run goes straight to dependency install.
if [[ ! -x .venv/bin/python ]]; then
    echo "=== creating .venv ==="
    python3 -m venv .venv
fi
. .venv/bin/activate
python -m pip install -q --upgrade pip
echo "=== installing training deps ==="
# CUDA-enabled torch comes from PyTorch's own wheel index. Nothing here probes
# the host driver: the cu121 index is the pinned default. Export
# TORCH_INDEX_URL (e.g. https://download.pytorch.org/whl/cu124) before running
# when the instance needs a different CUDA build.
torch_index_url="${TORCH_INDEX_URL:-https://download.pytorch.org/whl/cu121}"
# `python -m pip` guarantees the venv interpreter's pip is the one used.
python -m pip install -q torch --index-url "$torch_index_url"
python -m pip install -q xgboost numpy scipy pyarrow polars scikit-learn matplotlib zstandard
python -m pip install -q -e ./repo

# Sanity check: both libraries import, and report whether CUDA is usable.
# A CPU-only result is printed but does not abort the run.
python - <<'PY'
import torch, xgboost
print(f"torch {torch.__version__}  cuda? {torch.cuda.is_available()}  "
       f"device count={torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"  device 0: {torch.cuda.get_device_name(0)}")
print(f"xgboost {xgboost.__version__}")
PY

# ─────────────────────────────────────────────────────────────────────
# 2. Iterate the manifest, run trainer per job
# ─────────────────────────────────────────────────────────────────────

mkdir -p artifacts reports/eval logs
export PYTHONPATH="$PWD/repo"

# Render manifest jobs as `<model>\t<mode>\t<hyper flags>` lines, one per job,
# highest priority first.
#
# The heredoc delimiter is quoted so the shell never expands `$` or backticks
# inside the Python source. The output is captured with $(...) rather than
# process substitution so that a Python failure (import error, malformed
# manifest) is detected here instead of being reported as an empty manifest or
# silently running a partial job list.
# Exit status stays 3 for both "render failed" and "no jobs", matching the
# status callers saw before for either condition.
if ! jobs_tsv=$(python - <<'PY'
from pathlib import Path
import sys
sys.path.insert(0, "repo")
from training.fleet.manifest import load
m = load(Path("training_manifest.toml"))
for j in sorted(m.jobs, key=lambda x: -x.priority):
    # Compose hyper as --key value pairs
    hyper = " ".join(f"--{k.replace('_','-')} {v}" for k, v in j.hyper.items())
    print(f"{j.model}\t{j.mode}\t{hyper}")
PY
); then
    echo "failed to render jobs from training_manifest.toml" >&2; exit 3
fi

# A here-string on empty input would yield one empty element, so only split
# when the renderer actually printed something.
JOBS=()
if [[ -n "$jobs_tsv" ]]; then
    mapfile -t JOBS <<<"$jobs_tsv"
fi

if [[ ${#JOBS[@]} -eq 0 ]]; then
    echo "no jobs in manifest!" >&2; exit 3
fi

echo "=== running ${#JOBS[@]} training jobs ==="
declare -i n_done=0 n_skipped=0 n_failed=0
declare -a FAILED=()

# Trainers are launched from inside repo/, so relative data/output paths would
# resolve under repo/ instead of the bundle root where the data lives and where
# the skip-if-present check below looks. Anchor every path to the bundle root.
bundle_root=$PWD

# _build_train_cmd ROOT MODEL MODE HYPER
#   Fills the global array `cmd` with the trainer invocation for one job.
#   ROOT   absolute bundle directory holding data/, artifacts/, reports/
#   MODEL  manifest model name; "transformer_ssl" selects the SSL trainer
#   MODE   manifest mode, passed through as --mode
#   HYPER  whitespace-separated "--key value" flags from the manifest (may be
#          empty); split on whitespace only, never glob-expanded
_build_train_cmd() {
    local root=$1 model=$2 mode=$3 hyper=$4
    local processed="$root/data/processed"
    local -a extra_args=()

    if [[ "$model" == "transformer_ssl" ]]; then
        cmd=(python -m training.trainer.run_ssl
             --mode "$mode"
             --validation "$processed/validation_v1.parquet"
             --tensors    "$processed/tensor_window_v1"
             --out-dir    "$root/artifacts"
             --reports-dir "$root/reports/eval")
    else
        cmd=(python -m training.trainer.run
             --model "$model" --mode "$mode"
             --validation "$processed/validation_v1.parquet"
             --summary    "$processed/features_window_v1.parquet"
             --tensors    "$processed/tensor_window_v1"
             --schema     "$processed/feature_schema_v1.json"
             --out-dir    "$root/artifacts"
             --reports-dir "$root/reports/eval"
             --train-hosts elliott-thinkpad)
    fi

    # Tack on hyperparameters from the manifest. `read -a` word-splits like an
    # unquoted expansion would, but without pathname expansion, so a value
    # such as `*` is passed through literally.
    if [[ -n "$hyper" ]]; then
        IFS=$' \t\n' read -r -a extra_args <<<"$hyper"
        if (( ${#extra_args[@]} > 0 )); then
            cmd+=("${extra_args[@]}")
        fi
    fi
}

for entry in "${JOBS[@]}"; do
    # Each entry is `<model>\t<mode>\t<hyper flags>`.
    IFS=$'\t' read -r model mode hyper <<<"$entry"
    job_label="${model}_${mode}"
    ckpt="artifacts/${job_label}.ckpt.json"
    log="logs/${job_label}.log"

    # Idempotency: a checkpoint stamp from an earlier run means this job is done.
    if [[ -f "$ckpt" ]]; then
        echo "  skip  $job_label  (already present)"
        n_skipped+=1
        continue
    fi

    echo
    echo "── $job_label ────────────────────────────────────"
    started=$(date +%s)

    _build_train_cmd "$bundle_root" "$model" "$mode" "$hyper"

    # The redirection is opened before the subshell changes directory, so the
    # log lands in logs/ under the bundle root. A failing job is recorded and
    # the loop moves on to the next one.
    if (cd repo && "${cmd[@]}") > "$log" 2>&1; then
        elapsed=$(( $(date +%s) - started ))
        echo "  ✓ $job_label done in ${elapsed}s"
        n_done+=1
    else
        rc=$?
        elapsed=$(( $(date +%s) - started ))
        echo "  ✗ $job_label FAILED (rc=$rc, ${elapsed}s) — last 20 lines of log:"
        tail -n 20 "$log"
        FAILED+=("$job_label")
        n_failed+=1
    fi
done

echo
echo "=== training done ==="
echo "  done:    $n_done"
echo "  skipped: $n_skipped"
echo "  failed:  $n_failed"
if [[ $n_failed -gt 0 ]]; then
    echo "  failed jobs: ${FAILED[*]}"
fi

# ─────────────────────────────────────────────────────────────────────
# 3. Eval suite (writes reports/eval/comparison_v2.md + per-model JSON)
# ─────────────────────────────────────────────────────────────────────

echo
echo "=== eval suite ==="
mkdir -p logs
# The eval module runs from inside repo/, so every path is anchored to the
# bundle root; a bare relative path would resolve under repo/ instead.
eval_root=$PWD
eval_rc=0
# Output is shown live and also kept in logs/eval.log, the file the failure
# message points at. The exit status seen here is the eval run's own because
# `pipefail` is enabled at the top of the script. Eval failure is best-effort:
# it is reported but does not abort the bootstrap.
(cd repo && python -m training.eval_.run \
    --validation "$eval_root/data/processed/validation_v1.parquet" \
    --artifacts  "$eval_root/artifacts" \
    --summary    "$eval_root/data/processed/features_window_v1.parquet" \
    --tensors    "$eval_root/data/processed/tensor_window_v1" \
    --reports-dir "$eval_root/reports/eval") 2>&1 | tee logs/eval.log || eval_rc=$?
if [[ $eval_rc -ne 0 ]]; then
    echo "eval reported errors (rc=$eval_rc) — see logs/eval.log" >&2
fi

# ─────────────────────────────────────────────────────────────────────
# 4. Stamp + summarize
# ─────────────────────────────────────────────────────────────────────

# _json_string_array [ITEM...]
#   Prints the items as comma-separated JSON string literals (without the
#   surrounding brackets); prints nothing when called with no items.
#   Backslashes and double quotes are escaped, and commas inside an item are
#   left intact.
_json_string_array() {
    local item sep=""
    for item in "$@"; do
        item=${item//\\/\\\\}
        item=${item//\"/\\\"}
        printf '%s"%s"' "$sep" "$item"
        sep=","
    done
}

# The `${arr[@]+...}` form keeps an empty FAILED array from tripping
# `set -u` on older bash releases.
failed_json=$(_json_string_array ${FAILED[@]+"${FAILED[@]}"})

cat > artifacts/RUN_SUMMARY.json <<EOF
{
  "started_via": "lambda-bootstrap.sh",
  "completed_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
  "host_id": "$(hostname)",
  "n_done": $n_done,
  "n_skipped": $n_skipped,
  "n_failed": $n_failed,
  "failed_jobs": [$failed_json]
}
EOF

echo
echo "✓ bootstrap.sh complete. artifacts/ + reports/eval/ ready for rsync back."