training: parallelize lambda bootstrap (2 jobs at a time on the A100)

At our model sizes (max ~250K params, max batch 512), each training process uses ~1 GiB of VRAM, so a 40 GiB A100 is nowhere near contention with two concurrent jobs. A bounded-concurrency rolling launcher cuts the full 14-job manifest from ~3.5 h sequential to ~1.7 h in parallel. PARALLEL defaults to 2; override it via the environment when running on a smaller GPU or when testing the queue logic. Per-job logs still land at logs/<model>_<mode>.log, and failure reporting is the same. Idempotency is unchanged: jobs whose checkpoints already exist are still skipped. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 12:35:41 -05:00 · 2026-05-08 12:35:41 -05:00 · 69c563275a
commit 69c563275a
parent bee40a6ae9
1 changed files with 62 additions and 16 deletions
--- a/scripts/lambda-bootstrap.sh
+++ b/scripts/lambda-bootstrap.sh
@ -86,10 +86,50 @@ if [[ ${#JOBS[@]} -eq 0 ]]; then
    echo "no jobs in manifest!" >&2; exit 3
 fi

-echo "=== running ${#JOBS[@]} training jobs ==="
+# Bounded-concurrency rolling launcher. By default two trainings run on
+# the same A100 simultaneously: at our model sizes (max ~250K params, max
+# batch 512) each uses ~1 GiB VRAM, so a 40 GiB A100 is far from contention.
+# If you change MAX_PARALLEL, recompute against the actual model sizes
+# in training/models/ — don't blindly raise it.
+MAX_PARALLEL="${PARALLEL:-2}"
+# Fail fast on an unusable limit: the throttle loop below waits while the
+# running-job count is >= MAX_PARALLEL, so 0 would never admit a job and
+# the script would sleep forever. Non-numeric values are rejected too.
+if ! [[ "$MAX_PARALLEL" =~ ^[1-9][0-9]*$ ]]; then
+    echo "PARALLEL must be a positive integer, got '${MAX_PARALLEL}'" >&2; exit 2
+fi
+# Report the limit that is actually enforced, not a re-derived default.
+echo "=== running ${#JOBS[@]} training jobs (parallelism=${MAX_PARALLEL}) ==="
+
 declare -i n_done=0 n_skipped=0 n_failed=0
 declare -a FAILED=()
+# Two lookup tables keyed by background pid, so a finished job can be
+# reported with its name and duration:
+#   PID_TO_LABEL[pid]   → job label ("<model>_<mode>")
+#   PID_TO_STARTED[pid] → launch time in epoch seconds (date +%s)
+declare -A PID_TO_LABEL=()
+declare -A PID_TO_STARTED=()

+# Helper: reap any finished children, update counters, drop their pids
+# from the running set. Returns through stdout the new running pid list.
+reap_finished() {
+    local still_running=()
+    local p label started rc elapsed
+    for p in "$@"; do
+        if kill -0 "$p" 2>/dev/null; then
+            still_running+=("$p")
+        else
+            label="${PID_TO_LABEL[$p]:-?}"
+            started="${PID_TO_STARTED[$p]:-0}"
+            wait "$p" 2>/dev/null
+            rc=$?
+            elapsed=$(( $(date +%s) - started ))
+            if [[ $rc -eq 0 ]]; then
+                echo "  ✓ $label done in ${elapsed}s"
+                n_done+=1
+            else
+                echo "  ✗ $label FAILED (rc=$rc, ${elapsed}s) — last 20 lines:"
+                tail -20 "logs/${label}.log"
+                FAILED+=("$label")
+                n_failed+=1
+            fi
+            unset "PID_TO_LABEL[$p]" "PID_TO_STARTED[$p]"
+        fi
+    done
+    echo "${still_running[@]}"
+}
+
+PIDS=()
 for entry in "${JOBS[@]}"; do
    IFS=$'\t' read -r model mode hyper <<<"$entry"
    job_label="${model}_${mode}"
@ -102,8 +142,13 @@ for entry in "${JOBS[@]}"; do
        continue
    fi

-    echo
-    echo "── $job_label ────────────────────────────────────"
+    # Throttle: wait until there's room for one more. Polls once per
+    # second and replaces PIDS with the pid list reap_finished prints.
+    # NOTE(review): $(...) runs reap_finished in a subshell, so its
+    # updates to n_done / n_failed / FAILED are not visible here, and
+    # `read` consumes only the first line of its output — confirm that
+    # line is always the pid list.
+    while [[ ${#PIDS[@]} -ge $MAX_PARALLEL ]]; do
+        sleep 1
+        read -ra PIDS <<<"$(reap_finished "${PIDS[@]}")"
+    done
+
+    echo "── launch $job_label ────────────────────────────────────"
    started=$(date +%s)

    if [[ "$model" == "transformer_ssl" ]]; then
@ -124,25 +169,26 @@ for entry in "${JOBS[@]}"; do
             --reports-dir reports/eval
             --train-hosts elliott-thinkpad)
    fi
-    # Tack on hyperparameters from the manifest
    if [[ -n "$hyper" ]]; then
        # shellcheck disable=SC2206
        extra_args=($hyper)
        cmd+=("${extra_args[@]}")
    fi

-    if (cd repo && "${cmd[@]}") > "$log" 2>&1; then
-        elapsed=$(( $(date +%s) - started ))
-        echo "  ✓ $job_label done in ${elapsed}s"
-        n_done+=1
-    else
-        rc=$?
-        elapsed=$(( $(date +%s) - started ))
-        echo "  ✗ $job_label FAILED (rc=$rc, ${elapsed}s) — last 20 lines of log:"
-        tail -20 "$log"
-        FAILED+=("$job_label")
-        n_failed+=1
-    fi
+    # Run the job as a background subshell: the subshell cd's into repo/
+    # for module imports, and both output streams go to the per-job log.
+    # Original author's note: trainer + torch handle multi-process CUDA
+    # OK on a single A100.
+    (cd repo && "${cmd[@]}") >"$log" 2>&1 &
+    pid=$!
+    # Register the new pid: timing and label tables, then the running set.
+    PID_TO_STARTED[$pid]="$started"
+    PID_TO_LABEL[$pid]="$job_label"
+    PIDS+=("$pid")
+done
+
+# Drain remaining: after the last launch, keep polling once per second
+# until the running set is empty.
+# NOTE(review): same caveat as the throttle loop — $(...) runs
+# reap_finished in a subshell, so counter/FAILED updates made there are
+# lost, and this loop only ends if the first line of its output is the
+# (eventually empty) pid list. Confirm before relying on the summary.
+while [[ ${#PIDS[@]} -gt 0 ]]; do
+    sleep 1
+    read -ra PIDS <<<"$(reap_finished "${PIDS[@]}")"
 done

 echo