training: parallelize lambda bootstrap (2 jobs at a time on the A100)

At our model sizes (max ~250 K params, max batch 512), each training
process uses ~1 GiB VRAM, so a 40 GiB A100 is far from contention with
two concurrent jobs. A bounded-concurrency rolling launcher cuts the full
14-job manifest from ~3.5 h sequential to ~1.7 h parallel.

  PARALLEL=2 (default) — override via env var if running on a smaller GPU
  or testing the queue logic.

Per-job logs still land at logs/<model>_<mode>.log, and failure reporting
is the same. Idempotency is unchanged: already-present checkpoints are
still skipped.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Max 2026-05-08 12:35:41 -05:00
parent bee40a6ae9
commit 69c563275a

View file

@ -86,10 +86,50 @@ if [[ ${#JOBS[@]} -eq 0 ]]; then
echo "no jobs in manifest!" >&2; exit 3
fi
echo "=== running ${#JOBS[@]} training jobs ==="
echo "=== running ${#JOBS[@]} training jobs (parallelism=${PARALLEL:-2}) ==="
# Bounded-concurrency rolling launcher. Two trainings run on the same
# A100 simultaneously: at our model sizes (max ~250K params, max batch
# 512) each uses ~1 GiB VRAM, so a 40 GiB A100 is far from contention.
# If you change MAX_PARALLEL, recompute against the actual model sizes
# in training/models/ — don't blindly raise it.
MAX_PARALLEL="${PARALLEL:-2}"
# Reject 0, negatives and non-numeric values up front: the throttle loop
# compares the running-job count against MAX_PARALLEL, and with 0 (or a
# string that evaluates to 0) that comparison is always true, so the
# script would spin forever without launching anything.
if ! [[ "$MAX_PARALLEL" =~ ^[1-9][0-9]*$ ]]; then
  echo "PARALLEL must be a positive integer (got '${MAX_PARALLEL}')" >&2
  exit 2
fi

# Job outcome counters and the labels of the jobs that failed.
declare -i n_done=0 n_skipped=0 n_failed=0
declare -a FAILED=()

# Map pid → job_label, started_ts so we can report per-job timing.
declare -A PID_TO_LABEL=()
declare -A PID_TO_STARTED=()
#######################################
# Helper: reap any finished children, update counters, and drop their
# pids from the running set.
#
# Globals:
#   PID_TO_LABEL, PID_TO_STARTED  read; entries of finished pids removed
#   n_done, n_failed, FAILED      updated for every finished pid
# Arguments:
#   the pids currently believed to be running
# Outputs:
#   stdout: exactly one line — the space-separated pids still running
#           (empty line if none). Nothing else is written to stdout so
#           the caller can parse it.
#   stderr: "<label> done in Ns" / "<label> FAILED ..." reports and, on
#           failure, the last 20 lines of logs/<label>.log.
#
# NOTE: call this in the shell that launched the jobs (e.g. with stdout
# redirected to a file), not inside $(...). In a command-substitution
# subshell `wait` cannot collect the launching shell's children and the
# counter/array updates made here are discarded when the subshell exits.
#######################################
reap_finished() {
  local still_running=()
  local p label started rc elapsed
  # Join the result with spaces regardless of the caller's IFS.
  local IFS=' '

  for p in "$@"; do
    if kill -0 "$p" 2>/dev/null; then
      still_running+=("$p")
      continue
    fi

    label="${PID_TO_LABEL[$p]:-?}"
    started="${PID_TO_STARTED[$p]:-0}"
    # Capture the exit status without tripping `set -e` on a failed job.
    rc=0
    wait "$p" 2>/dev/null || rc=$?
    elapsed=$(( $(date +%s) - started ))

    if (( rc == 0 )); then
      echo "$label done in ${elapsed}s" >&2
      n_done=$(( n_done + 1 ))
    else
      echo "$label FAILED (rc=$rc, ${elapsed}s) — last 20 lines:" >&2
      # A missing/unreadable log must not abort the reaper itself.
      tail -n 20 "logs/${label}.log" >&2 || true
      FAILED+=("$label")
      n_failed=$(( n_failed + 1 ))
    fi
    unset "PID_TO_LABEL[$p]" "PID_TO_STARTED[$p]"
  done

  printf '%s\n' "${still_running[*]}"
}
PIDS=()
for entry in "${JOBS[@]}"; do
IFS=$'\t' read -r model mode hyper <<<"$entry"
job_label="${model}_${mode}"
@ -102,8 +142,13 @@ for entry in "${JOBS[@]}"; do
continue
fi
echo
echo "── $job_label ────────────────────────────────────"
# Throttle: wait until there's room for one more.
# reap_finished is run in THIS shell with its stdout sent to a scratch
# file. Capturing it with $(...) would run it in a subshell, where `wait`
# cannot collect our background jobs (every job would look failed) and
# the n_done/n_failed/FAILED updates would be thrown away.
while [[ ${#PIDS[@]} -ge $MAX_PARALLEL ]]; do
  sleep 1
  reap_out=$(mktemp) || { echo "mktemp failed" >&2; exit 1; }
  # `|| true`: a failed job's status inside the reaper must not abort the
  # launcher if errexit is enabled.
  reap_finished "${PIDS[@]}" >"$reap_out" || true
  # The running-pid list is the LAST line; replay anything printed before
  # it (per-job reports) so it still reaches the console.
  sed '$d' "$reap_out"
  IFS=' ' read -ra PIDS <<<"$(tail -n 1 "$reap_out")"
  rm -f -- "$reap_out"
done
echo "── launch $job_label ────────────────────────────────────"
started=$(date +%s)
if [[ "$model" == "transformer_ssl" ]]; then
@ -124,25 +169,26 @@ for entry in "${JOBS[@]}"; do
--reports-dir reports/eval
--train-hosts elliott-thinkpad)
fi
# Tack on hyperparameters from the manifest
if [[ -n "$hyper" ]]; then
# shellcheck disable=SC2206
extra_args=($hyper)
cmd+=("${extra_args[@]}")
fi
if (cd repo && "${cmd[@]}") > "$log" 2>&1; then
elapsed=$(( $(date +%s) - started ))
echo "$job_label done in ${elapsed}s"
n_done+=1
else
rc=$?
elapsed=$(( $(date +%s) - started ))
echo "$job_label FAILED (rc=$rc, ${elapsed}s) — last 20 lines of log:"
tail -20 "$log"
FAILED+=("$job_label")
n_failed+=1
fi
# Launch in background. Each subshell cd's into repo/ for module
# imports; output redirected to per-job log; trainer + torch handle
# multi-process CUDA OK on a single A100.
(cd repo && "${cmd[@]}") > "$log" 2>&1 &
pid=$!
PIDS+=("$pid")
PID_TO_LABEL[$pid]="$job_label"
PID_TO_STARTED[$pid]="$started"
done
# Drain remaining
# reap_finished is run in THIS shell with its stdout sent to a scratch
# file. Capturing it with $(...) would run it in a subshell, where `wait`
# cannot collect our background jobs (every job would look failed) and
# the n_done/n_failed/FAILED updates would be thrown away.
while [[ ${#PIDS[@]} -gt 0 ]]; do
  sleep 1
  reap_out=$(mktemp) || { echo "mktemp failed" >&2; exit 1; }
  # `|| true`: a failed job's status inside the reaper must not abort the
  # launcher if errexit is enabled.
  reap_finished "${PIDS[@]}" >"$reap_out" || true
  # The running-pid list is the LAST line; replay anything printed before
  # it (per-job reports) so it still reaches the console.
  sed '$d' "$reap_out"
  IFS=' ' read -ra PIDS <<<"$(tail -n 1 "$reap_out")"
  rm -f -- "$reap_out"
done
echo