training: parallelize lambda bootstrap (2 jobs at a time on the A100)
At our model sizes (max ~250 K params, max batch 512), each training process uses ~1 GiB of VRAM, so a 40 GiB A100 is far from contention with two concurrent jobs. A bounded-concurrency rolling launcher cuts the full 14-job manifest from ~3.5 h sequential to ~1.7 h parallel. PARALLEL=2 is the default — override it via the env var when running on a smaller GPU or when testing the queue logic. Per-job logs still land at logs/<model>_<mode>.log, and failure reporting is the same. Idempotency is unchanged: jobs whose checkpoints already exist are still skipped. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bee40a6ae9
commit
69c563275a
1 changed files with 62 additions and 16 deletions
|
|
@ -86,10 +86,50 @@ if [[ ${#JOBS[@]} -eq 0 ]]; then
|
|||
echo "no jobs in manifest!" >&2; exit 3
|
||||
fi
|
||||
|
||||
echo "=== running ${#JOBS[@]} training jobs ==="
|
||||
echo "=== running ${#JOBS[@]} training jobs (parallelism=${PARALLEL:-2}) ==="
|
||||
# Bounded-concurrency rolling launcher. Two trainings run on the same
# A100 simultaneously: at our model sizes (max ~250K params, max batch
# 512) each uses ~1 GiB VRAM, so a 40 GiB A100 is far from contention.
# MAX_PARALLEL is taken from the PARALLEL env var (default 2). If you
# raise it, recompute against the actual model sizes in
# training/models/ — don't blindly raise it.
MAX_PARALLEL="${PARALLEL:-2}"

# Reject anything that is not a positive integer. With 0 the throttle
# condition `${#PIDS[@]} -ge $MAX_PARALLEL` is always true and the
# launcher would spin forever; a non-numeric value breaks the comparison.
if ! [[ "$MAX_PARALLEL" =~ ^[1-9][0-9]*$ ]]; then
  echo "PARALLEL must be a positive integer (got '${MAX_PARALLEL}')" >&2
  exit 2
fi

# Job counters (integer attribute so `+=1` is arithmetic) and the labels
# of the jobs that failed.
declare -i n_done=0 n_skipped=0 n_failed=0
declare -a FAILED=()

# Map pid → job_label and pid → start timestamp so per-job timing can be
# reported when the job is reaped.
declare -A PID_TO_LABEL=()
declare -A PID_TO_STARTED=()
|
||||
|
||||
# Helper: reap any finished children, update counters, and drop their
# pids from the running set.
#
# Globals:   PID_TO_LABEL, PID_TO_STARTED (read; entry removed once reaped)
#            n_done, n_failed, FAILED (updated)
# Arguments: the pids currently believed to be running
# Outputs:   stdout — ONLY the space-separated list of pids still running
#                     (an empty line when none are left)
#            stderr — one status line per finished job, plus the last 20
#                     lines of its log when it failed
#
# Status reporting goes to stderr because stdout is the return channel:
# anything else printed there would be parsed by the caller as pids.
#
# NOTE(review): the counter/array updates only reach the launcher if this
# function runs in the current shell. Inside `$(...)` it runs in a
# subshell, so the updates are discarded (and `wait` may be unable to
# collect the parent's children there). Prefer redirecting stdout to a
# file over command substitution.
reap_finished() {
  local still_running=()
  local p label started rc elapsed
  for p in "$@"; do
    # kill -0 only probes for existence; a live pid stays in the set.
    if kill -0 "$p" 2>/dev/null; then
      still_running+=("$p")
      continue
    fi

    label="${PID_TO_LABEL[$p]:-?}"
    started="${PID_TO_STARTED[$p]:-0}"

    # Collect the exit status without tripping `set -e` on a failed job.
    rc=0
    wait "$p" 2>/dev/null || rc=$?
    elapsed=$(( $(date +%s) - started ))

    if [[ $rc -eq 0 ]]; then
      echo " ✓ $label done in ${elapsed}s" >&2
      n_done+=1
    else
      echo " ✗ $label FAILED (rc=$rc, ${elapsed}s) — last 20 lines:" >&2
      # Best effort: a missing log must not abort the whole launcher.
      tail -n 20 "logs/${label}.log" >&2 || true
      FAILED+=("$label")
      n_failed+=1
    fi
    unset "PID_TO_LABEL[$p]" "PID_TO_STARTED[$p]"
  done
  echo "${still_running[@]}"
}
|
||||
|
||||
PIDS=()
|
||||
for entry in "${JOBS[@]}"; do
|
||||
IFS=$'\t' read -r model mode hyper <<<"$entry"
|
||||
job_label="${model}_${mode}"
|
||||
|
|
@ -102,8 +142,13 @@ for entry in "${JOBS[@]}"; do
|
|||
continue
|
||||
fi
|
||||
|
||||
echo
|
||||
echo "── $job_label ────────────────────────────────────"
|
||||
# Throttle: wait until there's room for one more.
#
# reap_finished must run in THIS shell: inside `$(...)` it would run in a
# subshell and its updates to n_done / n_failed / FAILED and the pid maps
# would be discarded. Its stdout is therefore redirected to a temp file
# and the pid list is read back from the file's last line.
while [[ ${#PIDS[@]} -ge $MAX_PARALLEL ]]; do
  sleep 1
  reap_out=$(mktemp) || { echo "mktemp failed" >&2; exit 1; }
  # `|| true` keeps a failed job's non-zero `wait` status inside
  # reap_finished from aborting the launcher if errexit is enabled.
  reap_finished "${PIDS[@]}" > "$reap_out" || true
  # The last stdout line is always the list of still-running pids.
  read -ra PIDS <<<"$(tail -n 1 "$reap_out")"
  # Replay any status lines the function printed before the pid list.
  sed '$d' "$reap_out"
  rm -f -- "$reap_out"
done
|
||||
|
||||
echo "── launch $job_label ────────────────────────────────────"
|
||||
started=$(date +%s)
|
||||
|
||||
if [[ "$model" == "transformer_ssl" ]]; then
|
||||
|
|
@ -124,25 +169,26 @@ for entry in "${JOBS[@]}"; do
|
|||
--reports-dir reports/eval
|
||||
--train-hosts elliott-thinkpad)
|
||||
fi
|
||||
# Tack on hyperparameters from the manifest
|
||||
if [[ -n "$hyper" ]]; then
|
||||
# shellcheck disable=SC2206
|
||||
extra_args=($hyper)
|
||||
cmd+=("${extra_args[@]}")
|
||||
fi
|
||||
|
||||
if (cd repo && "${cmd[@]}") > "$log" 2>&1; then
|
||||
elapsed=$(( $(date +%s) - started ))
|
||||
echo " ✓ $job_label done in ${elapsed}s"
|
||||
n_done+=1
|
||||
else
|
||||
rc=$?
|
||||
elapsed=$(( $(date +%s) - started ))
|
||||
echo " ✗ $job_label FAILED (rc=$rc, ${elapsed}s) — last 20 lines of log:"
|
||||
tail -20 "$log"
|
||||
FAILED+=("$job_label")
|
||||
n_failed+=1
|
||||
fi
|
||||
# Launch in background. Each subshell cd's into repo/ for module
|
||||
# imports; output redirected to per-job log; trainer + torch handle
|
||||
# multi-process CUDA OK on a single A100.
|
||||
(cd repo && "${cmd[@]}") > "$log" 2>&1 &
|
||||
pid=$!
|
||||
PIDS+=("$pid")
|
||||
PID_TO_LABEL[$pid]="$job_label"
|
||||
PID_TO_STARTED[$pid]="$started"
|
||||
done
|
||||
|
||||
# Drain remaining: poll once a second until every launched job is reaped.
#
# reap_finished must run in THIS shell: inside `$(...)` it would run in a
# subshell and its updates to n_done / n_failed / FAILED and the pid maps
# would be discarded. Its stdout is therefore redirected to a temp file
# and the pid list is read back from the file's last line.
while [[ ${#PIDS[@]} -gt 0 ]]; do
  sleep 1
  reap_out=$(mktemp) || { echo "mktemp failed" >&2; exit 1; }
  # `|| true` keeps a failed job's non-zero `wait` status inside
  # reap_finished from aborting the launcher if errexit is enabled.
  reap_finished "${PIDS[@]}" > "$reap_out" || true
  # The last stdout line is always the list of still-running pids.
  read -ra PIDS <<<"$(tail -n 1 "$reap_out")"
  # Replay any status lines the function printed before the pid list.
  sed '$d' "$reap_out"
  rm -f -- "$reap_out"
done
|
||||
|
||||
echo
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue