CIS490/scripts/build-lambda-bundle.sh

#!/usr/bin/env bash
# Build a self-contained tarball ready for rsync to a Lambda GPU instance.
#
# Inputs:
#   - The repo at /home/max/.env/CIS490 (or $REPO_ROOT)
#   - data/processed/validation_v1.parquet
#   - data/processed/features_window_v1.parquet
#   - data/processed/feature_schema_v1.json
#   - data/processed/tensor_window_v1/   (npz shards, one per episode)
#
# Output (OUT_DIR defaults to /tmp/cis490-lambda):
#   $OUT_DIR/lambda-bundle-<git-short>.tar.zst
#   $OUT_DIR/lambda-bundle-<git-short>.tar.zst.sha256
#
# What's IN the bundle:
#   - repo/                  (sans .git, sans data/, sans artifacts*, sans .venv*)
#   - data/processed/        (the four artifacts above)
#   - bootstrap.sh           (entrypoint that runs ON Lambda)
#   - training_manifest.toml (the operator's canonical plan; bootstrap loops over jobs)
#   - BUNDLE_MANIFEST.json   (commit, branch, dirty count, build time/host, row and shard counts)
#
# What's NOT in the bundle:
#   - raw .tar.zst episodes (not needed once tensors are pre-built)
#   - .git directory (we ship a code snapshot, not history)
#   - prior artifacts/ (Lambda generates fresh)
#
# Run on the Pi:
#   bash scripts/build-lambda-bundle.sh
set -euo pipefail

# Both locations can be overridden from the environment; the defaults
# below are applied only when the variable is unset or empty.
: "${REPO_ROOT:=/home/max/.env/CIS490}"
: "${OUT_DIR:=/tmp/cis490-lambda}"

# The bundle file name carries the short hash of the checked-out commit.
SHORT=$(cd "$REPO_ROOT" && git rev-parse --short HEAD)
BUNDLE="${OUT_DIR}/lambda-bundle-${SHORT}.tar.zst"

mkdir -p "$OUT_DIR"

# Fail fast: make sure every pre-built input is present before any
# copying or compression starts. The first missing path aborts the run.
processed_dir="$REPO_ROOT/data/processed"
required=(
    "$processed_dir/validation_v1.parquet"
    "$processed_dir/features_window_v1.parquet"
    "$processed_dir/feature_schema_v1.json"
    "$processed_dir/tensor_window_v1"
)
for input_path in "${required[@]}"; do
    # Present (file or directory) -> nothing to report, check the next one.
    [[ -e "$input_path" ]] && continue

    echo "missing required input: $input_path" >&2
    echo "did the Pi-side feature build finish? check data/logs/build_features_full.log" >&2
    exit 1
done

# Scratch staging tree: its contents become the root of the tarball.
# The EXIT trap removes it on every exit path, success or failure.
STAGE="$(mktemp -d)"
trap 'rm -rf "$STAGE"' EXIT

# Pre-built data the Lambda instance needs: three single files, then the
# tensor shard directory (copied recursively).
staged_processed="$STAGE/data/processed"
mkdir -p "$staged_processed"
for artifact in validation_v1.parquet features_window_v1.parquet feature_schema_v1.json; do
    cp "$REPO_ROOT/data/processed/$artifact" "$staged_processed/"
done
cp -r "$REPO_ROOT/data/processed/tensor_window_v1" "$staged_processed/"

# Code snapshot: mirror the repo into $STAGE/repo, leaving out VCS
# history, virtualenvs, Python caches, data, prior artifacts, generated
# reports and VM images/snapshots.
#
# NOTE(review): patterns without a leading slash (e.g. 'data/',
# 'artifacts*/') are not anchored, so rsync applies them at any depth,
# not just at the repo root. Confirm no nested directory with one of
# these names (such as a Python package called "data") has to ship.
# '/tmp/*' is anchored to the top of the transfer, i.e. $REPO_ROOT/tmp/.
snapshot_excludes=(
    '.git/'
    '.venv*/'
    '__pycache__/'
    '*.pyc'
    'data/'
    'artifacts*/'
    'reports/eval/'
    'reports/pca/'
    'reports/xai/'
    'reports/fleet-*/'
    '/tmp/*'
    'vm/images/'
    'vm/snapshots/'
)

# Archive mode plus one --exclude per pattern, in the order listed above.
rsync_opts=(-a)
for pattern in "${snapshot_excludes[@]}"; do
    rsync_opts+=(--exclude="$pattern")
done

mkdir -p "$STAGE/repo"
rsync "${rsync_opts[@]}" "$REPO_ROOT/" "$STAGE/repo/"

# Entry point for the Lambda side: shipped at the bundle root as
# bootstrap.sh and marked executable.
staged_bootstrap="$STAGE/bootstrap.sh"
cp "$REPO_ROOT/scripts/lambda-bootstrap.sh" "$staged_bootstrap"
chmod +x "$staged_bootstrap"

# Job list: the canonical training manifest is shipped under its final
# name. An operator who wants a different plan edits
# etc/training_manifest.toml.example before building, and that edited
# copy is what gets bundled.
staged_plan="$STAGE/training_manifest.toml"
cp "$REPO_ROOT/etc/training_manifest.toml.example" "$staged_plan"

# Manifest pinning — Lambda gets a stamp of what code commit produced
# this bundle, so rerunning against the same data with the same code
# is reproducible.
#
# Every value is computed BEFORE the heredoc. A command substitution that
# fails inside a heredoc body does not stop the script (only cat's exit
# status counts), so a broken git/find call there would silently write an
# empty field. As plain assignments they are subject to `set -e`.
manifest_python="$REPO_ROOT/.venv-training/bin/python"   # follows a $REPO_ROOT override
manifest_parquet="$STAGE/data/processed/validation_v1.parquet"

manifest_commit=$(git -C "$REPO_ROOT" rev-parse HEAD)
manifest_branch=$(git -C "$REPO_ROOT" rev-parse --abbrev-ref HEAD)
# Count of lines printed by `git status --porcelain`; xargs with no
# command just re-echoes its input, trimming any padding from `wc -l`.
manifest_dirty=$(git -C "$REPO_ROOT" status --porcelain | wc -l | xargs)
manifest_built_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
manifest_built_on=$(hostname)
manifest_shards=$(find "$STAGE/data/processed/tensor_window_v1" -name '*.npz' | wc -l | xargs)

# Row count of the staged validation parquet. This stays best-effort (a
# missing venv or pyarrow must not block the build), but a failure is now
# reported on stderr instead of being hidden. The path is passed as an
# argument rather than spliced into the Python source, and only the
# parquet footer metadata is read instead of loading the whole table.
if ! manifest_episodes=$("$manifest_python" -c \
        'import sys; import pyarrow.parquet as pq; print(pq.read_metadata(sys.argv[1]).num_rows)' \
        "$manifest_parquet"); then
    manifest_episodes=""
    echo "warning: could not count rows in $manifest_parquet using $manifest_python; n_episodes left empty" >&2
fi

cat > "$STAGE/BUNDLE_MANIFEST.json" <<EOF
{
  "code_commit": "$manifest_commit",
  "code_commit_short": "$SHORT",
  "code_branch": "$manifest_branch",
  "code_dirty": "$manifest_dirty",
  "built_at": "$manifest_built_at",
  "built_on": "$manifest_built_on",
  "n_episodes": "$manifest_episodes",
  "n_tensor_shards": "$manifest_shards"
}
EOF

# tar.zst (zstd > gzip for both speed and ratio on this kind of payload)
#
# The archive is written to a ".partial" sibling and renamed into place
# only after tar succeeds, so a failed compression never leaves a
# truncated file at the final bundle path (possibly next to a stale
# .sha256 from an earlier build of the same commit).
echo "compressing bundle to $BUNDLE..."
bundle_partial="$BUNDLE.partial"
if ! tar -C "$STAGE" --use-compress-program='zstd -T0 -3' -cf "$bundle_partial" .; then
    rm -f -- "$bundle_partial"
    echo "bundle compression failed; removed incomplete $bundle_partial" >&2
    exit 1
fi
# Same directory, so the rename does not cross filesystems.
mv -f -- "$bundle_partial" "$BUNDLE"

# Stamp the bundle's own sha256 so rsync resume + verify is stable.
# NOTE(review): the checksum line records $BUNDLE exactly as given here
# (normally an absolute path); confirm the verifying side expects that.
sha256sum "$BUNDLE" > "$BUNDLE.sha256"

# Summary for the operator: where the bundle and its checksum landed,
# the bundle's human-readable size, and the follow-up command.
bundle_size=$(du -sh "$BUNDLE" | cut -f1)
cat <<EOF

✓ bundle ready
  $BUNDLE  ($bundle_size)
  $BUNDLE.sha256

next: bash scripts/run-on-lambda.sh ubuntu@<lambda-ip>
EOF