# CIS490 training fleet manifest — example/template.
#
# This is the ONLY thing the operator edits to control what gets trained
# across the training fleet. Mirrors the collection-side manifest.toml in
# spirit: a single canonical file, no per-host overrides, every host loads
# THIS exact file when it claims its next job.
#
# Copy to /etc/cis490/training_manifest.toml on the Pi (the receiver) and
# the receiver loads it on startup + on SIGHUP. Workers don't read it
# directly; they ask the receiver for jobs that match their capability.
#
# To change the fleet's plan:
#   1. Edit this file
#   2. systemctl reload cis490-receiver   (or send SIGHUP)
#   3. New jobs become claimable; in-flight jobs continue
#
# To add a new training host (e.g., your desktop):
#   1. Add a [hosts.<name>] table below with its declared capabilities
#   2. Run scripts/install-training-worker-{linux,windows}.{sh,ps1} on it
#   3. The worker connects, reports its capability, and starts claiming
#      jobs whose constraints it satisfies
#
# Formatting: TOML needs one key/value pair (or one table header) per
# line, and a `#` comments out the rest of its line. Keep this layout
# when editing; joining lines either breaks parsing or silently turns
# settings into comments.

schema_version = 1
name = "cis490-training-v1"

# --------------------------------------------------------------------
# [defaults] — applied to every job unless the job overrides
# --------------------------------------------------------------------
[defaults]
split_recipe = "host"  # host | sample | time
train_hosts = ["elliott-thinkpad"]  # which hosts' episodes train; rest = test
seed = 0
n_resamples = 1000  # bootstrap CIs

# --------------------------------------------------------------------
# [hosts.<name>] — declared capability for each known training host
# --------------------------------------------------------------------
# These declarations are *advisory*. The worker ALSO self-detects
# capability at startup; the receiver intersects the two and uses the
# more restrictive set. So if you say a host has a 2070 Super here but
# the worker doesn't actually find CUDA, the worker is treated as CPU-only
# and won't claim cuda-required jobs. This prevents misconfiguration.

[hosts.office-print]
description = "the Pi (receiver). CPU-only, slow. Useful for GBT smoke runs."
priority = 0  # higher number = pick this host first when multiple eligible
allow_jobs = ["gbt", "mlp"]  # whitelist of model names this host may run
deny_jobs = []  # blacklist; deny wins over allow

[hosts.spectral-desktop]
description = "operator desktop. RTX 2070 Super (~8 GiB VRAM)."
priority = 100
# allow_jobs = []  # empty list (or absent) = all jobs allowed

# Add more hosts here as you enroll them. Names must match the worker's
# self-reported hostname (or its FLEET_HOST_ID env var override).

# --------------------------------------------------------------------
# [[jobs]] — the training plan. One entry per (model, mode) you want
# trained. Add or remove freely; the receiver re-syncs the queue
# against the file on SIGHUP.
# --------------------------------------------------------------------

# ============ Tier 1: tree + dense baselines (CPU-friendly) ============

[[jobs]]
name = "gbt-realistic"
model = "gbt"
mode = "realistic"
priority = 100  # higher = picked first when multiple eligible
require_cuda = false  # no GPU needed; CPU is fine
min_ram_gib = 4

[[jobs]]
name = "gbt-oracle"
model = "gbt"
mode = "oracle"
priority = 100
require_cuda = false
min_ram_gib = 4

[[jobs]]
name = "mlp-realistic"
model = "mlp"
mode = "realistic"
priority = 90
require_cuda = false  # tiny MLP — CPU OK, GPU nice
min_ram_gib = 4
# hyper.* keys must match flags accepted by training/trainer/run.py
# (currently: --epochs, --batch-size, --lr, --patience). Architecture-
# specific knobs (hidden, n_layers, dropout) are baked into the model
# class defaults; override them by editing the model file rather than
# via the manifest until run.py grows the corresponding flags.
hyper.epochs = 60
hyper.batch_size = 1024
hyper.lr = 1e-3

[[jobs]]
name = "mlp-oracle"
model = "mlp"
mode = "oracle"
priority = 90
require_cuda = false
min_ram_gib = 4

# ============ Tier 2: sequence models (GPU strongly preferred) =========

[[jobs]]
name = "cnn-realistic"
model = "cnn"
mode = "realistic"
priority = 80
require_cuda = false  # 1D-CNN is small enough to run on CPU
prefer_cuda = true  # but route to a GPU host if available
min_vram_gib = 1
hyper.epochs = 60
hyper.batch_size = 512

[[jobs]]
name = "cnn-oracle"
model = "cnn"
mode = "oracle"
priority = 80
require_cuda = false
prefer_cuda = true
min_vram_gib = 1

[[jobs]]
name = "gru-realistic"
model = "gru"
mode = "realistic"
priority = 70
require_cuda = true  # RNNs slow on CPU; require GPU
min_vram_gib = 2

[[jobs]]
name = "gru-oracle"
model = "gru"
mode = "oracle"
priority = 70
require_cuda = true
min_vram_gib = 2

[[jobs]]
name = "lstm-realistic"
model = "lstm"
mode = "realistic"
priority = 60
require_cuda = true
min_vram_gib = 2

[[jobs]]
name = "lstm-oracle"
model = "lstm"
mode = "oracle"
priority = 60
require_cuda = true
min_vram_gib = 2

[[jobs]]
name = "transformer-realistic"
model = "transformer"
mode = "realistic"
priority = 50
require_cuda = true
min_vram_gib = 4
hyper.epochs = 80
hyper.batch_size = 256

[[jobs]]
name = "transformer-oracle"
model = "transformer"
mode = "oracle"
priority = 50
require_cuda = true
min_vram_gib = 4
hyper.epochs = 80
hyper.batch_size = 256

# ============ Tier 3: self-supervised pretrain (GPU recommended) =======

[[jobs]]
name = "transformer-ssl-realistic"
model = "transformer_ssl"
mode = "realistic"
priority = 40
require_cuda = true
min_vram_gib = 4
hyper.epochs = 100
# NOTE(review): target_fpr is not among the run.py flags listed in the
# mlp-realistic comment (--epochs, --batch-size, --lr, --patience) and is
# not set on transformer-ssl-oracle — confirm the trainer accepts it.
hyper.target_fpr = 0.05

[[jobs]]
name = "transformer-ssl-oracle"
model = "transformer_ssl"
mode = "oracle"
priority = 40
require_cuda = true
min_vram_gib = 4
hyper.epochs = 100

# Notes on the priority field:
#   - Higher number = claimed first when multiple jobs are eligible
#   - Tier 1 (cheap, fast, foundational) > Tier 2 (slower) > Tier 3 (research)
#   - You can override on a per-job basis if e.g. you want to rush a
#     specific architecture
#
# Notes on require_cuda vs prefer_cuda:
#   - require_cuda = true: only CUDA workers can claim
#   - prefer_cuda = true:  any worker can claim, but CUDA workers are preferred
#                          (the receiver waits ~5 min for a CUDA worker
#                          before letting a CPU worker take it)
#
# Notes on hyperparameters:
#   - All hyper.* keys are passed to training/trainer/run.py as
#     --<key> <value> flags (presumably with underscores turned into
#     hyphens, e.g. hyper.batch_size -> --batch-size; verify in run.py)
#   - Unset keys fall back to the trainer's defaults
#   - The receiver hashes the full (model, mode, hyper) blob into job_id
#     so the same job always produces the same id; re-queueing is idempotent