# CIS490/etc/training_manifest.toml.example

# CIS490 training fleet manifest — example/template.
#
# This is the ONLY thing the operator edits to control what gets trained
# across the training fleet. Mirrors the collection-side manifest.toml in
# spirit: a single canonical file, no per-host overrides; every job a
# host claims is drawn from THIS exact file (as loaded by the receiver).
#
# Copy to /etc/cis490/training_manifest.toml on the Pi (the receiver) and
# the receiver loads it on startup + on SIGHUP. Workers don't read it
# directly; they ask the receiver for jobs that match their capability.
#
# To change the fleet's plan:
#   1. Edit this file
#   2. systemctl reload cis490-receiver        (or send SIGHUP)
#   3. New jobs become claimable; in-flight jobs continue
#
# To add a new training host (e.g., your desktop):
#   1. Add a [hosts.<name>] table for it below with its declared capabilities
#   2. Run scripts/install-training-worker-{linux,windows}.{sh,ps1} on it
#   3. The worker connects, reports its capability, and starts claiming
#      jobs whose constraints it satisfies

# Manifest identity. These are root-table keys, so they must stay above
# the first [table] header in the file.
schema_version = 1
name = "cis490-training-v1"

# --------------------------------------------------------------------
# [defaults] — applied to every job unless the job overrides
# --------------------------------------------------------------------
[defaults]
# Split strategy; one of: host | sample | time.
split_recipe = "host"
# Hosts whose episodes are used for training; every other host is test.
train_hosts = ["elliott-thinkpad"]
seed = 0
# Number of resamples used for the bootstrap CIs.
n_resamples = 1_000

# --------------------------------------------------------------------
# [hosts.<name>] — declared capability for each known training host
# --------------------------------------------------------------------
# These declarations are *advisory*. The worker ALSO self-detects
# capability at startup; the receiver intersects the two and uses the
# more restrictive set. So if you say a host has a 2070 Super here but
# the worker doesn't actually find CUDA, the worker is treated as CPU-only
# and won't claim cuda-required jobs. This prevents misconfiguration.
[hosts.office-print]
description = "the Pi (receiver). CPU-only, slow. Useful for GBT smoke runs."
# Host priority: when several hosts are eligible, the higher number is
# picked first.
priority = 0
# Whitelist of model names this host may run.
allow_jobs = ["gbt", "mlp"]
# Blacklist of model names; a deny entry wins over an allow entry.
deny_jobs = []

[hosts.spectral-desktop]
description = "operator desktop. RTX 2070 Super (~8 GiB VRAM)."
priority = 100
# allow_jobs is intentionally left unset: an absent (or empty) list
# means every job is allowed on this host.
# allow_jobs = []

# Add more hosts here as you enroll them. Names must match the worker's
# self-reported hostname (or its FLEET_HOST_ID env var override).

# --------------------------------------------------------------------
# [[jobs]] — the training plan. One entry per (model, mode) you want
# trained. Add or remove freely; the receiver re-syncs the queue
# against the file on SIGHUP.
# --------------------------------------------------------------------

# ============ Tier 1: tree + dense baselines (CPU-friendly) ============

[[jobs]]
name = "gbt-realistic"
model = "gbt"
mode = "realistic"
# Job priority: higher is picked first when multiple jobs are eligible.
priority = 100
# No GPU needed; CPU-only workers are fine for this job.
require_cuda = false
min_ram_gib = 4

[[jobs]]
# Oracle-mode counterpart of the GBT job; same priority and resource
# constraints as the realistic run.
name = "gbt-oracle"
model = "gbt"
mode = "oracle"
priority = 100
require_cuda = false
min_ram_gib = 4

[[jobs]]
name = "knn-realistic"
model = "knn"
mode = "realistic"
# Scheduled right after GBT: fastest non-parametric baseline.
priority = 95
require_cuda = false
min_ram_gib = 4
# No hyper.* keys on purpose: k=10 and weights=distance live in the KNN
# model class. Overriding them here requires adding --k / --weights to
# training/trainer/run.py first; until then such hyper.* keys would
# fail with the unknown-arg exit-2 issue.

[[jobs]]
# Oracle-mode counterpart of the KNN job; same priority and resource
# constraints as the realistic run.
name = "knn-oracle"
model = "knn"
mode = "oracle"
priority = 95
require_cuda = false
min_ram_gib = 4

# Semi-supervised KNN (self-training) — answers "if we only had 20% of
# labels, could we recover most of supervised KNN's accuracy?" by
# pseudo-labeling the rest via confidence-filtered KNN-vote and
# retraining. Comparing knn vs knn_semi at the same data scale tells
# you whether the unlabeled rest is recoverable.
[[jobs]]
# Semi-supervised (self-training) KNN, realistic mode. Runs on CPU.
name = "knn-semi-realistic"
model = "knn_semi"
mode = "realistic"
priority = 85
require_cuda = false
min_ram_gib = 4

[[jobs]]
# Semi-supervised (self-training) KNN, oracle mode. Runs on CPU.
name = "knn-semi-oracle"
model = "knn_semi"
mode = "oracle"
priority = 85
require_cuda = false
min_ram_gib = 4

[[jobs]]
name = "mlp-realistic"
model = "mlp"
mode = "realistic"
priority = 90
# Tiny MLP: CPU is OK, a GPU is merely nice to have.
require_cuda = false
min_ram_gib = 4
# Every hyper.* key has to correspond to a flag that
# training/trainer/run.py accepts (currently: --epochs, --batch-size,
# --lr, --patience). Architecture-specific knobs (hidden, n_layers,
# dropout) are model-class defaults; change them in the model file
# rather than here until run.py grows the corresponding flags.
# NOTE(review): hyper.batch_size presumably reaches run.py as
# --batch-size; confirm how underscores in keys are translated.
hyper.epochs = 60
hyper.batch_size = 1024
hyper.lr = 1e-3

[[jobs]]
# Oracle-mode MLP. Sets no hyper.* keys, so the trainer's defaults apply.
name = "mlp-oracle"
model = "mlp"
mode = "oracle"
priority = 90
require_cuda = false
min_ram_gib = 4

# ============ Tier 2: sequence models (GPU strongly preferred) =========

[[jobs]]
name = "cnn-realistic"
model = "cnn"
mode = "realistic"
priority = 80
# The 1D-CNN is small enough to run on CPU, so CUDA is not mandatory ...
require_cuda = false
# ... but the job should be routed to a GPU host if one is available.
prefer_cuda = true
# NOTE(review): this file does not say whether min_vram_gib is also
# enforced against CPU-only workers under prefer_cuda — confirm.
min_vram_gib = 1
hyper.epochs = 60
hyper.batch_size = 512

[[jobs]]
# Oracle-mode CNN: CUDA is preferred but not required. Sets no hyper.*
# keys, so the trainer's defaults apply.
name = "cnn-oracle"
model = "cnn"
mode = "oracle"
priority = 80
require_cuda = false
prefer_cuda = true
min_vram_gib = 1

[[jobs]]
name = "gru-realistic"
model = "gru"
mode = "realistic"
priority = 70
# RNNs are slow on CPU, so only CUDA workers may claim this job.
require_cuda = true
min_vram_gib = 2

[[jobs]]
# Oracle-mode GRU; CUDA workers only.
name = "gru-oracle"
model = "gru"
mode = "oracle"
priority = 70
require_cuda = true
min_vram_gib = 2

[[jobs]]
# Realistic-mode LSTM; CUDA workers only.
name = "lstm-realistic"
model = "lstm"
mode = "realistic"
priority = 60
require_cuda = true
min_vram_gib = 2

[[jobs]]
# Oracle-mode LSTM; CUDA workers only.
name = "lstm-oracle"
model = "lstm"
mode = "oracle"
priority = 60
require_cuda = true
min_vram_gib = 2

[[jobs]]
# Realistic-mode transformer; CUDA workers only.
name = "transformer-realistic"
model = "transformer"
mode = "realistic"
priority = 50
require_cuda = true
min_vram_gib = 4
# Same hyper.* values as the transformer-oracle job.
hyper.epochs = 80
hyper.batch_size = 256

[[jobs]]
# Oracle-mode transformer; CUDA workers only.
name = "transformer-oracle"
model = "transformer"
mode = "oracle"
priority = 50
require_cuda = true
min_vram_gib = 4
# Same hyper.* values as the transformer-realistic job.
hyper.epochs = 80
hyper.batch_size = 256

# ============ Tier 3: self-supervised pretrain (GPU recommended) =======

[[jobs]]
# Realistic-mode self-supervised transformer; CUDA workers only.
name = "transformer-ssl-realistic"
model = "transformer_ssl"
mode = "realistic"
priority = 40
require_cuda = true
min_vram_gib = 4
hyper.epochs = 100
# hyper.target_fpr is disabled. hyper.* keys are forwarded to
# training/trainer/run.py as command-line flags, and the flags it is
# documented to accept are --epochs, --batch-size, --lr and --patience.
# An unrecognised flag makes the run fail with exit status 2. Add the
# flag to run.py first, then uncomment the line below.
# hyper.target_fpr = 0.05

[[jobs]]
# Oracle-mode self-supervised transformer; CUDA workers only.
name = "transformer-ssl-oracle"
model = "transformer_ssl"
mode = "oracle"
priority = 40
require_cuda = true
min_vram_gib = 4
hyper.epochs = 100

# Notes on the priority field:
#   - Higher number = claimed first when multiple jobs are eligible
#   - Tier 1 (cheap, fast, foundational) > Tier 2 (slower) > Tier 3 (research)
#   - You can override on a per-job basis if e.g. you want to rush a
#     specific architecture
#
# Notes on require_cuda vs prefer_cuda:
#   - require_cuda = true: only CUDA workers can claim
#   - prefer_cuda = true: any worker can claim, but CUDA workers are preferred
#                         (the receiver waits ~5 min for a CUDA worker
#                         before letting a CPU worker take it)
#
# Notes on hyperparameters:
#   - All hyper.* keys are passed to training/trainer/run.py as --<key>
#   - Unset keys fall back to the trainer's defaults
#   - The receiver hashes the full (model, mode, hyper) blob into job_id
#     so the same job always produces the same id; re-queueing is idempotent