From 4bf241f6ec8fb22c5fd6f3de927d1b3cffb29e2a Mon Sep 17 00:00:00 2001 From: Max Gorog Date: Fri, 8 May 2026 14:17:31 -0500 Subject: [PATCH] code cards: presenter-friendly comments on every block The four code snippets shown on stack and training-code scenes get inline comments explaining the *why* of each line, not just *what*. Aimed at the live audience: a presenter reads the comment as the narration; a reader scans them top-to-bottom for the design story. Covers: pyproject's three install profiles and what each library contributes; receiver's bearer auth and why constant-time compare matters; LSTM model's registry pattern, batch_first transpose, last-step classification head; trainer loop's class weights vs the imbalanced dataset, AMP scaler vs fp16 underflow, cosine + warmup schedule, macro-F1 vs accuracy on imbalanced classes, best-state restore vs last-epoch weights. Co-Authored-By: Claude Opus 4.7 (1M context) --- training/dashboard/static/dashboard.js | 121 +++++++++++++++++++------ training/dashboard/static/index.html | 2 +- 2 files changed, 94 insertions(+), 29 deletions(-) diff --git a/training/dashboard/static/dashboard.js b/training/dashboard/static/dashboard.js index 9f315cf..d39af2e 100644 --- a/training/dashboard/static/dashboard.js +++ b/training/dashboard/static/dashboard.js @@ -845,34 +845,43 @@ // every dep annotated" stance to the audience without making them // open a terminal. (function () { - const PYPROJECT = `[project] + const PYPROJECT = `# Single project, three install profiles. The base "dependencies" +# list is what every host needs (the receiver, the orchestrator, +# the dashboard); training and dev pull in heavier tooling on demand. +[project] name = "cis490" version = "0.0.1" description = "CIS490 behavioral malware detection — dataset, transport, training" requires-python = ">=3.11" + +# Runtime: HTTP receiver + orchestrator + image build. 
dependencies = [ - "starlette>=0.36", - "uvicorn[standard]>=0.27", - "msgpack>=1.0", # MSF RPC wire format for the Tier-3 exploit driver - "pycdlib>=1.14", # build NoCloud cidata ISOs in pure Python + "starlette>=0.36", # ASGI app for the receiver and dashboard + "uvicorn[standard]>=0.27", # production-grade ASGI server + "msgpack>=1.0", # MSF RPC wire format (Tier-3 exploit driver) + "pycdlib>=1.14", # build NoCloud cidata ISOs in pure Python ] [dependency-groups] +# Pulled in only when training. Kept off the receiver Pi. training = [ - "pyarrow>=15", "polars>=1.0", + "pyarrow>=15", "polars>=1.0", # columnar dataset I/O "numpy>=1.26", "scipy>=1.11", - "scikit-learn>=1.4", # KNN, KMeans, PCA, metrics - "xgboost>=2.0", # gradient-boosted trees baseline - "torch>=2.2", # LSTM / GRU / RNN / CNN / Transformer - "zstandard>=0.22", # episode tarball streaming + "scikit-learn>=1.4", # KNN, KMeans, PCA, metrics + "xgboost>=2.0", # gradient-boosted trees baseline + "torch>=2.2", # LSTM / GRU / RNN / CNN / Transformer + "zstandard>=0.22", # streams episode tarballs without buffering ] dev = [ - "pytest>=8", "pytest-asyncio>=0.23", - "httpx>=0.27", "paramiko>=3", - "matplotlib>=3.8", "tornado>=6", + "pytest>=8", "pytest-asyncio>=0.23", # async-aware test runner + "httpx>=0.27", "paramiko>=3", # in-guest HTTP / SSH for tests + "matplotlib>=3.8", "tornado>=6", # plotting (training reports) ] `; - const RECEIVER = `from __future__ import annotations + const RECEIVER = `# The receiver is the public-facing endpoint that ingests episode +# tarballs from fleet hosts. Starlette ASGI for the HTTP surface; +# everything else is intentionally stdlib. 
+from __future__ import annotations import json, logging, secrets, time from pathlib import Path @@ -881,23 +890,29 @@ from starlette.applications import Starlette from starlette.responses import JSONResponse, Response from starlette.routing import Route +# Per-host episodes get streamed onto disk by the EpisodeStore; +# version_gate rejects schemas the analysis pipeline can't read. from .store import EpisodeStore, is_valid_id from .version_gate import VersionGate log = logging.getLogger("cis490.receiver") -SUFFIX = ".tar.zst" -SCHEMA_VERSION = 1 +SUFFIX = ".tar.zst" # zstd-compressed tar — what the fleet ships +SCHEMA_VERSION = 1 # bumped if the on-disk format changes +# Authenticate every upload with a shared bearer token. The +# constant-time compare matters: a naive == stops at the first +# mismatched byte, so timing reveals how much matched — enough for +# a careful attacker to recover the secret one character at a time. def _bearer_check(request, expected): if expected is None: - return None + return None # auth disabled (dev mode) auth = request.headers.get("authorization", "") if not auth.startswith("Bearer "): return JSONResponse({"error": "missing bearer token"}, status_code=401) presented = auth[len("Bearer "):] if not secrets.compare_digest(presented, expected): return JSONResponse({"error": "bad bearer token"}, status_code=401) - return None + return None # auth ok — caller proceeds `; const PY_KEYWORDS = new Set([ @@ -1002,12 +1017,20 @@ can speak to the cell-choice question.""" from __future__ import annotations from torch import nn +# The registry lets the trainer pick a model by string name from +# the training manifest. _SeqBase handles the shared bookkeeping +# (feature selection, standardization, checkpoint I/O) so each +# model class only writes its architecture.
from training.models import register from training.models._torch_seq import _SeqBase @register("lstm") class LSTM(_SeqBase): + # _build_module is called once per training run with shapes + # derived from the actual dataset, not hardcoded constants — + # so the same model class works at any window length / channel + # count. Defaults reflect what produced the leaderboard numbers. def _build_module(self, *, n_channels_in, n_timesteps, n_classes, hidden=128, n_layers=2, dropout=0.1, bidirectional=False): @@ -1018,87 +1041,129 @@ class LSTM(_SeqBase): ) +# Plain PyTorch module; the wrapper above is what the rest of the +# pipeline talks to. Splitting them keeps the model architecture +# pure-torch and easy to inspect / swap. class _LSTMClassifier(nn.Module): def __init__(self, *, n_channels_in, n_classes, hidden, n_layers, dropout, bidirectional): super().__init__() + # batch_first=True: the LSTM takes (batch, time, channels), + # the layout forward() produces by transposing. Stacking layers + # with dropout-between is only meaningful when n_layers > 1. self.lstm = nn.LSTM( input_size=n_channels_in, hidden_size=hidden, num_layers=n_layers, dropout=dropout if n_layers > 1 else 0.0, batch_first=True, bidirectional=bidirectional, ) + # Bidirectional LSTMs concat forward + backward states, so + # the head sees 2× hidden when that flag is on. d_out = hidden * (2 if bidirectional else 1) + # Dropout before the linear head is a cheap regularizer + # without changing the LSTM's own behaviour. self.head = nn.Sequential( nn.Dropout(dropout), nn.Linear(d_out, n_classes), ) + # Dataset gives (batch, channels, time). Transpose to put time + # in the middle so PyTorch's batch_first LSTM accepts it.
def forward(self, x): # (B, C, T) -> (B, T, C) x = x.transpose(1, 2) - out, _ = self.lstm(x) - return self.head(out[:, -1, :]) # last-step classification + out, _ = self.lstm(x) # out: (B, T, hidden*dir) + # Use the last timestep's hidden state for classification — + # by then the LSTM has integrated the whole window. + return self.head(out[:, -1, :]) `; - const TRAIN_LOOP = `def train_nn(*, model, X_train, y_train, X_val, y_val, + const TRAIN_LOOP = `# One generic loop runs every neural model. The model class only +# defines architecture; this loop owns the optimizer, learning-rate +# schedule, mixed precision, gradient clipping, and the early-stop +# bookkeeping. Same code trains LSTM, GRU, CNN, Transformer. +def train_nn(*, model, X_train, y_train, X_val, y_val, n_classes, epochs=60, batch_size=512, base_lr=1e-3, weight_decay=1e-4, warmup_frac=0.05, grad_clip=1.0, patience=8, device="auto") -> TrainResult: """Train a model; return TrainResult with the best-on-val state_dict already loaded back into model.module.""" + # Auto-pick CUDA when present so the same script runs on the + # Pi (CPU) and the A100 (GPU + AMP) without code changes. if device == "auto": device = "cuda" if torch.cuda.is_available() else "cpu" use_amp = device == "cuda" mod = model.module.to(device) - # Inverse-frequency class weights (capped) — clean dominates - # the dataset, so unweighted CE just learns "everything is fine." + # Inverse-frequency class weights (capped). The dataset is + # ~50% infected_running and only ~5% armed — without weighting, + # CE happily ignores the rare classes and reports "good" + # accuracy by predicting the majority class for everything. cw = _compute_class_weights(y_train, n_classes) loss_fn = nn.CrossEntropyLoss( weight=torch.from_numpy(cw).to(device)) + # AdamW = Adam with decoupled weight decay; cleaner regularisation + # than L2-in-the-loss for transformers and recurrent nets. 
opt = torch.optim.AdamW(mod.parameters(), lr=base_lr, weight_decay=weight_decay) + # GradScaler enables mixed-precision training on CUDA: most ops + # run in fp16 (faster, less memory) but the scaler keeps + # gradients in a safe range so they don't underflow to zero. scaler = torch.amp.GradScaler("cuda") if use_amp else None best_f1, best_state, no_improve = -1.0, None, 0 step, total_steps = 0, epochs * len(train_dl) - warmup = int(total_steps * warmup_frac) + warmup = int(total_steps * warmup_frac) # 5% of total = warmup for ep in range(1, epochs + 1): mod.train() for xb, yb in train_dl: xb, yb = xb.to(device), yb.to(device) - # Cosine LR with linear warmup + # Cosine schedule with a linear warmup. Warmup avoids + # the early-training "loss explodes from a fresh AdamW" + # problem; cosine then anneals smoothly toward zero. for g in opt.param_groups: g["lr"] = _cosine_lr(step, total_steps=total_steps, warmup_steps=warmup, base_lr=base_lr) - opt.zero_grad(set_to_none=True) + opt.zero_grad(set_to_none=True) # cheaper than zero_() if use_amp: + # AMP path: forward in autocast, scaler handles + # backward + step so fp16 grads don't underflow. with torch.amp.autocast("cuda"): loss = loss_fn(mod(xb), yb) scaler.scale(loss).backward() scaler.unscale_(opt) + # Grad clip after unscale — recurrent nets can spike + # gradients early in training; clipping keeps them sane. nn.utils.clip_grad_norm_(mod.parameters(), grad_clip) scaler.step(opt); scaler.update() else: + # CPU / fp32 path — no scaler bookkeeping needed. loss = loss_fn(mod(xb), yb) loss.backward() nn.utils.clip_grad_norm_(mod.parameters(), grad_clip) opt.step() step += 1 - # Macro-F1 on val (not accuracy: classes are imbalanced) + # Track the held-out-by-host macro-F1, NOT accuracy. With + # imbalanced classes a constant predictor can hit 0.5 + # accuracy; macro-F1 averages per-class F1, so the rare + # phases actually count. 
f1 = _macro_f1(y_val, _predict(mod, val_dl), n_classes) if f1 > best_f1 + 1e-4: + # New best — remember these weights in memory (no per-epoch disk + # checkpoint). NOTE: state_dict() aliases live tensors; clone to freeze. best_f1, best_state, no_improve = f1, mod.state_dict(), 0 else: + # No improvement; tick the patience counter. no_improve += 1 if no_improve >= patience: - break # early stop + break # early stop — saves an A100-hour or two + # Restore the best-on-val weights. The last epoch's weights are + # almost always worse than the best — overfit creep on train. mod.load_state_dict(best_state) return TrainResult(best_f1=best_f1, best_state=best_state, ...) `; diff --git a/training/dashboard/static/index.html b/training/dashboard/static/index.html index db4d05d..d6cc0c3 100644 --- a/training/dashboard/static/index.html +++ b/training/dashboard/static/index.html @@ -576,6 +576,6 @@ - +