From 4bf241f6ec8fb22c5fd6f3de927d1b3cffb29e2a Mon Sep 17 00:00:00 2001
From: Max Gorog <mgorog@gmail.com>
Date: Fri, 8 May 2026 14:17:31 -0500
Subject: [PATCH] code cards: presenter-friendly comments on every block

The four code snippets shown on stack and training-code scenes get
inline comments explaining the *why* of each line, not just *what*.
Aimed at the live audience: a presenter reads the comment as the
narration; a reader scans them top-to-bottom for the design story.

Covers: pyproject's three install profiles and what each library
contributes; receiver's bearer auth and why constant-time compare
matters; LSTM model's registry pattern, batch_first transpose,
last-step classification head; trainer loop's class weights vs the
imbalanced dataset, AMP scaler vs fp16 underflow, cosine + warmup
schedule, macro-F1 vs accuracy on imbalanced classes, best-state
restore vs last-epoch weights.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 training/dashboard/static/dashboard.js | 121 +++++++++++++++++++------
 training/dashboard/static/index.html   |   2 +-
 2 files changed, 94 insertions(+), 29 deletions(-)

diff --git a/training/dashboard/static/dashboard.js b/training/dashboard/static/dashboard.js
index 9f315cf..d39af2e 100644
--- a/training/dashboard/static/dashboard.js
+++ b/training/dashboard/static/dashboard.js
@@ -845,34 +845,43 @@
   // every dep annotated" stance to the audience without making them
   // open a terminal.
   (function () {
-    const PYPROJECT = `[project]
+    const PYPROJECT = `# Single project, three install profiles. The base "dependencies"
+# list is what every host needs (the receiver, the orchestrator,
+# the dashboard); training and dev pull in heavier tooling on demand.
+[project]
 name = "cis490"
 version = "0.0.1"
 description = "CIS490 behavioral malware detection — dataset, transport, training"
 requires-python = ">=3.11"
+
+# Runtime: HTTP receiver + orchestrator + image build.
 dependencies = [
-    "starlette>=0.36",
-    "uvicorn[standard]>=0.27",
-    "msgpack>=1.0",   # MSF RPC wire format for the Tier-3 exploit driver
-    "pycdlib>=1.14",  # build NoCloud cidata ISOs in pure Python
+    "starlette>=0.36",       # ASGI app for the receiver and dashboard
+    "uvicorn[standard]>=0.27",  # production-grade ASGI server
+    "msgpack>=1.0",          # MSF RPC wire format (Tier-3 exploit driver)
+    "pycdlib>=1.14",          # build NoCloud cidata ISOs in pure Python
 ]
 
 [dependency-groups]
+# Pulled in only when training. Kept off the receiver Pi.
 training = [
-    "pyarrow>=15", "polars>=1.0",
+    "pyarrow>=15", "polars>=1.0",  # columnar dataset I/O
     "numpy>=1.26", "scipy>=1.11",
-    "scikit-learn>=1.4",   # KNN, KMeans, PCA, metrics
-    "xgboost>=2.0",        # gradient-boosted trees baseline
-    "torch>=2.2",          # LSTM / GRU / RNN / CNN / Transformer
-    "zstandard>=0.22",     # episode tarball streaming
+    "scikit-learn>=1.4",     # KNN, KMeans, PCA, metrics
+    "xgboost>=2.0",           # gradient-boosted trees baseline
+    "torch>=2.2",             # LSTM / GRU / RNN / CNN / Transformer
+    "zstandard>=0.22",        # streams episode tarballs without buffering
 ]
 dev = [
-    "pytest>=8", "pytest-asyncio>=0.23",
-    "httpx>=0.27", "paramiko>=3",
-    "matplotlib>=3.8", "tornado>=6",
+    "pytest>=8", "pytest-asyncio>=0.23",  # async-aware test runner
+    "httpx>=0.27", "paramiko>=3",         # in-guest HTTP / SSH for tests
+    "matplotlib>=3.8", "tornado>=6",      # plotting (training reports)
 ]
 `;
-    const RECEIVER = `from __future__ import annotations
+    const RECEIVER = `# The receiver is the public-facing endpoint that ingests episode
+# tarballs from fleet hosts. Starlette ASGI for the HTTP surface;
+# everything else is intentionally stdlib.
+from __future__ import annotations
 
 import json, logging, secrets, time
 from pathlib import Path
@@ -881,23 +890,29 @@ from starlette.applications import Starlette
 from starlette.responses import JSONResponse, Response
 from starlette.routing import Route
 
+# Per-host episodes get streamed onto disk by the EpisodeStore;
+# version_gate rejects schemas the analysis pipeline can't read.
 from .store import EpisodeStore, is_valid_id
 from .version_gate import VersionGate
 
 log = logging.getLogger("cis490.receiver")
-SUFFIX = ".tar.zst"
-SCHEMA_VERSION = 1
+SUFFIX = ".tar.zst"   # zstd-compressed tar — what the fleet ships
+SCHEMA_VERSION = 1     # bumped if the on-disk format changes
 
+# Authenticate every upload with a shared bearer token. The
+# constant-time compare matters: a naive == stops at the first
+# mismatched byte, so response timing leaks how much of the guess
+# was right and lets an attacker recover the secret byte by byte.
 def _bearer_check(request, expected):
     if expected is None:
-        return None
+        return None  # auth disabled (dev mode)
     auth = request.headers.get("authorization", "")
     if not auth.startswith("Bearer "):
         return JSONResponse({"error": "missing bearer token"}, status_code=401)
     presented = auth[len("Bearer "):]
     if not secrets.compare_digest(presented, expected):
         return JSONResponse({"error": "bad bearer token"}, status_code=401)
-    return None
+    return None  # auth ok — caller proceeds
 `;
 
     const PY_KEYWORDS = new Set([
@@ -1002,12 +1017,20 @@ can speak to the cell-choice question."""
 from __future__ import annotations
 from torch import nn
 
+# The registry lets the trainer pick a model by string name from
+# the training manifest. _SeqBase handles the shared bookkeeping
+# (feature selection, standardization, checkpoint I/O) so each
+# model class only writes its architecture.
 from training.models import register
 from training.models._torch_seq import _SeqBase
 
 
 @register("lstm")
 class LSTM(_SeqBase):
+    # _build_module is called once per training run with shapes
+    # derived from the actual dataset, not hardcoded constants —
+    # so the same model class works at any window length / channel
+    # count. Defaults reflect what produced the leaderboard numbers.
     def _build_module(self, *, n_channels_in, n_timesteps,
                       n_classes, hidden=128, n_layers=2,
                       dropout=0.1, bidirectional=False):
@@ -1018,87 +1041,129 @@ class LSTM(_SeqBase):
         )
 
 
+# Plain PyTorch module; the wrapper above is what the rest of the
+# pipeline talks to. Splitting them keeps the model architecture
+# pure-torch and easy to inspect / swap.
 class _LSTMClassifier(nn.Module):
     def __init__(self, *, n_channels_in, n_classes, hidden,
                  n_layers, dropout, bidirectional):
         super().__init__()
+        # batch_first=True so the LSTM takes (batch, time, channels),
+        # the layout forward() produces by transposing its input.
+        # Inter-layer dropout only applies when n_layers > 1.
         self.lstm = nn.LSTM(
             input_size=n_channels_in, hidden_size=hidden,
             num_layers=n_layers,
             dropout=dropout if n_layers > 1 else 0.0,
             batch_first=True, bidirectional=bidirectional,
         )
+        # Bidirectional LSTMs concat forward + backward states, so
+        # the head sees 2× hidden when that flag is on.
         d_out = hidden * (2 if bidirectional else 1)
+        # Dropout before the linear head is a cheap regularizer
+        # without changing the LSTM's own behaviour.
         self.head = nn.Sequential(
             nn.Dropout(dropout),
             nn.Linear(d_out, n_classes),
         )
 
+    # Dataset gives (batch, channels, time). Transpose to put time
+    # in the middle so PyTorch's batch_first LSTM accepts it.
     def forward(self, x):                # (B, C, T) -> (B, T, C)
         x = x.transpose(1, 2)
-        out, _ = self.lstm(x)
-        return self.head(out[:, -1, :])  # last-step classification
+        out, _ = self.lstm(x)            # out: (B, T, hidden*dir)
+        # Use the last timestep's output for classification — by then
+        # the LSTM's forward direction has integrated the whole window.
+        return self.head(out[:, -1, :])
 `;
 
-    const TRAIN_LOOP = `def train_nn(*, model, X_train, y_train, X_val, y_val,
+    const TRAIN_LOOP = `# One generic loop runs every neural model. The model class only
+# defines architecture; this loop owns the optimizer, learning-rate
+# schedule, mixed precision, gradient clipping, and the early-stop
+# bookkeeping. Same code trains LSTM, GRU, CNN, Transformer.
+def train_nn(*, model, X_train, y_train, X_val, y_val,
              n_classes, epochs=60, batch_size=512,
              base_lr=1e-3, weight_decay=1e-4,
              warmup_frac=0.05, grad_clip=1.0,
              patience=8, device="auto") -> TrainResult:
     """Train a model; return TrainResult with the best-on-val
     state_dict already loaded back into model.module."""
+    # Auto-pick CUDA when present so the same script runs on the
+    # Pi (CPU) and the A100 (GPU + AMP) without code changes.
     if device == "auto":
         device = "cuda" if torch.cuda.is_available() else "cpu"
     use_amp = device == "cuda"
     mod = model.module.to(device)
 
-    # Inverse-frequency class weights (capped) — clean dominates
-    # the dataset, so unweighted CE just learns "everything is fine."
+    # Inverse-frequency class weights (capped). The dataset is
+    # ~50% infected_running and only ~5% armed — without weighting,
+    # CE happily ignores the rare classes and reports "good"
+    # accuracy by predicting the majority class for everything.
     cw = _compute_class_weights(y_train, n_classes)
     loss_fn = nn.CrossEntropyLoss(
         weight=torch.from_numpy(cw).to(device))
 
+    # AdamW = Adam with decoupled weight decay; cleaner regularisation
+    # than L2-in-the-loss for transformers and recurrent nets.
     opt = torch.optim.AdamW(mod.parameters(), lr=base_lr,
                              weight_decay=weight_decay)
+    # GradScaler makes mixed precision safe on CUDA: under autocast
+    # most ops run in fp16 (faster, less memory), and the scaler
+    # scales the loss so small gradients don't underflow to zero.
     scaler = torch.amp.GradScaler("cuda") if use_amp else None
 
     best_f1, best_state, no_improve = -1.0, None, 0
     step, total_steps = 0, epochs * len(train_dl)
-    warmup = int(total_steps * warmup_frac)
+    warmup = int(total_steps * warmup_frac)  # default: first 5% of steps
 
     for ep in range(1, epochs + 1):
         mod.train()
         for xb, yb in train_dl:
             xb, yb = xb.to(device), yb.to(device)
-            # Cosine LR with linear warmup
+            # Cosine schedule with a linear warmup. Warmup avoids
+            # the early-training "loss explodes from a fresh AdamW"
+            # problem; cosine then anneals smoothly toward zero.
             for g in opt.param_groups:
                 g["lr"] = _cosine_lr(step,
                     total_steps=total_steps,
                     warmup_steps=warmup, base_lr=base_lr)
-            opt.zero_grad(set_to_none=True)
+            opt.zero_grad(set_to_none=True)  # cheaper than zero_()
             if use_amp:
+                # AMP path: forward in autocast, scaler handles
+                # backward + step so fp16 grads don't underflow.
                 with torch.amp.autocast("cuda"):
                     loss = loss_fn(mod(xb), yb)
                 scaler.scale(loss).backward()
                 scaler.unscale_(opt)
+                # Grad clip after unscale — recurrent nets can spike
+                # gradients early in training; clipping keeps them sane.
                 nn.utils.clip_grad_norm_(mod.parameters(), grad_clip)
                 scaler.step(opt); scaler.update()
             else:
+                # CPU / fp32 path — no scaler bookkeeping needed.
                 loss = loss_fn(mod(xb), yb)
                 loss.backward()
                 nn.utils.clip_grad_norm_(mod.parameters(), grad_clip)
                 opt.step()
             step += 1
 
-        # Macro-F1 on val (not accuracy: classes are imbalanced)
+        # Track the held-out-by-host macro-F1, NOT accuracy. With
+        # imbalanced classes a constant predictor can hit 0.5
+        # accuracy; macro-F1 averages per-class F1, so the rare
+        # phases actually count.
         f1 = _macro_f1(y_val, _predict(mod, val_dl), n_classes)
         if f1 > best_f1 + 1e-4:
+            # New best — snapshot the weights. Cheaper than checkpointing
+            # to disk every epoch since we only need the final winner.
             best_f1, best_state, no_improve = f1, mod.state_dict(), 0
         else:
+            # No improvement; tick the patience counter.
             no_improve += 1
             if no_improve >= patience:
-                break    # early stop
+                break    # early stop — saves an A100-hour or two
 
+    # Restore the best-on-val weights. The last epoch's weights are
+    # almost always worse than the best — overfit creep on train.
     mod.load_state_dict(best_state)
     return TrainResult(best_f1=best_f1, best_state=best_state, ...)
 `;
diff --git a/training/dashboard/static/index.html b/training/dashboard/static/index.html
index db4d05d..d6cc0c3 100644
--- a/training/dashboard/static/index.html
+++ b/training/dashboard/static/index.html
@@ -576,6 +576,6 @@
     </article>
   </div>
 
-  <script src="/static/dashboard.js?v=15fac426"></script>
+  <script src="/static/dashboard.js?v=a33c0771"></script>
 </body>
 </html>