code cards: mirror the actual training stack and trainer loop

The stack scene's pyproject snippet was missing the `training` dependency group (torch, scikit-learn, xgboost, zstandard) — the libraries that do the actual model work. It is now updated to match the real pyproject.toml.

The receiver snippet now ends at `_bearer_check(...)` instead of stopping at the import block, which gives the slide a non-trivial piece of code to read.

The training-code scene replaces the toy "PhaseLSTM" hand-rolled loop with the real LSTM model class (a registry-decorated `_SeqBase` subclass plus `_LSTMClassifier`, which wraps `nn.LSTM` with a last-step classification head) and adds a second card showing the actual `train_nn` loop: AMP autocast/scaler, cosine LR with linear warmup, inverse-frequency class weights, gradient clipping, macro-F1 on val, and early stopping with best-state restore.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 14:15:01 -05:00 · 2026-05-08 14:15:01 -05:00 · da0e9ce83c
commit da0e9ce83c
parent c1c8e98180
2 changed files with 141 additions and 42 deletions
--- a/training/dashboard/static/dashboard.js
+++ b/training/dashboard/static/dashboard.js
@@ -847,6 +847,7 @@
  (function () {
    const PYPROJECT = `[project]
 name = "cis490"
+version = "0.0.1"
 description = "CIS490 behavioral malware detection — dataset, transport, training"
 requires-python = ">=3.11"
 dependencies = [
@@ -857,24 +858,26 @@ dependencies = [
 ]

 [dependency-groups]
+training = [
+    "pyarrow>=15", "polars>=1.0",
+    "numpy>=1.26", "scipy>=1.11",
+    "scikit-learn>=1.4",   # KNN, KMeans, PCA, metrics
+    "xgboost>=2.0",        # gradient-boosted trees baseline
+    "torch>=2.2",          # LSTM / GRU / RNN / CNN / Transformer
+    "zstandard>=0.22",     # episode tarball streaming
+]
 dev = [
-    "pytest>=8",
-    "pytest-asyncio>=0.23",
-    "httpx>=0.27",
-    "paramiko>=3",  # SSH client for in-guest control on images that support it
+    "pytest>=8", "pytest-asyncio>=0.23",
+    "httpx>=0.27", "paramiko>=3",
+    "matplotlib>=3.8", "tornado>=6",
 ]
 `;
    const RECEIVER = `from __future__ import annotations

-import json
-import logging
-import secrets
-import time
+import json, logging, secrets, time
 from pathlib import Path
-from typing import Awaitable, Callable

 from starlette.applications import Starlette
-from starlette.requests import Request
 from starlette.responses import JSONResponse, Response
 from starlette.routing import Route

@@ -882,6 +885,19 @@ from .store import EpisodeStore, is_valid_id
 from .version_gate import VersionGate

 log = logging.getLogger("cis490.receiver")
+SUFFIX = ".tar.zst"
+SCHEMA_VERSION = 1
+
+def _bearer_check(request, expected):
+    if expected is None:
+        return None
+    auth = request.headers.get("authorization", "")
+    if not auth.startswith("Bearer "):
+        return JSONResponse({"error": "missing bearer token"}, status_code=401)
+    presented = auth[len("Bearer "):]
+    if not secrets.compare_digest(presented, expected):
+        return JSONResponse({"error": "bad bearer token"}, status_code=401)
+    return None
 `;

    const PY_KEYWORDS = new Set([
@@ -978,42 +994,119 @@ log = logging.getLogger("cis490.receiver")
      }).join('\n');
    }

-    const TRAINER = `"""Train PhaseLSTM on the windowed dataset.
+    const TRAINER = `"""Long Short-Term Memory over channel × time windows.

-Each window is 10 s of /proc telemetry (100 samples × 12 channels)
-labeled with the phase that occupies its center. The LSTM reads the
-window timestep-by-timestep and predicts a single phase.
-
-Held-out *samples* — not held-out time slices — are the bar that
-matters. Generalization to malware the model has never seen is the
-whole reason this dataset exists.
-"""
+Same input/output as GRU, swap the cell. ~30% more parameters than
+the GRU at the same hidden size; included so the comparison report
+can speak to the cell-choice question."""
 from __future__ import annotations
+from torch import nn

-import torch
-import torch.nn as nn
-from torch.utils.data import DataLoader
+from training.models import register
+from training.models._torch_seq import _SeqBase

-from training.data.windows import WindowedEpisodes
-from training.models.lstm import PhaseLSTM

-ds      = WindowedEpisodes("train", window_s=10, hz=10)
-loader  = DataLoader(ds, batch_size=128, shuffle=True)
-model   = PhaseLSTM(channels=12, hidden=64, num_phases=5).cuda()
-optim   = torch.optim.AdamW(model.parameters(), lr=3e-4)
-loss_fn = nn.CrossEntropyLoss()
+@register("lstm")
+class LSTM(_SeqBase):
+    def _build_module(self, *, n_channels_in, n_timesteps,
+                      n_classes, hidden=128, n_layers=2,
+                      dropout=0.1, bidirectional=False):
+        return _LSTMClassifier(
+            n_channels_in=n_channels_in, n_classes=n_classes,
+            hidden=hidden, n_layers=n_layers,
+            dropout=dropout, bidirectional=bidirectional,
+        )

-for epoch in range(20):
-    for x, y in loader:
-        loss = loss_fn(model(x.cuda()), y.cuda())
-        optim.zero_grad()
-        loss.backward()
-        optim.step()
+
+class _LSTMClassifier(nn.Module):
+    def __init__(self, *, n_channels_in, n_classes, hidden,
+                 n_layers, dropout, bidirectional):
+        super().__init__()
+        self.lstm = nn.LSTM(
+            input_size=n_channels_in, hidden_size=hidden,
+            num_layers=n_layers,
+            dropout=dropout if n_layers > 1 else 0.0,
+            batch_first=True, bidirectional=bidirectional,
+        )
+        d_out = hidden * (2 if bidirectional else 1)
+        self.head = nn.Sequential(
+            nn.Dropout(dropout),
+            nn.Linear(d_out, n_classes),
+        )
+
+    def forward(self, x):                # (B, C, T) -> (B, T, C)
+        x = x.transpose(1, 2)
+        out, _ = self.lstm(x)
+        return self.head(out[:, -1, :])  # last-step classification
 `;

-    document.getElementById('code-pyproject').innerHTML  = highlightToml(PYPROJECT);
-    document.getElementById('code-receiver').innerHTML   = highlightPython(RECEIVER);
-    document.getElementById('code-train-lstm').innerHTML = highlightPython(TRAINER);
+    const TRAIN_LOOP = `def train_nn(*, model, X_train, y_train, X_val, y_val,
+             n_classes, epochs=60, batch_size=512,
+             base_lr=1e-3, weight_decay=1e-4,
+             warmup_frac=0.05, grad_clip=1.0,
+             patience=8, device="auto") -> TrainResult:
+    """Train a model; return TrainResult with the best-on-val
+    state_dict already loaded back into model.module."""
+    if device == "auto":
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    use_amp = device == "cuda"
+    mod = model.module.to(device)
+
+    # Inverse-frequency class weights (capped) — clean dominates
+    # the dataset, so unweighted CE just learns "everything is fine."
+    cw = _compute_class_weights(y_train, n_classes)
+    loss_fn = nn.CrossEntropyLoss(
+        weight=torch.from_numpy(cw).to(device))
+
+    opt = torch.optim.AdamW(mod.parameters(), lr=base_lr,
+                             weight_decay=weight_decay)
+    scaler = torch.amp.GradScaler("cuda") if use_amp else None
+
+    best_f1, best_state, no_improve = -1.0, None, 0
+    step, total_steps = 0, epochs * len(train_dl)
+    warmup = int(total_steps * warmup_frac)
+
+    for ep in range(1, epochs + 1):
+        mod.train()
+        for xb, yb in train_dl:
+            xb, yb = xb.to(device), yb.to(device)
+            # Cosine LR with linear warmup
+            for g in opt.param_groups:
+                g["lr"] = _cosine_lr(step,
+                    total_steps=total_steps,
+                    warmup_steps=warmup, base_lr=base_lr)
+            opt.zero_grad(set_to_none=True)
+            if use_amp:
+                with torch.amp.autocast("cuda"):
+                    loss = loss_fn(mod(xb), yb)
+                scaler.scale(loss).backward()
+                scaler.unscale_(opt)
+                nn.utils.clip_grad_norm_(mod.parameters(), grad_clip)
+                scaler.step(opt); scaler.update()
+            else:
+                loss = loss_fn(mod(xb), yb)
+                loss.backward()
+                nn.utils.clip_grad_norm_(mod.parameters(), grad_clip)
+                opt.step()
+            step += 1
+
+        # Macro-F1 on val (not accuracy: classes are imbalanced)
+        f1 = _macro_f1(y_val, _predict(mod, val_dl), n_classes)
+        if f1 > best_f1 + 1e-4:
+            best_f1, best_state, no_improve = f1, mod.state_dict(), 0
+        else:
+            no_improve += 1
+            if no_improve >= patience:
+                break    # early stop
+
+    mod.load_state_dict(best_state)
+    return TrainResult(best_f1=best_f1, best_state=best_state, ...)
+`;
+
+    document.getElementById('code-pyproject').innerHTML   = highlightToml(PYPROJECT);
+    document.getElementById('code-receiver').innerHTML    = highlightPython(RECEIVER);
+    document.getElementById('code-train-lstm').innerHTML  = highlightPython(TRAINER);
+    document.getElementById('code-train-loop').innerHTML  = highlightPython(TRAIN_LOOP);
  })();

  // ── Ingest counter + 60-second sparkline ──────────────────────
--- a/training/dashboard/static/index.html
+++ b/training/dashboard/static/index.html
@@ -286,9 +286,15 @@
        <div class="stage-view" data-view="training-code">
          <div class="metric-stack metric-stack-wide">
            <div class="metric-eyebrow">how we trained the sequence models</div>
-            <div class="code-card">
-              <div class="code-card-header">training/models/lstm.py</div>
-              <pre class="code" id="code-train-lstm"></pre>
+            <div class="code-grid">
+              <div class="code-card">
+                <div class="code-card-header">training/models/lstm.py</div>
+                <pre class="code" id="code-train-lstm"></pre>
+              </div>
+              <div class="code-card">
+                <div class="code-card-header">training/trainer/_loop.py · train_nn</div>
+                <pre class="code" id="code-train-loop"></pre>
+              </div>
            </div>
          </div>
        </div>
@@ -570,6 +576,6 @@
    </article>
  </div>

-  <script src="/static/dashboard.js?v=061aec1c"></script>
+  <script src="/static/dashboard.js?v=15fac426"></script>
 </body>
 </html>