code cards: presenter-friendly comments on every block

The four code snippets shown on stack and training-code scenes get
inline comments explaining the *why* of each line, not just *what*.
Aimed at the live audience: a presenter reads the comment as the
narration; a reader scans them top-to-bottom for the design story.

Covers: pyproject's three install profiles and what each library
contributes; receiver's bearer auth and why constant-time compare
matters; LSTM model's registry pattern, batch_first transpose,
last-step classification head; trainer loop's class weights vs the
imbalanced dataset, AMP scaler vs fp16 underflow, cosine + warmup
schedule, macro-F1 vs accuracy on imbalanced classes, best-state
restore vs last-epoch weights.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Max Gorog 2026-05-08 14:17:31 -05:00
parent da0e9ce83c
commit 4bf241f6ec
2 changed files with 94 additions and 29 deletions

View file

@ -845,34 +845,43 @@
// every dep annotated" stance to the audience without making them
// open a terminal.
(function () {
const PYPROJECT = `[project]
const PYPROJECT = `# Single project, three install profiles. The base "dependencies"
# list is what every host needs (the receiver, the orchestrator,
# the dashboard); training and dev pull in heavier tooling on demand.
[project]
name = "cis490"
version = "0.0.1"
description = "CIS490 behavioral malware detection — dataset, transport, training"
requires-python = ">=3.11"
# Runtime: HTTP receiver + orchestrator + image build.
dependencies = [
"starlette>=0.36",
"uvicorn[standard]>=0.27",
"msgpack>=1.0", # MSF RPC wire format for the Tier-3 exploit driver
"pycdlib>=1.14", # build NoCloud cidata ISOs in pure Python
"starlette>=0.36", # ASGI app for the receiver and dashboard
"uvicorn[standard]>=0.27", # production-grade ASGI server
"msgpack>=1.0", # MSF RPC wire format (Tier-3 exploit driver)
"pycdlib>=1.14", # build NoCloud cidata ISOs in pure Python
]
[dependency-groups]
# Pulled in only when training. Kept off the receiver Pi.
training = [
"pyarrow>=15", "polars>=1.0",
"pyarrow>=15", "polars>=1.0", # columnar dataset I/O
"numpy>=1.26", "scipy>=1.11",
"scikit-learn>=1.4", # KNN, KMeans, PCA, metrics
"xgboost>=2.0", # gradient-boosted trees baseline
"torch>=2.2", # LSTM / GRU / RNN / CNN / Transformer
"zstandard>=0.22", # episode tarball streaming
"scikit-learn>=1.4", # KNN, KMeans, PCA, metrics
"xgboost>=2.0", # gradient-boosted trees baseline
"torch>=2.2", # LSTM / GRU / RNN / CNN / Transformer
"zstandard>=0.22", # streams episode tarballs without buffering
]
dev = [
"pytest>=8", "pytest-asyncio>=0.23",
"httpx>=0.27", "paramiko>=3",
"matplotlib>=3.8", "tornado>=6",
"pytest>=8", "pytest-asyncio>=0.23", # async-aware test runner
"httpx>=0.27", "paramiko>=3", # in-guest HTTP / SSH for tests
"matplotlib>=3.8", "tornado>=6", # plotting (training reports)
]
`;
const RECEIVER = `from __future__ import annotations
const RECEIVER = `# The receiver is the public-facing endpoint that ingests episode
# tarballs from fleet hosts. Starlette ASGI for the HTTP surface;
# everything else is intentionally stdlib.
from __future__ import annotations
import json, logging, secrets, time
from pathlib import Path
@ -881,23 +890,29 @@ from starlette.applications import Starlette
from starlette.responses import JSONResponse, Response
from starlette.routing import Route
# Per-host episodes get streamed onto disk by the EpisodeStore;
# version_gate rejects schemas the analysis pipeline can't read.
from .store import EpisodeStore, is_valid_id
from .version_gate import VersionGate
log = logging.getLogger("cis490.receiver")
SUFFIX = ".tar.zst"
SCHEMA_VERSION = 1
SUFFIX = ".tar.zst" # zstd-compressed tar — what the fleet ships
SCHEMA_VERSION = 1 # bumped if the on-disk format changes
# Authenticate every upload with a shared bearer token. The
# constant-time compare matters: a naive == leaks token length and
# byte-by-byte progress through timing, which a careful attacker
# can use to recover the secret one character at a time.
def _bearer_check(request, expected):
if expected is None:
return None
return None # auth disabled (dev mode)
auth = request.headers.get("authorization", "")
if not auth.startswith("Bearer "):
return JSONResponse({"error": "missing bearer token"}, status_code=401)
presented = auth[len("Bearer "):]
if not secrets.compare_digest(presented, expected):
return JSONResponse({"error": "bad bearer token"}, status_code=401)
return None
return None # auth ok — caller proceeds
`;
const PY_KEYWORDS = new Set([
@ -1002,12 +1017,20 @@ can speak to the cell-choice question."""
from __future__ import annotations
from torch import nn
# The registry lets the trainer pick a model by string name from
# the training manifest. _SeqBase handles the shared bookkeeping
# (feature selection, standardization, checkpoint I/O) so each
# model class only writes its architecture.
from training.models import register
from training.models._torch_seq import _SeqBase
@register("lstm")
class LSTM(_SeqBase):
# _build_module is called once per training run with shapes
# derived from the actual dataset, not hardcoded constants —
# so the same model class works at any window length / channel
# count. Defaults reflect what produced the leaderboard numbers.
def _build_module(self, *, n_channels_in, n_timesteps,
n_classes, hidden=128, n_layers=2,
dropout=0.1, bidirectional=False):
@ -1018,87 +1041,129 @@ class LSTM(_SeqBase):
)
# Plain PyTorch module; the wrapper above is what the rest of the
# pipeline talks to. Splitting them keeps the model architecture
# pure-torch and easy to inspect / swap.
class _LSTMClassifier(nn.Module):
def __init__(self, *, n_channels_in, n_classes, hidden,
n_layers, dropout, bidirectional):
super().__init__()
# batch_first=True so the LSTM consumes (batch, time,
# channels); forward() transposes the dataset's (batch,
# channels, time) tensors into that layout. Stacking layers
# with dropout-between is only meaningful when n_layers > 1.
self.lstm = nn.LSTM(
input_size=n_channels_in, hidden_size=hidden,
num_layers=n_layers,
dropout=dropout if n_layers > 1 else 0.0,
batch_first=True, bidirectional=bidirectional,
)
# Bidirectional LSTMs concat forward + backward states, so
# the head sees 2× hidden when that flag is on.
d_out = hidden * (2 if bidirectional else 1)
# Dropout before the linear head is a cheap regularizer
# without changing the LSTM's own behaviour.
self.head = nn.Sequential(
nn.Dropout(dropout),
nn.Linear(d_out, n_classes),
)
# Dataset gives (batch, channels, time). Transpose to put time
# in the middle so PyTorch's batch_first LSTM accepts it.
def forward(self, x): # (B, C, T) -> (B, T, C)
x = x.transpose(1, 2)
out, _ = self.lstm(x)
return self.head(out[:, -1, :]) # last-step classification
out, _ = self.lstm(x) # out: (B, T, hidden*dir)
# Use the last timestep's hidden state for classification —
# by then the LSTM has integrated the whole window.
return self.head(out[:, -1, :])
`;
const TRAIN_LOOP = `def train_nn(*, model, X_train, y_train, X_val, y_val,
const TRAIN_LOOP = `# One generic loop runs every neural model. The model class only
# defines architecture; this loop owns the optimizer, learning-rate
# schedule, mixed precision, gradient clipping, and the early-stop
# bookkeeping. Same code trains LSTM, GRU, CNN, Transformer.
def train_nn(*, model, X_train, y_train, X_val, y_val,
n_classes, epochs=60, batch_size=512,
base_lr=1e-3, weight_decay=1e-4,
warmup_frac=0.05, grad_clip=1.0,
patience=8, device="auto") -> TrainResult:
"""Train a model; return TrainResult with the best-on-val
state_dict already loaded back into model.module."""
# Auto-pick CUDA when present so the same script runs on the
# Pi (CPU) and the A100 (GPU + AMP) without code changes.
if device == "auto":
device = "cuda" if torch.cuda.is_available() else "cpu"
use_amp = device == "cuda"
mod = model.module.to(device)
# Inverse-frequency class weights (capped) — clean dominates
# the dataset, so unweighted CE just learns "everything is fine."
# Inverse-frequency class weights (capped). The dataset is
# ~50% infected_running and only ~5% armed — without weighting,
# CE happily ignores the rare classes and reports "good"
# accuracy by predicting the majority class for everything.
cw = _compute_class_weights(y_train, n_classes)
loss_fn = nn.CrossEntropyLoss(
weight=torch.from_numpy(cw).to(device))
# AdamW = Adam with decoupled weight decay; cleaner regularisation
# than L2-in-the-loss for transformers and recurrent nets.
opt = torch.optim.AdamW(mod.parameters(), lr=base_lr,
weight_decay=weight_decay)
# GradScaler supports mixed-precision training on CUDA: under
# autocast many ops run in fp16 (faster, less memory), and the
# scaler scales the loss so small gradients don't underflow to zero.
scaler = torch.amp.GradScaler("cuda") if use_amp else None
best_f1, best_state, no_improve = -1.0, None, 0
step, total_steps = 0, epochs * len(train_dl)
warmup = int(total_steps * warmup_frac)
warmup = int(total_steps * warmup_frac) # 5% of total = warmup
for ep in range(1, epochs + 1):
mod.train()
for xb, yb in train_dl:
xb, yb = xb.to(device), yb.to(device)
# Cosine LR with linear warmup
# Cosine schedule with a linear warmup. Warmup avoids
# the early-training "loss explodes from a fresh AdamW"
# problem; cosine then anneals smoothly toward zero.
for g in opt.param_groups:
g["lr"] = _cosine_lr(step,
total_steps=total_steps,
warmup_steps=warmup, base_lr=base_lr)
opt.zero_grad(set_to_none=True)
opt.zero_grad(set_to_none=True) # cheaper than zero_()
if use_amp:
# AMP path: forward in autocast, scaler handles
# backward + step so fp16 grads don't underflow.
with torch.amp.autocast("cuda"):
loss = loss_fn(mod(xb), yb)
scaler.scale(loss).backward()
scaler.unscale_(opt)
# Grad clip after unscale — recurrent nets can spike
# gradients early in training; clipping keeps them sane.
nn.utils.clip_grad_norm_(mod.parameters(), grad_clip)
scaler.step(opt); scaler.update()
else:
# CPU / fp32 path — no scaler bookkeeping needed.
loss = loss_fn(mod(xb), yb)
loss.backward()
nn.utils.clip_grad_norm_(mod.parameters(), grad_clip)
opt.step()
step += 1
# Macro-F1 on val (not accuracy: classes are imbalanced)
# Track the held-out-by-host macro-F1, NOT accuracy. With
# imbalanced classes a constant predictor can hit 0.5
# accuracy; macro-F1 averages per-class F1, so the rare
# phases actually count.
f1 = _macro_f1(y_val, _predict(mod, val_dl), n_classes)
if f1 > best_f1 + 1e-4:
# New best — snapshot the weights. Cheaper than checkpointing
# to disk every epoch since we only need the final winner.
best_f1, best_state, no_improve = f1, mod.state_dict(), 0
else:
# No improvement; tick the patience counter.
no_improve += 1
if no_improve >= patience:
break # early stop
break # early stop — saves an A100-hour or two
# Restore the best-on-val weights. The last epoch's weights are
# almost always worse than the best — overfit creep on train.
mod.load_state_dict(best_state)
return TrainResult(best_f1=best_f1, best_state=best_state, ...)
`;

View file

@ -576,6 +576,6 @@
</article>
</div>
<script src="/static/dashboard.js?v=15fac426"></script>
<script src="/static/dashboard.js?v=a33c0771"></script>
</body>
</html>