code cards: mirror the actual training stack and trainer loop

The stack scene's pyproject snippet was missing the `training`
group (torch, sklearn, xgboost, zstandard) — the libraries that
do the actual model work. It now matches the real pyproject.toml.

The receiver snippet now ends at _bearer_check(...) instead of
stopping at the import block, which gives the slide a non-trivial
piece of code to read.

The training-code scene replaces the toy "PhaseLSTM" hand-rolled
loop with the real LSTM model class (a registry-decorated _SeqBase
subclass plus _LSTMClassifier, which wraps nn.LSTM with a last-step
classification head) and adds a second card showing the actual
train_nn loop: AMP autocast/scaler, cosine LR with linear warmup,
inverse-frequency class weights, gradient clipping, macro-F1
on the validation set, and early stopping with best-state restore.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Max Gorog 2026-05-08 14:15:01 -05:00
parent c1c8e98180
commit da0e9ce83c
2 changed files with 141 additions and 42 deletions

View file

@ -847,6 +847,7 @@
(function () {
const PYPROJECT = `[project]
name = "cis490"
version = "0.0.1"
description = "CIS490 behavioral malware detection — dataset, transport, training"
requires-python = ">=3.11"
dependencies = [
@ -857,24 +858,26 @@ dependencies = [
]
[dependency-groups]
training = [
"pyarrow>=15", "polars>=1.0",
"numpy>=1.26", "scipy>=1.11",
"scikit-learn>=1.4", # KNN, KMeans, PCA, metrics
"xgboost>=2.0", # gradient-boosted trees baseline
"torch>=2.2", # LSTM / GRU / RNN / CNN / Transformer
"zstandard>=0.22", # episode tarball streaming
]
dev = [
"pytest>=8",
"pytest-asyncio>=0.23",
"httpx>=0.27",
"paramiko>=3", # SSH client for in-guest control on images that support it
"pytest>=8", "pytest-asyncio>=0.23",
"httpx>=0.27", "paramiko>=3",
"matplotlib>=3.8", "tornado>=6",
]
`;
const RECEIVER = `from __future__ import annotations
import json
import logging
import secrets
import time
import json, logging, secrets, time
from pathlib import Path
from typing import Awaitable, Callable
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import JSONResponse, Response
from starlette.routing import Route
@ -882,6 +885,19 @@ from .store import EpisodeStore, is_valid_id
from .version_gate import VersionGate
log = logging.getLogger("cis490.receiver")
SUFFIX = ".tar.zst"
SCHEMA_VERSION = 1
def _bearer_check(request, expected):
if expected is None:
return None
auth = request.headers.get("authorization", "")
if not auth.startswith("Bearer "):
return JSONResponse({"error": "missing bearer token"}, status_code=401)
presented = auth[len("Bearer "):]
if not secrets.compare_digest(presented, expected):
return JSONResponse({"error": "bad bearer token"}, status_code=401)
return None
`;
const PY_KEYWORDS = new Set([
@ -978,42 +994,119 @@ log = logging.getLogger("cis490.receiver")
}).join('\n');
}
const TRAINER = `"""Train PhaseLSTM on the windowed dataset.
const TRAINER = `"""Long Short-Term Memory over channel × time windows.
Each window is 10 s of /proc telemetry (100 samples × 12 channels)
labeled with the phase that occupies its center. The LSTM reads the
window timestep-by-timestep and predicts a single phase.
Held-out *samples* not held-out time slices are the bar that
matters. Generalization to malware the model has never seen is the
whole reason this dataset exists.
"""
Same input/output as GRU, swap the cell. ~30% more parameters than
the GRU at the same hidden size; included so the comparison report
can speak to the cell-choice question."""
from __future__ import annotations
from torch import nn
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from training.models import register
from training.models._torch_seq import _SeqBase
from training.data.windows import WindowedEpisodes
from training.models.lstm import PhaseLSTM
ds = WindowedEpisodes("train", window_s=10, hz=10)
loader = DataLoader(ds, batch_size=128, shuffle=True)
model = PhaseLSTM(channels=12, hidden=64, num_phases=5).cuda()
optim = torch.optim.AdamW(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()
@register("lstm")
class LSTM(_SeqBase):
def _build_module(self, *, n_channels_in, n_timesteps,
n_classes, hidden=128, n_layers=2,
dropout=0.1, bidirectional=False):
return _LSTMClassifier(
n_channels_in=n_channels_in, n_classes=n_classes,
hidden=hidden, n_layers=n_layers,
dropout=dropout, bidirectional=bidirectional,
)
for epoch in range(20):
for x, y in loader:
loss = loss_fn(model(x.cuda()), y.cuda())
optim.zero_grad()
loss.backward()
optim.step()
class _LSTMClassifier(nn.Module):
def __init__(self, *, n_channels_in, n_classes, hidden,
n_layers, dropout, bidirectional):
super().__init__()
self.lstm = nn.LSTM(
input_size=n_channels_in, hidden_size=hidden,
num_layers=n_layers,
dropout=dropout if n_layers > 1 else 0.0,
batch_first=True, bidirectional=bidirectional,
)
d_out = hidden * (2 if bidirectional else 1)
self.head = nn.Sequential(
nn.Dropout(dropout),
nn.Linear(d_out, n_classes),
)
def forward(self, x): # (B, C, T) -> (B, T, C)
x = x.transpose(1, 2)
out, _ = self.lstm(x)
return self.head(out[:, -1, :]) # last-step classification
`;
document.getElementById('code-pyproject').innerHTML = highlightToml(PYPROJECT);
document.getElementById('code-receiver').innerHTML = highlightPython(RECEIVER);
document.getElementById('code-train-lstm').innerHTML = highlightPython(TRAINER);
const TRAIN_LOOP = `def train_nn(*, model, X_train, y_train, X_val, y_val,
n_classes, epochs=60, batch_size=512,
base_lr=1e-3, weight_decay=1e-4,
warmup_frac=0.05, grad_clip=1.0,
patience=8, device="auto") -> TrainResult:
"""Train a model; return TrainResult with the best-on-val
state_dict already loaded back into model.module."""
if device == "auto":
device = "cuda" if torch.cuda.is_available() else "cpu"
use_amp = device == "cuda"
mod = model.module.to(device)
# Inverse-frequency class weights (capped) clean dominates
# the dataset, so unweighted CE just learns "everything is fine."
cw = _compute_class_weights(y_train, n_classes)
loss_fn = nn.CrossEntropyLoss(
weight=torch.from_numpy(cw).to(device))
opt = torch.optim.AdamW(mod.parameters(), lr=base_lr,
weight_decay=weight_decay)
scaler = torch.amp.GradScaler("cuda") if use_amp else None
best_f1, best_state, no_improve = -1.0, None, 0
step, total_steps = 0, epochs * len(train_dl)
warmup = int(total_steps * warmup_frac)
for ep in range(1, epochs + 1):
mod.train()
for xb, yb in train_dl:
xb, yb = xb.to(device), yb.to(device)
# Cosine LR with linear warmup
for g in opt.param_groups:
g["lr"] = _cosine_lr(step,
total_steps=total_steps,
warmup_steps=warmup, base_lr=base_lr)
opt.zero_grad(set_to_none=True)
if use_amp:
with torch.amp.autocast("cuda"):
loss = loss_fn(mod(xb), yb)
scaler.scale(loss).backward()
scaler.unscale_(opt)
nn.utils.clip_grad_norm_(mod.parameters(), grad_clip)
scaler.step(opt); scaler.update()
else:
loss = loss_fn(mod(xb), yb)
loss.backward()
nn.utils.clip_grad_norm_(mod.parameters(), grad_clip)
opt.step()
step += 1
# Macro-F1 on val (not accuracy: classes are imbalanced)
f1 = _macro_f1(y_val, _predict(mod, val_dl), n_classes)
if f1 > best_f1 + 1e-4:
best_f1, best_state, no_improve = f1, mod.state_dict(), 0
else:
no_improve += 1
if no_improve >= patience:
break # early stop
mod.load_state_dict(best_state)
return TrainResult(best_f1=best_f1, best_state=best_state, ...)
`;
document.getElementById('code-pyproject').innerHTML = highlightToml(PYPROJECT);
document.getElementById('code-receiver').innerHTML = highlightPython(RECEIVER);
document.getElementById('code-train-lstm').innerHTML = highlightPython(TRAINER);
document.getElementById('code-train-loop').innerHTML = highlightPython(TRAIN_LOOP);
})();
// ── Ingest counter + 60-second sparkline ──────────────────────

View file

@ -286,9 +286,15 @@
<div class="stage-view" data-view="training-code">
<div class="metric-stack metric-stack-wide">
<div class="metric-eyebrow">how we trained the sequence models</div>
<div class="code-card">
<div class="code-card-header">training/models/lstm.py</div>
<pre class="code" id="code-train-lstm"></pre>
<div class="code-grid">
<div class="code-card">
<div class="code-card-header">training/models/lstm.py</div>
<pre class="code" id="code-train-lstm"></pre>
</div>
<div class="code-card">
<div class="code-card-header">training/trainer/_loop.py · train_nn</div>
<pre class="code" id="code-train-loop"></pre>
</div>
</div>
</div>
</div>
@ -570,6 +576,6 @@
</article>
</div>
<script src="/static/dashboard.js?v=061aec1c"></script>
<script src="/static/dashboard.js?v=15fac426"></script>
</body>
</html>