diff --git a/training/dashboard/static/dashboard.js b/training/dashboard/static/dashboard.js index 6e5aea5..9f315cf 100644 --- a/training/dashboard/static/dashboard.js +++ b/training/dashboard/static/dashboard.js @@ -847,6 +847,7 @@ (function () { const PYPROJECT = `[project] name = "cis490" +version = "0.0.1" description = "CIS490 behavioral malware detection — dataset, transport, training" requires-python = ">=3.11" dependencies = [ @@ -857,24 +858,26 @@ dependencies = [ ] [dependency-groups] +training = [ + "pyarrow>=15", "polars>=1.0", + "numpy>=1.26", "scipy>=1.11", + "scikit-learn>=1.4", # KNN, KMeans, PCA, metrics + "xgboost>=2.0", # gradient-boosted trees baseline + "torch>=2.2", # LSTM / GRU / RNN / CNN / Transformer + "zstandard>=0.22", # episode tarball streaming +] dev = [ - "pytest>=8", - "pytest-asyncio>=0.23", - "httpx>=0.27", - "paramiko>=3", # SSH client for in-guest control on images that support it + "pytest>=8", "pytest-asyncio>=0.23", + "httpx>=0.27", "paramiko>=3", + "matplotlib>=3.8", "tornado>=6", ] `; const RECEIVER = `from __future__ import annotations -import json -import logging -import secrets -import time +import json, logging, secrets, time from pathlib import Path -from typing import Awaitable, Callable from starlette.applications import Starlette -from starlette.requests import Request from starlette.responses import JSONResponse, Response from starlette.routing import Route @@ -882,6 +885,19 @@ from .store import EpisodeStore, is_valid_id from .version_gate import VersionGate log = logging.getLogger("cis490.receiver") +SUFFIX = ".tar.zst" +SCHEMA_VERSION = 1 + +def _bearer_check(request, expected): + if expected is None: + return None + auth = request.headers.get("authorization", "") + if not auth.startswith("Bearer "): + return JSONResponse({"error": "missing bearer token"}, status_code=401) + presented = auth[len("Bearer "):] + if not secrets.compare_digest(presented, expected): + return JSONResponse({"error": "bad bearer token"}, status_code=401) + return None `; const PY_KEYWORDS = new Set([ @@ -978,42 +994,119 @@ log = logging.getLogger("cis490.receiver") }).join('\n'); } - const TRAINER = `"""Train PhaseLSTM on the windowed dataset. + const TRAINER = `"""Long Short-Term Memory over channel × time windows. -Each window is 10 s of /proc telemetry (100 samples × 12 channels) -labeled with the phase that occupies its center. The LSTM reads the -window timestep-by-timestep and predicts a single phase. - -Held-out *samples* — not held-out time slices — are the bar that -matters. Generalization to malware the model has never seen is the -whole reason this dataset exists. -""" +Same input/output as GRU, swap the cell. ~30% more parameters than +the GRU at the same hidden size; included so the comparison report +can speak to the cell-choice question.""" from __future__ import annotations +from torch import nn -import torch -import torch.nn as nn -from torch.utils.data import DataLoader +from training.models import register +from training.models._torch_seq import _SeqBase -from training.data.windows import WindowedEpisodes -from training.models.lstm import PhaseLSTM -ds = WindowedEpisodes("train", window_s=10, hz=10) -loader = DataLoader(ds, batch_size=128, shuffle=True) -model = PhaseLSTM(channels=12, hidden=64, num_phases=5).cuda() -optim = torch.optim.AdamW(model.parameters(), lr=3e-4) -loss_fn = nn.CrossEntropyLoss() +@register("lstm") +class LSTM(_SeqBase): + def _build_module(self, *, n_channels_in, n_timesteps, + n_classes, hidden=128, n_layers=2, + dropout=0.1, bidirectional=False): + return _LSTMClassifier( + n_channels_in=n_channels_in, n_classes=n_classes, + hidden=hidden, n_layers=n_layers, + dropout=dropout, bidirectional=bidirectional, + ) -for epoch in range(20): - for x, y in loader: - loss = loss_fn(model(x.cuda()), y.cuda()) - optim.zero_grad() - loss.backward() - optim.step() + +class _LSTMClassifier(nn.Module): + def __init__(self, *, n_channels_in, n_classes, hidden, + n_layers, dropout, bidirectional): + super().__init__() + self.lstm = nn.LSTM( + input_size=n_channels_in, hidden_size=hidden, + num_layers=n_layers, + dropout=dropout if n_layers > 1 else 0.0, + batch_first=True, bidirectional=bidirectional, + ) + d_out = hidden * (2 if bidirectional else 1) + self.head = nn.Sequential( + nn.Dropout(dropout), + nn.Linear(d_out, n_classes), + ) + + def forward(self, x): # (B, C, T) -> (B, T, C) + x = x.transpose(1, 2) + out, _ = self.lstm(x) + return self.head(out[:, -1, :]) # last-step classification `; - document.getElementById('code-pyproject').innerHTML = highlightToml(PYPROJECT); - document.getElementById('code-receiver').innerHTML = highlightPython(RECEIVER); - document.getElementById('code-train-lstm').innerHTML = highlightPython(TRAINER); + const TRAIN_LOOP = `def train_nn(*, model, X_train, y_train, X_val, y_val, + n_classes, epochs=60, batch_size=512, + base_lr=1e-3, weight_decay=1e-4, + warmup_frac=0.05, grad_clip=1.0, + patience=8, device="auto") -> TrainResult: + """Train a model; return TrainResult with the best-on-val + state_dict already loaded back into model.module.""" + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + use_amp = device == "cuda" + mod = model.module.to(device) + + # Inverse-frequency class weights (capped) — clean dominates + # the dataset, so unweighted CE just learns "everything is fine." + cw = _compute_class_weights(y_train, n_classes) + loss_fn = nn.CrossEntropyLoss( + weight=torch.from_numpy(cw).to(device)) + + opt = torch.optim.AdamW(mod.parameters(), lr=base_lr, + weight_decay=weight_decay) + scaler = torch.amp.GradScaler("cuda") if use_amp else None + + best_f1, best_state, no_improve = -1.0, None, 0 + step, total_steps = 0, epochs * len(train_dl) + warmup = int(total_steps * warmup_frac) + + for ep in range(1, epochs + 1): + mod.train() + for xb, yb in train_dl: + xb, yb = xb.to(device), yb.to(device) + # Cosine LR with linear warmup + for g in opt.param_groups: + g["lr"] = _cosine_lr(step, + total_steps=total_steps, + warmup_steps=warmup, base_lr=base_lr) + opt.zero_grad(set_to_none=True) + if use_amp: + with torch.amp.autocast("cuda"): + loss = loss_fn(mod(xb), yb) + scaler.scale(loss).backward() + scaler.unscale_(opt) + nn.utils.clip_grad_norm_(mod.parameters(), grad_clip) + scaler.step(opt); scaler.update() + else: + loss = loss_fn(mod(xb), yb) + loss.backward() + nn.utils.clip_grad_norm_(mod.parameters(), grad_clip) + opt.step() + step += 1 + + # Macro-F1 on val (not accuracy: classes are imbalanced) + f1 = _macro_f1(y_val, _predict(mod, val_dl), n_classes) + if f1 > best_f1 + 1e-4: + best_f1, best_state, no_improve = f1, mod.state_dict(), 0 + else: + no_improve += 1 + if no_improve >= patience: + break # early stop + + mod.load_state_dict(best_state) + return TrainResult(best_f1=best_f1, best_state=best_state, ...) +`; + + document.getElementById('code-pyproject').innerHTML = highlightToml(PYPROJECT); + document.getElementById('code-receiver').innerHTML = highlightPython(RECEIVER); + document.getElementById('code-train-lstm').innerHTML = highlightPython(TRAINER); + document.getElementById('code-train-loop').innerHTML = highlightPython(TRAIN_LOOP); })(); // ── Ingest counter + 60-second sparkline ────────────────────── diff --git a/training/dashboard/static/index.html b/training/dashboard/static/index.html index 8c03322..db4d05d 100644 --- a/training/dashboard/static/index.html +++ b/training/dashboard/static/index.html @@ -286,9 +286,15 @@
how we trained the sequence models
-
-
training/models/lstm.py
-

+            
+
+
training/models/lstm.py
+

+              
+
+
training/trainer/_loop.py · train_nn
+

+              
@@ -570,6 +576,6 @@
- +