The model layer of the project, built honestly:
- tools/dataset_validate.py — full-sweep validator over the receiver
store (sha256, schema, monotonic labels, telemetry-row gate). On the
current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
is committed as the per-episode acceptance index.
- training/_features.py — channel registry (46 channels across
proc/guest/qmp/netflow), summary-stat windowing AND channel×time
tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
(Unix ns) — tested fix for a real netflow-vs-host clock-base
inconsistency that was silently dropping every netflow channel.
- training/_split.py — three held-out recipes (host / sample / time)
with profile-stratification assertions. held_out_host carries
untested_profiles for cases like scan-and-dial absent from the test
host (5 of 6 profiles tested cross-device, never silently averaged).
- training/models/ — 6 architectures behind a common BaseModel
interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
trained twice (realistic / oracle) per the deployment threat model.
Schema-hashed checkpoints refuse to load if _features.py changed
since training (silent-input-drift protection, tested).
- training/trainer/ — unified training loop: class-weighted CE, LR
warmup + cosine, gradient clipping, mixed precision when CUDA,
early stopping on val macro F1, best-on-val checkpoint. Same loop
runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
early_stopping_rounds on val mlogloss.
- training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
per-profile and per-host breakdown, paired-bootstrap significance
for model-vs-model gap. Confusion matrix uses union of seen labels.
- training/dashboard/producers/ — replay/metrics/perf/profiles
emitting the six event types the dashboard's awaiting scenes
consume; on-demand tensor extraction so the Pi can run live
inference without 65 GB of shards.
- 17 unit tests (split coverage, features round-trip, schema mismatch,
determinism, time-base alignment regression).
End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
103 lines
3.6 KiB
Python
103 lines
3.6 KiB
Python
"""Loader + scoring helpers for trained models, dashboard side.
|
|
|
|
Replaces the original ad-hoc loader. Every checkpoint goes through
|
|
``training.models._checkpoint.load_checkpoint`` which verifies the
|
|
schema hash matches the live ``_features.py`` registry. If the
|
|
training-time schema doesn't match, the loader raises rather than
|
|
silently feeding mis-aligned columns to the model — that's the entire
|
|
point of the checkpoint format.
|
|
|
|
Discovery: any ``*.ckpt.json`` under ``artifacts/`` is a candidate.
|
|
We sort by ``(name, mode)`` so producers can iterate deterministically.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
|
|
from training.models import BaseModel
|
|
from training.models._checkpoint import load_checkpoint, load_header
|
|
|
|
|
|
log = logging.getLogger("cis490.dashboard.producers._models")
|
|
|
|
|
|
def discover_checkpoints(artifacts_dir: Path) -> list[Path]:
    """Return the checkpoint JSON files directly inside ``artifacts_dir``.

    Only immediate children matching ``*.ckpt.json`` are considered (the
    glob is not recursive). The result is sorted by path so callers get
    a deterministic iteration order.
    """
    root = Path(artifacts_dir)
    candidates = list(root.glob("*.ckpt.json"))
    candidates.sort()
    return candidates
|
|
|
|
|
|
def load_models(artifacts_dir: Path, *, device: str = "auto"
                ) -> list[BaseModel]:
    """Load every discoverable checkpoint, best-effort.

    Each path from ``discover_checkpoints`` is passed to
    ``load_checkpoint``. Any exception raised while loading one
    checkpoint (a schema-hash mismatch being the expected case) is
    logged as a warning and that checkpoint is skipped; the remaining
    checkpoints are still attempted.

    Args:
        artifacts_dir: directory to scan for ``*.ckpt.json`` files.
        device: forwarded unchanged to ``load_checkpoint``.

    Returns:
        The successfully loaded models, in discovery order.
    """
    loaded: list[BaseModel] = []
    for ckpt_path in discover_checkpoints(artifacts_dir):
        try:
            model = load_checkpoint(ckpt_path, device=device)
            loaded.append(model)
            log.info("loaded %s (kind=%s)", ckpt_path.name, model.input_kind)
        except Exception as exc:
            # Deliberate best-effort: one bad checkpoint must not prevent
            # the others from loading.
            log.warning("skipping %s: %s", ckpt_path.name, exc)
            continue
    return loaded
|
|
|
|
|
|
def model_display_name(m: BaseModel) -> str:
    """Return the model's name for dashboard event payloads.

    This is the object's ``__model_name__`` attribute, or ``"model"``
    when the attribute is absent.

    NOTE: no mode suffix is appended here. The mode is recorded in the
    checkpoint's JSON header rather than on the model object, and
    inferring it from the keep-mask would be fragile, so callers that
    need a combined label should take the mode from the header.
    """
    return getattr(m, "__model_name__", "model")
|
|
|
|
|
|
def headers_for(artifacts_dir: Path) -> list[dict]:
    """Return the header of every checkpoint found in ``artifacts_dir``.

    Headers are read with ``load_header`` and returned in the same order
    that ``discover_checkpoints`` yields the paths.
    """
    headers: list[dict] = []
    for ckpt_path in discover_checkpoints(artifacts_dir):
        headers.append(load_header(ckpt_path))
    return headers
|
|
|
|
|
|
def latency_us(model: BaseModel, X_one: np.ndarray, *, n_iter: int = 200,
               warmup: int = 20) -> float:
    """Median microseconds per ``predict_proba`` call on a single window.

    Only the first row of ``X_one`` is timed (``X_one[:1]``).

    ``X_one`` shape:
      - summary: (1, F)
      - tensor:  (1, C, T)

    Args:
        model: object exposing ``predict_proba``.
        X_one: input array; its first row is the window being timed.
        n_iter: number of timed calls; must be at least 1.
        warmup: number of untimed calls made before measuring.

    Returns:
        The median wall-clock duration of the timed calls, in
        microseconds.

    Raises:
        ValueError: if ``n_iter`` is less than 1 (the median of zero
            samples would otherwise be NaN).
    """
    if n_iter < 1:
        raise ValueError(f"n_iter must be >= 1, got {n_iter}")

    # Slice once, outside the timed region, so the measurement covers the
    # predict_proba call only and not the array slicing.
    window = X_one[:1]

    # Warm up (results discarded).
    for _ in range(warmup):
        model.predict_proba(window)

    samples_us = []
    for _ in range(n_iter):
        t0 = time.perf_counter_ns()
        model.predict_proba(window)
        samples_us.append((time.perf_counter_ns() - t0) / 1000.0)
    return float(np.median(samples_us))
|
|
|
|
|
|
def latency_us_batched(model: BaseModel, X: np.ndarray, *,
                       batch_sizes: tuple[int, ...] = (1, 8, 64, 512),
                       n_iter: int = 200, warmup: int = 20
                       ) -> dict[int, float]:
    """Per-batch-size median microseconds of a ``predict_proba`` call.

    Reports both single-window (worst case) and production-batch (best
    case) numbers — single-window timing alone is misleading because
    Python overhead dominates.

    Args:
        model: object exposing ``predict_proba``.
        X: input array; the first ``bs`` rows are used for batch size
            ``bs``.
        batch_sizes: batch sizes to measure. Any size larger than
            ``X.shape[0]`` is skipped and is absent from the result.
        n_iter: number of timed calls per batch size; must be at least 1.
        warmup: number of untimed calls made before measuring each
            batch size.

    Returns:
        Mapping of batch size to the median duration, in microseconds,
        of one call on the whole batch.

    Raises:
        ValueError: if ``n_iter`` is less than 1 (every median would
            otherwise be NaN).
    """
    if n_iter < 1:
        raise ValueError(f"n_iter must be >= 1, got {n_iter}")

    medians: dict[int, float] = {}
    for bs in batch_sizes:
        # Not enough rows to build this batch: skip rather than time a
        # smaller batch under the wrong label.
        if bs > X.shape[0]:
            continue
        batch = X[:bs]

        # Warm up (results discarded).
        for _ in range(warmup):
            model.predict_proba(batch)

        samples_us = []
        for _ in range(n_iter):
            t0 = time.perf_counter_ns()
            model.predict_proba(batch)
            samples_us.append((time.perf_counter_ns() - t0) / 1000.0)
        medians[bs] = float(np.median(samples_us))
    return medians
|