CIS490/training/eval_/_metrics.py
Max 1fabd4a246 training: validator, feature/tensor extractors, 6 supervised models, schema-hashed checkpoints, eval suite, dashboard producers
The model layer of the project, built honestly:

  - tools/dataset_validate.py — full-sweep validator over the receiver
    store (sha256, schema, monotonic labels, telemetry-row gate). On the
    current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
    7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
    is committed as the per-episode acceptance index.

  - training/_features.py — channel registry (46 channels across
    proc/guest/qmp/netflow), summary-stat windowing AND channel×time
    tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
    (Unix ns) — tested fix for a real netflow-vs-host clock-base
    inconsistency that was silently dropping every netflow channel.

  - training/_split.py — three held-out recipes (host / sample / time)
    with profile-stratification assertions. held_out_host carries
    untested_profiles for cases like scan-and-dial absent from the test
    host (5 of 6 profiles tested cross-device, never silently averaged).

  - training/models/ — 6 architectures behind a common BaseModel
    interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
    trained twice (realistic / oracle) per the deployment threat model.
    Schema-hashed checkpoints refuse to load if _features.py changed
    since training (silent-input-drift protection, tested).

  - training/trainer/ — unified training loop: class-weighted CE, LR
    warmup + cosine, gradient clipping, mixed precision when CUDA,
    early stopping on val macro F1, best-on-val checkpoint. Same loop
    runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
    early_stopping_rounds on val mlogloss.

  - training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
    per-profile and per-host breakdown, paired-bootstrap significance
    for model-vs-model gap. Confusion matrix uses union of seen labels.

  - training/dashboard/producers/ — replay/metrics/perf/profiles
    emitting the six event types the dashboard's awaiting scenes
    consume; on-demand tensor extraction so the Pi can run live
    inference without 65 GB of shards.

  - 17 unit tests (split coverage, features round-trip, schema mismatch,
    determinism, time-base alignment regression).

End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 01:19:00 -05:00

139 lines
5.1 KiB
Python

"""Metrics with bootstrap confidence intervals.
A test-set scalar reported as ``F1=0.873`` is dishonest — that's a point
estimate from one finite sample. The right honesty bar is ``F1=0.873 ±
0.012`` from N nonparametric bootstraps over the test windows.
For paired comparisons (model A vs model B on the same test set) we
use a *paired* bootstrap: resample row indices and apply the same
indices to both models' predictions. This controls for which test
windows happened to be hard.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Iterable
import numpy as np
@dataclass
class CI:
    """A point estimate with the lower and upper bounds of its interval.

    ``level`` records the confidence level the bounds were computed at;
    it defaults to 0.95.
    """

    # Value measured on the original (un-resampled) data.
    point: float
    # Lower bound of the interval.
    low: float
    # Upper bound of the interval.
    high: float
    # Confidence level the bounds correspond to.
    level: float = 0.95

    def fmt(self, digits: int = 3) -> str:
        """Render as ``point [low, high]`` using ``digits`` decimal places."""
        spec = f".{digits}f"
        point_s, low_s, high_s = (
            format(value, spec) for value in (self.point, self.low, self.high)
        )
        return f"{point_s} [{low_s}, {high_s}]"
def _f1(y_true: np.ndarray, y_pred: np.ndarray, k: int) -> float:
tp = int(((y_pred == k) & (y_true == k)).sum())
fp = int(((y_pred == k) & (y_true != k)).sum())
fn = int(((y_pred != k) & (y_true == k)).sum())
if tp == 0:
return 0.0
prec = tp / (tp + fp)
rec = tp / (tp + fn)
return 2 * prec * rec / (prec + rec)
def _macro_f1(y_true: np.ndarray, y_pred: np.ndarray, n_classes: int) -> float:
    """Unweighted mean of the per-class F1 over classes ``0 .. n_classes - 1``.

    Every class index in that range contributes one term with equal
    weight, whatever ``_f1`` returns for it.
    """
    per_class_scores = []
    for label in range(n_classes):
        per_class_scores.append(_f1(y_true, y_pred, label))
    return float(np.mean(per_class_scores))
def per_class_pr_f1(y_true: np.ndarray, y_pred: np.ndarray, n_classes: int
                    ) -> dict[int, dict[str, float]]:
    """Point-estimate precision, recall, F1 and support for every class.

    No confidence intervals are computed here. For each class index in
    ``0 .. n_classes - 1`` the result holds a dict with the keys
    ``"precision"``, ``"recall"``, ``"f1"`` and ``"support"``, where
    support is true positives plus false negatives. Any ratio whose
    denominator is zero is reported as 0.0.
    """
    report: dict[int, dict[str, float]] = {}
    for label in range(n_classes):
        true_pos = int(((y_pred == label) & (y_true == label)).sum())
        false_pos = int(((y_pred == label) & (y_true != label)).sum())
        false_neg = int(((y_pred != label) & (y_true == label)).sum())

        predicted = true_pos + false_pos
        support = true_pos + false_neg

        precision = true_pos / predicted if predicted else 0.0
        recall = true_pos / support if support else 0.0

        pr_sum = precision + recall
        if pr_sum:
            f1 = 2 * precision * recall / pr_sum
        else:
            f1 = 0.0

        report[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": int(support),
        }
    return report
def bootstrap_macro_f1(
    y_true: np.ndarray, y_pred: np.ndarray, n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> CI:
    """Percentile-bootstrap confidence interval for macro F1.

    Test rows are redrawn with replacement ``n_resamples`` times. Macro F1
    is recomputed on each draw, and the interval bounds are the
    ``(1 - level) / 2`` and ``1 - (1 - level) / 2`` quantiles of those
    scores. ``point`` is macro F1 on the original rows. ``seed`` seeds the
    resampling generator.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(y_true)
    observed = _macro_f1(y_true, y_pred, n_classes)

    scores = np.empty(n_resamples, dtype=np.float64)
    for draw in range(n_resamples):
        rows = generator.integers(0, n_rows, size=n_rows)
        scores[draw] = _macro_f1(y_true[rows], y_pred[rows], n_classes)

    tail = (1 - level) / 2
    lower, upper = np.quantile(scores, [tail, 1 - tail])
    return CI(point=observed, low=float(lower), high=float(upper), level=level)
def bootstrap_per_class_f1(
    y_true: np.ndarray, y_pred: np.ndarray, n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> dict[int, CI]:
    """Percentile-bootstrap confidence interval for each class's F1.

    On every draw the test rows are resampled with replacement once, and
    that single resample is scored for all classes.

    Args:
        y_true: true class ids, one per test row.
        y_pred: predicted class ids, aligned with ``y_true`` by position.
        n_classes: classes ``0 .. n_classes - 1`` each get an entry.
        n_resamples: number of bootstrap draws.
        level: confidence level of the reported interval.
        seed: seed for the resampling generator.

    Returns:
        Mapping from class index to a ``CI`` whose ``point`` is the F1 on
        the original rows and whose ``low``/``high`` are the
        ``(1 - level) / 2`` and ``1 - (1 - level) / 2`` quantiles of the
        resampled F1 scores for that class.
    """
    rng = np.random.default_rng(seed)
    n = len(y_true)
    resampled: dict[int, list[float]] = {k: [] for k in range(n_classes)}
    for _ in range(n_resamples):
        idx = rng.integers(0, n, size=n)
        # Index once per draw: integer-array indexing copies the data, so
        # doing it inside the class loop would repeat the same two copies
        # for every class.
        true_draw = y_true[idx]
        pred_draw = y_pred[idx]
        for k in range(n_classes):
            resampled[k].append(_f1(true_draw, pred_draw, k))

    tail = (1 - level) / 2
    cis: dict[int, CI] = {}
    for k in range(n_classes):
        scores = np.asarray(resampled[k])
        cis[k] = CI(
            point=_f1(y_true, y_pred, k),
            low=float(np.quantile(scores, tail)),
            high=float(np.quantile(scores, 1 - tail)),
            level=level,
        )
    return cis
def paired_bootstrap_macro_f1_diff(
    y_true: np.ndarray,
    y_pred_a: np.ndarray, y_pred_b: np.ndarray,
    n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> CI:
    """Paired bootstrap of ``macro_f1(A) - macro_f1(B)``.

    Each draw resamples row indices once and applies those same indices
    to the labels and to both models' predictions, so the difficulty of
    the drawn rows affects both scores alike. If the resulting interval
    excludes 0, the difference is significant at ``level``.

    Args:
        y_true: true class ids, one per test row.
        y_pred_a: model A's predictions, aligned with ``y_true``.
        y_pred_b: model B's predictions, aligned with ``y_true``.
        n_classes: macro F1 averages over classes ``0 .. n_classes - 1``.
        n_resamples: number of bootstrap draws.
        level: confidence level of the reported interval.
        seed: seed for the resampling generator.

    Returns:
        ``CI`` whose ``point`` is the difference on the original rows and
        whose ``low``/``high`` are the ``(1 - level) / 2`` and
        ``1 - (1 - level) / 2`` quantiles of the resampled differences.
    """
    rng = np.random.default_rng(seed)
    n = len(y_true)
    diffs = np.empty(n_resamples, dtype=np.float64)
    for i in range(n_resamples):
        idx = rng.integers(0, n, size=n)
        # One copy of the resampled labels serves both models.
        true_draw = y_true[idx]
        score_a = _macro_f1(true_draw, y_pred_a[idx], n_classes)
        score_b = _macro_f1(true_draw, y_pred_b[idx], n_classes)
        diffs[i] = score_a - score_b

    tail = (1 - level) / 2
    lo, hi = np.quantile(diffs, [tail, 1 - tail])
    observed = (_macro_f1(y_true, y_pred_a, n_classes)
                - _macro_f1(y_true, y_pred_b, n_classes))
    return CI(point=observed, low=float(lo), high=float(hi), level=level)
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray,
                     n_classes: int) -> np.ndarray:
    """Count (true, predicted) label pairs into a square matrix.

    Rows and columns share the label set ``0 .. n_classes - 1``, so the
    shape does not depend on which classes happen to occur on either side.

    Args:
        y_true: integer class ids, one per test row.
        y_pred: predicted integer class ids, paired with ``y_true`` by
            position.
        n_classes: size of the label set.

    Returns:
        ``(n_classes, n_classes)`` ``int64`` array in which ``cm[t, p]`` is
        the number of pairs with true label ``t`` and predicted label ``p``.
        Pairs where either label falls outside ``[0, n_classes)`` are not
        counted. If the inputs differ in length, only the leading pairs up
        to the shorter length are used.
    """
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)

    # Non-array iterables are accepted too; go through a list so anything
    # iterable converts to a 1-D array.
    true_arr = y_true if isinstance(y_true, np.ndarray) else np.asarray(list(y_true))
    pred_arr = y_pred if isinstance(y_pred, np.ndarray) else np.asarray(list(y_pred))

    # Pair positions up to the shorter input, as zip() would.
    n_pairs = min(len(true_arr), len(pred_arr))
    true_arr = true_arr[:n_pairs]
    pred_arr = pred_arr[:n_pairs]

    in_range = (
        (true_arr >= 0) & (true_arr < n_classes)
        & (pred_arr >= 0) & (pred_arr < n_classes)
    )
    if in_range.any():
        # ufunc.at is unbuffered, so a (t, p) pair that occurs several
        # times is counted once per occurrence; plain ``cm[t, p] += 1``
        # with index arrays would count it only once.
        np.add.at(cm, (true_arr[in_range], pred_arr[in_range]), 1)
    return cm