The model layer of the project, built honestly:
- tools/dataset_validate.py — full-sweep validator over the receiver
store (sha256, schema, monotonic labels, telemetry-row gate). On the
current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
is committed as the per-episode acceptance index.
- training/_features.py — channel registry (46 channels across
proc/guest/qmp/netflow), summary-stat windowing AND channel×time
tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
(Unix ns) — tested fix for a real netflow-vs-host clock-base
inconsistency that was silently dropping every netflow channel.
- training/_split.py — three held-out recipes (host / sample / time)
with profile-stratification assertions. held_out_host carries
untested_profiles for cases like scan-and-dial absent from the test
host (5 of 6 profiles tested cross-device, never silently averaged).
- training/models/ — 6 architectures behind a common BaseModel
interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
trained twice (realistic / oracle) per the deployment threat model.
Schema-hashed checkpoints refuse to load if _features.py changed
since training (silent-input-drift protection, tested).
- training/trainer/ — unified training loop: class-weighted CE, LR
warmup + cosine, gradient clipping, mixed precision when CUDA,
early stopping on val macro F1, best-on-val checkpoint. Same loop
runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
early_stopping_rounds on val mlogloss.
- training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
per-profile and per-host breakdown, paired-bootstrap significance
for model-vs-model gap. Confusion matrix uses union of seen labels.
- training/dashboard/producers/ — replay/metrics/perf/profiles
emitting the six event types the dashboard's awaiting scenes
consume; on-demand tensor extraction so the Pi can run live
inference without 65 GB of shards.
- 17 unit tests (split coverage, features round-trip, schema mismatch,
determinism, time-base alignment regression).
End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
139 lines
5.1 KiB
Python
139 lines
5.1 KiB
Python
"""Metrics with bootstrap confidence intervals.
|
|
|
|
A test-set scalar reported as ``F1=0.873`` is dishonest — that's a point
|
|
estimate from one finite sample. The right honesty bar is ``F1=0.873 ±
|
|
0.012`` from N nonparametric bootstraps over the test windows.
|
|
|
|
For paired comparisons (model A vs model B on the same test set) we
|
|
use a *paired* bootstrap: resample row indices and apply the same
|
|
indices to both models' predictions. This controls for which test
|
|
windows happened to be hard.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Iterable
|
|
|
|
import numpy as np
|
|
|
|
|
|
@dataclass
class CI:
    """A point estimate together with the bounds of its confidence interval.

    ``low`` and ``high`` are the interval bounds reported at confidence
    ``level`` (0.95 unless the producer says otherwise).
    """

    # Statistic computed on the full, un-resampled data.
    point: float
    # Lower bound of the interval.
    low: float
    # Upper bound of the interval.
    high: float
    # Nominal confidence level the bounds were computed for.
    level: float = 0.95

    def fmt(self, digits: int = 3) -> str:
        """Render the interval as ``point [low, high]``.

        Args:
            digits: Number of decimal places used for all three numbers.

        Returns:
            The formatted string, e.g. ``"0.873 [0.861, 0.885]"``.
        """
        spec = f".{digits}f"
        point, low, high = (
            format(value, spec) for value in (self.point, self.low, self.high)
        )
        return f"{point} [{low}, {high}]"
|
|
|
|
|
|
def _f1(y_true: np.ndarray, y_pred: np.ndarray, k: int) -> float:
|
|
tp = int(((y_pred == k) & (y_true == k)).sum())
|
|
fp = int(((y_pred == k) & (y_true != k)).sum())
|
|
fn = int(((y_pred != k) & (y_true == k)).sum())
|
|
if tp == 0:
|
|
return 0.0
|
|
prec = tp / (tp + fp)
|
|
rec = tp / (tp + fn)
|
|
return 2 * prec * rec / (prec + rec)
|
|
|
|
|
|
def _macro_f1(y_true: np.ndarray, y_pred: np.ndarray, n_classes: int) -> float:
    """Macro F1: the unweighted mean of per-class F1 over ``0..n_classes-1``.

    Every class counts equally regardless of its support; a class with no
    true positives contributes ``0.0`` to the mean.
    """
    per_class_scores = [_f1(y_true, y_pred, label) for label in range(n_classes)]
    return float(np.mean(per_class_scores))
|
|
|
|
|
|
def per_class_pr_f1(y_true: np.ndarray, y_pred: np.ndarray, n_classes: int
                    ) -> dict[int, dict[str, float]]:
    """Per-class precision, recall, F1 and support as plain point estimates.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        n_classes: Classes ``0..n_classes-1`` are reported, in that order.

    Returns:
        A mapping ``class -> {"precision", "recall", "f1", "support"}``.
        Any ratio whose denominator is zero is reported as ``0.0``;
        ``support`` is the number of rows whose true label is the class.
    """
    report: dict[int, dict[str, float]] = {}
    for label in range(n_classes):
        truth_hit = y_true == label
        guess_hit = y_pred == label

        tp = int((guess_hit & truth_hit).sum())
        fp = int((guess_hit & (y_true != label)).sum())
        fn = int(((y_pred != label) & truth_hit).sum())

        predicted = tp + fp  # rows the model assigned to this class
        support = tp + fn    # rows that truly belong to this class

        precision = tp / predicted if predicted else 0.0
        recall = tp / support if support else 0.0
        pr_sum = precision + recall
        f1 = 2 * precision * recall / pr_sum if pr_sum else 0.0

        report[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": int(support),
        }
    return report
|
|
|
|
|
|
def bootstrap_macro_f1(
    y_true: np.ndarray, y_pred: np.ndarray, n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> CI:
    """Percentile-bootstrap confidence interval for macro F1.

    Test rows are resampled with replacement ``n_resamples`` times; the
    interval bounds are the ``(1 - level) / 2`` and ``1 - (1 - level) / 2``
    quantiles of the resampled macro F1 values.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        n_classes: Macro F1 averages over classes ``0..n_classes-1``.
        n_resamples: Number of bootstrap draws.
        level: Confidence level of the returned interval.
        seed: Seed for the NumPy random generator that draws the rows.

    Returns:
        A ``CI`` whose ``point`` is macro F1 on the un-resampled data.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(y_true)
    observed = _macro_f1(y_true, y_pred, n_classes)

    replicates = np.empty(n_resamples, dtype=np.float64)
    for draw in range(n_resamples):
        # One set of row indices per draw, sampled with replacement.
        rows = generator.integers(0, n_rows, size=n_rows)
        replicates[draw] = _macro_f1(y_true[rows], y_pred[rows], n_classes)

    tail = (1 - level) / 2
    lower, upper = np.quantile(replicates, [tail, 1 - tail])
    return CI(point=observed, low=float(lower), high=float(upper), level=level)
|
|
|
|
|
|
def bootstrap_per_class_f1(
    y_true: np.ndarray, y_pred: np.ndarray, n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> dict[int, CI]:
    """Percentile-bootstrap confidence interval for each class's F1.

    Every bootstrap draw resamples the test rows once (with replacement)
    and scores all classes on that same resample, so the per-class
    intervals share their draws.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        n_classes: Classes ``0..n_classes-1`` are reported.
        n_resamples: Number of bootstrap draws.
        level: Confidence level of the returned intervals.
        seed: Seed for the NumPy random generator that draws the rows.

    Returns:
        A mapping ``class -> CI``; each ``point`` is the class's F1 on the
        un-resampled data, and the bounds are the ``(1 - level) / 2`` and
        ``1 - (1 - level) / 2`` quantiles of its resampled F1 values.
    """
    rng = np.random.default_rng(seed)
    n = len(y_true)

    # One row per draw, one column per class.
    scores = np.empty((n_resamples, n_classes), dtype=np.float64)
    for i in range(n_resamples):
        idx = rng.integers(0, n, size=n)
        # The resampled arrays are identical for every class, so build them
        # once per draw instead of re-indexing inside the class loop.
        true_rs = y_true[idx]
        pred_rs = y_pred[idx]
        for k in range(n_classes):
            scores[i, k] = _f1(true_rs, pred_rs, k)

    tail = (1 - level) / 2
    cis: dict[int, CI] = {}
    for k in range(n_classes):
        lo, hi = np.quantile(scores[:, k], [tail, 1 - tail])
        cis[k] = CI(
            point=_f1(y_true, y_pred, k),
            low=float(lo),
            high=float(hi),
            level=level,
        )
    return cis
|
|
|
|
|
|
def paired_bootstrap_macro_f1_diff(
    y_true: np.ndarray,
    y_pred_a: np.ndarray, y_pred_b: np.ndarray,
    n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> CI:
    """Paired bootstrap interval for ``macro_f1(A) - macro_f1(B)``.

    On every draw the same resampled row indices are applied to the truth
    and to both models' predictions, so the difficulty of whichever rows
    were drawn affects A and B alike and cancels in the difference.
    An interval that excludes 0 means the gap is significant at ``level``.

    Args:
        y_true: Ground-truth labels shared by both models.
        y_pred_a: Model A's predictions, aligned with ``y_true``.
        y_pred_b: Model B's predictions, aligned with ``y_true``.
        n_classes: Macro F1 averages over classes ``0..n_classes-1``.
        n_resamples: Number of bootstrap draws.
        level: Confidence level of the returned interval.
        seed: Seed for the NumPy random generator that draws the rows.

    Returns:
        A ``CI`` whose ``point`` is the A-minus-B macro F1 gap on the
        un-resampled data.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(y_true)

    gaps = np.empty(n_resamples, dtype=np.float64)
    for draw in range(n_resamples):
        rows = generator.integers(0, n_rows, size=n_rows)
        truth = y_true[rows]
        score_a = _macro_f1(truth, y_pred_a[rows], n_classes)
        score_b = _macro_f1(truth, y_pred_b[rows], n_classes)
        gaps[draw] = score_a - score_b

    tail = (1 - level) / 2
    lower, upper = np.quantile(gaps, [tail, 1 - tail])

    observed_gap = (
        _macro_f1(y_true, y_pred_a, n_classes)
        - _macro_f1(y_true, y_pred_b, n_classes)
    )
    return CI(point=observed_gap, low=float(lower), high=float(upper), level=level)
|
|
|
|
|
|
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray,
                     n_classes: int) -> np.ndarray:
    """Build an ``(n_classes, n_classes)`` integer confusion matrix.

    Rows are true labels and columns are predicted labels, both indexed
    over the same fixed label set ``0..n_classes-1``. Using one label set
    for both axes avoids the bug where one side has a class the other
    doesn't.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        n_classes: Size of the label set.

    Returns:
        An ``int64`` matrix whose ``[t, p]`` entry counts the rows with
        true label ``t`` and predicted label ``p``. Pairs where either
        label falls outside ``0..n_classes-1`` are not counted. If the
        inputs differ in length, only the leading rows present in both
        are used.
    """
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)

    # Pair rows up the way zip() would: stop at the shorter input.
    n_pairs = min(len(true_arr), len(pred_arr))
    true_arr = true_arr[:n_pairs]
    pred_arr = pred_arr[:n_pairs]

    in_range = (
        (true_arr >= 0) & (true_arr < n_classes)
        & (pred_arr >= 0) & (pred_arr < n_classes)
    )
    if not in_range.any():
        # Nothing to count (also covers empty input).
        return cm

    # np.add.at accumulates repeated (t, p) pairs, unlike plain fancy-index +=.
    np.add.at(cm, (true_arr[in_range], pred_arr[in_range]), 1)
    return cm
|