CIS490/training/models/gbt.py
Max 1fabd4a246 training: validator, feature/tensor extractors, 6 supervised models, schema-hashed checkpoints, eval suite, dashboard producers
The model layer of the project, built honestly:

  - tools/dataset_validate.py — full-sweep validator over the receiver
    store (sha256, schema, monotonic labels, telemetry-row gate). On the
    current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
    7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
    is committed as the per-episode acceptance index.

  - training/_features.py — channel registry (46 channels across
    proc/guest/qmp/netflow), summary-stat windowing AND channel×time
    tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
    (Unix ns) — tested fix for a real netflow-vs-host clock-base
    inconsistency that was silently dropping every netflow channel.

  - training/_split.py — three held-out recipes (host / sample / time)
    with profile-stratification assertions. held_out_host carries
    untested_profiles for cases like scan-and-dial absent from the test
    host (5 of 6 profiles tested cross-device, never silently averaged).

  - training/models/ — 6 architectures behind a common BaseModel
    interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
    trained twice (realistic / oracle) per the deployment threat model.
    Schema-hashed checkpoints refuse to load if _features.py changed
    since training (silent-input-drift protection, tested).

  - training/trainer/ — unified training loop: class-weighted CE, LR
    warmup + cosine, gradient clipping, mixed precision when CUDA,
    early stopping on val macro F1, best-on-val checkpoint. Same loop
    runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
    early_stopping_rounds on val mlogloss.

  - training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
    per-profile and per-host breakdown, paired-bootstrap significance
    for model-vs-model gap. Confusion matrix uses union of seen labels.

  - training/dashboard/producers/ — replay/metrics/perf/profiles
    emitting the six event types the dashboard's awaiting scenes
    consume; on-demand tensor extraction so the Pi can run live
    inference without 65 GB of shards.

  - 17 unit tests (split coverage, features round-trip, schema mismatch,
    determinism, time-base alignment regression).

End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 01:19:00 -05:00

145 lines
4.8 KiB
Python

"""XGBoost classifier on per-window summary features.
Tier-1 baseline. Cheap, strong, interpretable. Realistic mode trains
on in_deployment features only; oracle uses everything. Held-out-by-
host (or by-sample) split + early stopping on val mlogloss.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import numpy as np
from training.models import register
from training.models._base import BaseModel, StandardizeStats
@register("gbt")
class GBT(BaseModel):
    """XGBoost multi-class classifier over per-window summary features.

    The wrapped ``xgboost.Booster`` is created by :meth:`fit` or restored by
    :meth:`from_checkpoint`; until then the model is "unfitted" and any access
    through :attr:`booster` raises ``RuntimeError``.

    Feature selection goes through ``self.select(X)``, which is inherited
    from ``BaseModel`` and not visible in this file -- presumably it applies
    ``keep_mask`` and ``standardize``; confirm against ``_base.py``.
    """

    input_kind = "summary"

    def __init__(
        self,
        *,
        n_classes: int,
        keep_mask: np.ndarray,
        standardize: StandardizeStats,
        booster=None,
        params: dict | None = None,
    ) -> None:
        """Store configuration; no training happens here.

        Args:
            n_classes: Number of target classes (passed to XGBoost as
                ``num_class``).
            keep_mask: Boolean-convertible feature mask. Any array-like is
                accepted and copied into a fresh ``bool`` array.
            standardize: Standardisation statistics kept alongside the model.
            booster: Optional already-trained ``xgboost.Booster``.
            params: Optional XGBoost parameter overrides applied on top of
                the defaults in :meth:`fit`. The mapping is copied.
        """
        self.n_classes = n_classes
        # np.array (not .astype) so list/tuple masks work too; it still
        # copies, so later mutation of the caller's array cannot leak in.
        self.keep_mask = np.array(keep_mask, dtype=bool)
        self.standardize = standardize
        self._booster = booster
        self._params = dict(params or {})

    @property
    def booster(self):
        """Return the trained booster.

        Raises:
            RuntimeError: If the model has not been fitted or loaded yet.
        """
        if self._booster is None:
            raise RuntimeError("model not fitted; call .fit(...) first")
        return self._booster

    def _to_dmatrix(self, X: np.ndarray, y: np.ndarray | None = None,
                    weights: np.ndarray | None = None, *, ref=None):
        """Build a ``QuantileDMatrix`` for training or validation data.

        Args:
            X: Raw feature matrix; passed through ``self.select`` first.
            y: Optional labels.
            weights: Optional per-row sample weights.
            ref: Training ``QuantileDMatrix`` whose quantisation should be
                reused. Pass it for validation data; leave ``None`` for the
                training matrix itself.
        """
        import xgboost as xgb

        # ``ref=None`` is the constructor default, so one call covers both
        # the training (no ref) and validation (ref=d_train) cases.
        return xgb.QuantileDMatrix(
            self.select(X), label=y, weight=weights, ref=ref
        )

    def fit(
        self,
        *,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray,
        y_val: np.ndarray,
        sample_weight: np.ndarray | None = None,
        params: dict | None = None,
        n_estimators: int = 1000,
        early_stopping_rounds: int = 30,
        verbose_eval: int | bool = 50,
    ) -> dict:
        """Train the booster with early stopping on the validation set.

        Early stopping monitors the last entry of ``evals`` (the ``"val"``
        set) using ``eval_metric`` -- ``mlogloss`` unless overridden.

        Parameter precedence, lowest to highest: built-in defaults,
        constructor ``params``, then this call's ``params``. The merged
        parameters replace ``self._params`` after training.

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_val: Validation features.
            y_val: Validation labels.
            sample_weight: Optional per-row weights for the training set
                only; the validation set is unweighted.
            params: Optional XGBoost parameter overrides for this call.
            n_estimators: Maximum number of boosting rounds.
            early_stopping_rounds: Rounds without validation improvement
                before training stops.
            verbose_eval: Forwarded to ``xgboost.train``.

        Returns:
            ``{"best_iter": int, "best_score": float, "history": dict}``
            where ``history`` is XGBoost's ``evals_result`` mapping.
        """
        import xgboost as xgb

        full_params = {
            "objective": "multi:softprob",
            "num_class": self.n_classes,
            "max_depth": 6,
            "eta": 0.1,
            "tree_method": "hist",
            "eval_metric": "mlogloss",
            "verbosity": 1,
        }
        full_params.update(self._params)
        if params:
            full_params.update(params)

        # Opt into CUDA only when the caller has not chosen a device.
        # Deliberately best-effort: a missing or broken torch install must
        # never prevent CPU training, hence the broad except.
        try:
            import torch

            if torch.cuda.is_available():
                full_params.setdefault("device", "cuda")
        except Exception:
            pass

        d_train = self._to_dmatrix(X_train, y_train, weights=sample_weight)
        # Validation reuses the training quantisation via ``ref``.
        d_val = self._to_dmatrix(X_val, y_val, ref=d_train)

        evals_result: dict = {}
        booster = xgb.train(
            full_params,
            d_train,
            num_boost_round=n_estimators,
            # "val" must stay last: XGBoost early-stops on the final entry.
            evals=[(d_train, "train"), (d_val, "val")],
            early_stopping_rounds=early_stopping_rounds,
            evals_result=evals_result,
            verbose_eval=verbose_eval,
        )
        self._booster = booster
        self._params = full_params
        return {
            "best_iter": int(booster.best_iteration),
            "best_score": float(booster.best_score),
            "history": evals_result,
        }

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict with the booster, truncated at its best iteration.

        With the default ``multi:softprob`` objective the result is the
        per-class probability matrix returned by ``Booster.predict``.

        Args:
            X: Raw feature matrix; passed through ``self.select`` first.

        Raises:
            RuntimeError: If the model has not been fitted or loaded yet.
        """
        import xgboost as xgb

        # Go through the property so an unfitted model fails with the clear
        # RuntimeError instead of an AttributeError on ``None``.
        booster = self.booster
        # Plain DMatrix for inference: a QuantileDMatrix built without the
        # training ``ref`` would re-quantise the inputs and can lose
        # information.
        d = xgb.DMatrix(self.select(X))
        # ``best_iteration`` may be absent (e.g. no early stopping was
        # recorded); then all trees are used.
        best = getattr(booster, "best_iteration", None)
        if best is not None:
            return booster.predict(d, iteration_range=(0, best + 1))
        return booster.predict(d)

    # --- Checkpoint API -------------------------------------------------
    def state_for_checkpoint(self) -> dict[str, Any]:
        """Return checkpoint metadata only.

        The booster itself is written by :meth:`save_sidecar`. ``best_iter``
        is ``-1`` when there is no booster or it has no best iteration.
        """
        return {
            "params": self._params,
            "best_iter": int(getattr(self._booster, "best_iteration", -1)),
        }

    def save_sidecar(self, path: Path) -> None:
        """Called by save_checkpoint to dump the booster JSON.

        Raises:
            RuntimeError: If the model has not been fitted or loaded yet.
        """
        self.booster.save_model(str(path))

    @classmethod
    def from_checkpoint(cls, header: dict, payload: dict, *,
                        device: str = "cpu") -> "GBT":
        """Rebuild a model from a checkpoint header and booster sidecar.

        Args:
            header: Must contain ``n_classes``, ``keep_mask`` and
                ``standardize``; ``config.params`` is optional.
            payload: Must contain ``sidecar_path``, the booster file to load.
            device: Accepted but not used by this implementation.
        """
        import xgboost as xgb

        booster = xgb.Booster()
        booster.load_model(payload["sidecar_path"])
        return cls(
            n_classes=int(header["n_classes"]),
            keep_mask=np.asarray(header["keep_mask"], dtype=bool),
            standardize=StandardizeStats.from_dict(header["standardize"]),
            booster=booster,
            params=dict(header.get("config", {}).get("params", {})),
        )