CIS490/training/dashboard/producers/replay.py
Max 1fabd4a246 training: validator, feature/tensor extractors, 6 supervised models, schema-hashed checkpoints, eval suite, dashboard producers
The model layer of the project, built honestly:

  - tools/dataset_validate.py — full-sweep validator over the receiver
    store (sha256, schema, monotonic labels, telemetry-row gate). On the
    current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
    7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
    is committed as the per-episode acceptance index.

  - training/_features.py — channel registry (46 channels across
    proc/guest/qmp/netflow), summary-stat windowing AND channel×time
    tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
    (Unix ns) — tested fix for a real netflow-vs-host clock-base
    inconsistency that was silently dropping every netflow channel.

  - training/_split.py — three held-out recipes (host / sample / time)
    with profile-stratification assertions. held_out_host carries
    untested_profiles for cases like scan-and-dial absent from the test
    host (5 of 6 profiles tested cross-device, never silently averaged).

  - training/models/ — 6 architectures behind a common BaseModel
    interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
    trained twice (realistic / oracle) per the deployment threat model.
    Schema-hashed checkpoints refuse to load if _features.py changed
    since training (silent-input-drift protection, tested).

  - training/trainer/ — unified training loop: class-weighted CE, LR
    warmup + cosine, gradient clipping, mixed precision when CUDA,
    early stopping on val macro F1, best-on-val checkpoint. Same loop
    runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
    early_stopping_rounds on val mlogloss.

  - training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
    per-profile and per-host breakdown, paired-bootstrap significance
    for model-vs-model gap. Confusion matrix uses union of seen labels.

  - training/dashboard/producers/ — replay/metrics/perf/profiles
    emitting the six event types the dashboard's awaiting scenes
    consume; on-demand tensor extraction so the Pi can run live
    inference without 65 GB of shards.

  - 17 unit tests (split coverage, features round-trip, schema mismatch,
    determinism, time-base alignment regression).

End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 01:19:00 -05:00

220 lines
7.7 KiB
Python

"""Replay an episode at wall-clock time, emitting live dashboard events.
For one episode we emit:
phase — ground truth from labels.jsonl, on each transition
prediction — per-window predicted vs actual phase from one model
(the "primary" model, default: first GBT loaded)
embedding — 2-D PCA projection of each window for the KNN scatter
Producer is transport-agnostic via _publish.PublishFn. Models are
loaded via the schema-hashed checkpoint format — schema mismatch
between training and inference fails loud, not silent.
Both summary and tensor models are supported. The producer extracts
the right input flavor per model on demand:
- summary: summary_windows(epi)
- tensor: tensor_windows(epi)
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import sys
import time
from pathlib import Path
import numpy as np
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
from training._episode_io import open_episode
from training._features import (
PHASE_TO_INT, summary_windows, tensor_windows,
)
from training.dashboard.producers._models import (
load_models, model_display_name,
)
from training.dashboard.producers._publish import (
PublishFn, http_publisher, null_publisher,
)
from training.models import BaseModel
log = logging.getLogger("cis490.dashboard.producers.replay")
def _pick_primary(models: list[BaseModel]) -> BaseModel | None:
    """Choose the model whose predictions drive the chunking widget.

    Candidates are ordered first by whether their class name contains
    "realistic" (those win, since that is the mode a deployed system
    would run) and then by a fixed per-architecture preference. When
    several candidates tie, the earliest one in ``models`` is returned.

    Returns ``None`` when ``models`` is empty.
    """
    if not models:
        return None

    # Fixed architecture preference; names not listed here sort last (99).
    arch_order = ("gbt", "cnn", "transformer", "gru", "lstm", "mlp")
    arch_rank = {arch: position for position, arch in enumerate(arch_order)}

    def preference(candidate: BaseModel) -> tuple[int, int]:
        # NOTE(review): the mode is inferred from the class name alone —
        # confirm realistic-mode models are actually distinguishable this way.
        class_name = str(candidate.__class__.__name__).lower()
        mode_rank = 1 if "realistic" not in class_name else 0
        return mode_rank, arch_rank.get(candidate.__model_name__, 99)

    # min() yields the first minimal element, i.e. the head of a stable sort.
    return min(models, key=preference)
async def replay_episode(
    *,
    publish: PublishFn,
    episode_path: Path,
    host_id: str,
    models: list[BaseModel],
    speed: float = 1.0,
) -> None:
    """Replay one episode on a wall-clock schedule, publishing dashboard events.

    For each window, in order, this publishes:

    * a ``phase`` event for every label whose offset (seconds since the
      first label's ``t_wall_ns``) is at or before the window's time;
    * a ``prediction`` event with the primary model's predicted phase next
      to the window's ground-truth phase;
    * an ``embedding`` event when ``_project_one`` yields a 2-D point.

    The window index is shared by the timeline, the ground truth and the
    primary model's input, so all three come from the same extraction: the
    primary's own input kind. With no models loaded the summary windows
    provide the timeline and only ``phase`` events are emitted.

    Args:
        publish: awaitable callback invoked with each event dict.
        episode_path: episode location, handed to ``open_episode``.
        host_id: host identifier, handed to ``open_episode``.
        models: loaded models; the primary is chosen by ``_pick_primary``.
        speed: playback rate; window time offsets are divided by it, so
            2.0 replays twice as fast.

    Raises:
        ValueError: if ``speed`` is not a positive number.
    """
    # `not speed > 0` also rejects NaN, which would otherwise disable pacing.
    if not speed > 0:
        raise ValueError(f"speed must be a positive number, got {speed!r}")

    epi = open_episode(episode_path, host_id=host_id)
    if not epi.labels:
        log.warning("episode %s has no labels — nothing to replay", episode_path)
        return

    primary = _pick_primary(models)

    # Extract only the flavor that is actually consumed. Using the primary's
    # own windows keeps X[w], y[w] and t[w] referring to the same window.
    kind = primary.input_kind if primary is not None else "summary"
    X = None
    if kind == "summary":
        X, y_actual, t_centers, _ = summary_windows(epi)
    elif kind == "tensor":
        X, y_actual, t_centers, _, _ = tensor_windows(epi)
    if X is None or X.shape[0] == 0:
        log.warning("no usable windows for %s", episode_path)
        return
    n_w = X.shape[0]

    # Phase ground-truth events from labels.jsonl, as (seconds since the
    # first label, phase name).
    t0 = int(epi.labels[0]["t_wall_ns"])
    label_events: list[tuple[float, str]] = [
        ((int(label["t_wall_ns"]) - t0) / 1e9, label["phase"])
        for label in epi.labels
    ]
    int_to_phase = {i: p for p, i in PHASE_TO_INT.items()}

    if primary is None:
        log.info("no models loaded; emitting phase events only")
    log.info("replay start: %d windows, %d models, primary=%s",
             n_w, len(models),
             model_display_name(primary) if primary is not None else None)

    start_wall = time.monotonic()
    label_cursor = 0
    for w in range(n_w):
        # Window times are treated as second offsets from replay start.
        t_w = float(t_centers[w])
        delay = start_wall + t_w / speed - time.monotonic()
        if delay > 0:
            await asyncio.sleep(delay)

        # Phase events for any label transitions whose time has passed.
        while (label_cursor < len(label_events)
               and label_events[label_cursor][0] <= t_w):
            await publish({"type": "phase",
                           "phase": label_events[label_cursor][1]})
            label_cursor += 1

        if primary is None:
            continue

        actual_name = int_to_phase.get(int(y_actual[w]), "clean")
        X_one = X[w:w + 1]

        # Only the primary's prediction goes to the chunking widget. A failed
        # prediction is logged and skipped: reporting the ground truth in its
        # place would show a correct prediction that was never made.
        try:
            pred = int(primary.predict(X_one)[0])
        except Exception:
            log.warning("predict failed for window %d; no prediction event",
                        w, exc_info=True)
        else:
            await publish({
                "type": "prediction",
                "episode_id": epi.episode_id,
                "window_idx": w,
                "predicted": int_to_phase.get(pred, "clean"),
                "actual": actual_name,
                "model": primary.__model_name__,
            })

        # Embedding: skipped for this window when the primary has no usable
        # projection (``_project_one`` returns None).
        xy = _project_one(primary, X_one)
        if xy is not None:
            await publish({
                "type": "embedding",
                "x": float(xy[0]), "y": float(xy[1]),
                "phase": actual_name,
            })
def _project_one(model: BaseModel, X_one: np.ndarray) -> tuple[float, float] | None:
    """Project a single window to 2-D display coordinates.

    The first row of ``X_one`` is passed through ``model.select``, flattened
    if it comes back 3-D, multiplied by the model's ``_pca_proj`` matrix and
    squashed per component with ``0.5 + 0.5 * tanh(0.05 * p)``. The squash
    is stateless: no statistics are fitted or stored between calls.

    Returns:
        ``(x, y)`` with each coordinate in [0, 1], or ``None`` when no
        projection can be made: the model has no ``_pca_proj``, the
        projection is not a 2-D matrix with at least two columns, the
        window slice is empty, or the feature width does not match the
        projection's row count.
    """
    pca = getattr(model, "_pca_proj", None)
    if pca is None:
        return None
    pca = np.asarray(pca)
    # Two output components are read below, so anything narrower is unusable.
    if pca.ndim != 2 or pca.shape[1] < 2:
        return None

    Xk = model.select(X_one[:1])
    if Xk.ndim not in (2, 3) or Xk.shape[0] == 0:
        return None
    if Xk.ndim == 3:
        # Flatten the per-window feature axes into a single feature vector.
        Xk = Xk.reshape(1, -1)
    if Xk.shape[1] != pca.shape[0]:
        return None

    p = (Xk @ pca).ravel()
    # Tanh-squash with k=0.05: without train-time min/max this is a stateless
    # way to bound the coordinates (k presumably tuned to keep most points
    # away from the edges — not verifiable from here).
    return (
        0.5 + 0.5 * float(np.tanh(0.05 * p[0])),
        0.5 + 0.5 * float(np.tanh(0.05 * p[1])),
    )
async def _run(args: argparse.Namespace) -> int:
    """Load models, attach their PCA projections, and replay one episode.

    Configures logging, loads the models from ``args.artifacts``, copies
    each checkpoint header's ``pca_proj`` (when present) onto the matching
    model as ``_pca_proj``, then runs ``replay_episode`` against either a
    null publisher (``--dry-run``) or an HTTP publisher.

    Returns:
        0 once the replay has finished.
    """
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(name)s %(message)s")
    models = load_models(args.artifacts, device=args.device)

    # Hydrate PCA projection from each checkpoint header.
    from training.models._checkpoint import load_header
    paths = sorted(Path(args.artifacts).glob("*.ckpt.json"))
    # Models and headers are paired by position. NOTE(review): this assumes
    # load_models returns one model per *.ckpt.json in sorted order — confirm.
    # If the counts differ the pairing is certainly off for some models, so
    # skip hydration rather than attach a projection to the wrong model.
    if len(models) != len(paths):
        log.warning(
            "loaded %d models but found %d checkpoint headers in %s; "
            "skipping PCA hydration (no embedding events)",
            len(models), len(paths), args.artifacts,
        )
    else:
        for model, header_path in zip(models, paths):
            header = load_header(header_path)
            if header.get("pca_proj") is not None:
                model._pca_proj = np.asarray(header["pca_proj"],
                                             dtype=np.float32)

    publisher = (null_publisher() if args.dry_run
                 else http_publisher(args.publish_url))
    await replay_episode(
        publish=publisher, episode_path=args.episode,
        host_id=args.host_id, models=models, speed=args.speed,
    )
    return 0
def main() -> int:
    """Parse command-line arguments and run the replay.

    ``--speed`` must be a positive number because the replay loop divides
    window time offsets by it; anything else (zero, negative, NaN) is
    rejected as a usage error before any model or episode is loaded.

    Returns:
        The exit status produced by ``_run``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--episode", required=True, type=Path)
    ap.add_argument("--host-id", required=True)
    ap.add_argument("--artifacts", type=Path, default=Path("artifacts"))
    ap.add_argument("--publish-url", default="http://127.0.0.1:8447/publish")
    ap.add_argument("--speed", type=float, default=1.0,
                    help="playback rate; must be > 0 (default: 1.0)")
    ap.add_argument("--device", default="auto")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    # `not x > 0` also catches NaN, which compares false against everything.
    if not args.speed > 0:
        ap.error(f"--speed must be a positive number, got {args.speed}")
    return asyncio.run(_run(args))


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())