The model layer of the project, built honestly:
- tools/dataset_validate.py — full-sweep validator over the receiver
store (sha256, schema, monotonic labels, telemetry-row gate). On the
current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
is committed as the per-episode acceptance index.
- training/_features.py — channel registry (46 channels across
proc/guest/qmp/netflow), summary-stat windowing AND channel×time
tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
(Unix ns) — tested fix for a real netflow-vs-host clock-base
inconsistency that was silently dropping every netflow channel.
- training/_split.py — three held-out recipes (host / sample / time)
with profile-stratification assertions. held_out_host carries
untested_profiles for cases like scan-and-dial absent from the test
host (5 of 6 profiles tested cross-device, never silently averaged).
- training/models/ — 6 architectures behind a common BaseModel
interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
trained twice (realistic / oracle) per the deployment threat model.
Schema-hashed checkpoints refuse to load if _features.py changed
since training (silent-input-drift protection, tested).
- training/trainer/ — unified training loop: class-weighted CE, LR
warmup + cosine, gradient clipping, mixed precision when CUDA,
early stopping on val macro F1, best-on-val checkpoint. Same loop
runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
early_stopping_rounds on val mlogloss.
- training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
per-profile and per-host breakdown, paired-bootstrap significance
for model-vs-model gap. Confusion matrix uses union of seen labels.
- training/dashboard/producers/ — replay/metrics/perf/profiles
emitting the six event types the dashboard's awaiting scenes
consume; on-demand tensor extraction so the Pi can run live
inference without 65 GB of shards.
- 17 unit tests (split coverage, features round-trip, schema mismatch,
determinism, time-base alignment regression).
End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
220 lines
7.7 KiB
Python
220 lines
7.7 KiB
Python
"""Replay an episode at wall-clock time, emitting live dashboard events.
|
|
|
|
For one episode we emit:
|
|
phase — ground truth from labels.jsonl, on each transition
|
|
prediction — per-window predicted vs actual phase from one model
|
|
(the "primary" model, default: first GBT loaded)
|
|
embedding — 2-D PCA projection of each window for the KNN scatter
|
|
|
|
Producer is transport-agnostic via _publish.PublishFn. Models are
|
|
loaded via the schema-hashed checkpoint format — schema mismatch
|
|
between training and inference fails loud, not silent.
|
|
|
|
Both summary and tensor models are supported. The producer extracts
|
|
the right input flavor per model on demand:
|
|
- summary: summary_windows(epi)
|
|
- tensor: tensor_windows(epi)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
|
|
from training._episode_io import open_episode
|
|
from training._features import (
|
|
PHASE_TO_INT, summary_windows, tensor_windows,
|
|
)
|
|
from training.dashboard.producers._models import (
|
|
load_models, model_display_name,
|
|
)
|
|
from training.dashboard.producers._publish import (
|
|
PublishFn, http_publisher, null_publisher,
|
|
)
|
|
from training.models import BaseModel
|
|
|
|
|
|
log = logging.getLogger("cis490.dashboard.producers.replay")
|
|
|
|
|
|
def _pick_primary(models: list[BaseModel]) -> BaseModel | None:
|
|
"""Pick the model whose predictions drive the chunking widget. We
|
|
prefer a realistic-mode model since that's the one a deployed system
|
|
would run."""
|
|
if not models:
|
|
return None
|
|
# Prefer the realistic-mode model on a stable ranking by name.
|
|
rank = {"gbt": 0, "cnn": 1, "transformer": 2,
|
|
"gru": 3, "lstm": 4, "mlp": 5}
|
|
sorted_models = sorted(
|
|
models,
|
|
key=lambda m: (
|
|
0 if "realistic" in str(m.__class__.__name__).lower() else 1,
|
|
rank.get(m.__model_name__, 99),
|
|
),
|
|
)
|
|
return sorted_models[0]
|
|
|
|
|
|
async def replay_episode(
    *,
    publish: PublishFn,
    episode_path: Path,
    host_id: str,
    models: list[BaseModel],
    speed: float = 1.0,
) -> None:
    """Replay one episode, publishing phase / prediction / embedding events.

    Windows are extracted once, in the flavor the primary model consumes
    (``summary_windows`` or ``tensor_windows``), so that the window
    inputs, the ground-truth labels and the pacing timestamps all share
    the same index. With no models loaded, summary windows are used
    purely to pace the phase events.

    Args:
        publish: async callable that receives each event dict.
        episode_path: episode location handed to ``open_episode``.
        host_id: host identifier handed to ``open_episode``.
        models: loaded models; only the one chosen by ``_pick_primary``
            is queried.
        speed: replay speed multiplier; window times are divided by it.

    Raises:
        ValueError: if ``speed`` is not strictly positive.
    """
    if not speed > 0:  # also rejects NaN
        raise ValueError(f"speed must be > 0, got {speed!r}")

    epi = open_episode(episode_path, host_id=host_id)
    if not epi.labels:
        log.warning("episode %s has no labels — nothing to replay", episode_path)
        return

    primary = _pick_primary(models)
    if primary is None:
        # Embedding events need a model's projection, so only phase
        # events can be produced here.
        log.info("no models loaded; emitting phase events only")

    # Extract windows in the primary's flavor so X, y and t stay
    # index-aligned. Mixing one flavor's timestamps/labels with another
    # flavor's inputs pairs unrelated windows whenever the two
    # extractions yield different window counts.
    input_kind = primary.input_kind if primary is not None else "summary"
    if input_kind == "tensor":
        X, y_actual, t_centers, _, _ = tensor_windows(epi)
    elif input_kind == "summary":
        X, y_actual, t_centers, _ = summary_windows(epi)
    else:
        log.warning("unsupported input_kind %r for %s", input_kind, episode_path)
        return

    n_w = X.shape[0]
    if n_w == 0:
        log.warning("no usable windows for %s", episode_path)
        return

    # Phase ground-truth events from labels.jsonl, as seconds since the
    # first label.
    # NOTE(review): the comparison below assumes window times are also
    # seconds relative to that first label — confirm in _features.
    t0 = int(epi.labels[0]["t_wall_ns"])
    label_events: list[tuple[float, str]] = [
        ((row["t_wall_ns"] - t0) / 1e9, row["phase"]) for row in epi.labels
    ]

    int_to_phase = {i: p for p, i in PHASE_TO_INT.items()}

    log.info("replay start: %d windows, %d models, primary=%s",
             n_w, len(models),
             model_display_name(primary) if primary else None)

    start_wall = time.monotonic()
    label_cursor = 0

    for w in range(n_w):
        t_w = float(t_centers[w])

        # Pace against the monotonic clock so publish latency does not
        # accumulate as drift.
        delay = start_wall + t_w / speed - time.monotonic()
        if delay > 0:
            await asyncio.sleep(delay)

        # Phase events for any label whose time has passed.
        # NOTE(review): labels later than the last window time are never
        # published — confirm that is intended.
        while (label_cursor < len(label_events)
               and label_events[label_cursor][0] <= t_w):
            await publish({"type": "phase",
                           "phase": label_events[label_cursor][1]})
            label_cursor += 1

        if primary is None:
            continue

        actual_name = int_to_phase.get(int(y_actual[w]), "clean")
        X_one = X[w:w + 1]

        # Predictions: only the primary's prediction goes to the
        # chunking widget.
        try:
            pred = int(primary.predict(X_one)[0])
            pred_name = int_to_phase.get(pred, "clean")
        except Exception as e:
            # NOTE(review): falling back to the actual label shows a
            # failed prediction as a correct one on the dashboard.
            log.warning("predict failed: %s", e)
            pred_name = actual_name
        await publish({
            "type": "prediction",
            "episode_id": epi.episode_id,
            "window_idx": w,
            "predicted": pred_name,
            "actual": actual_name,
            "model": primary.__model_name__,
        })

        # Embedding: project the window through the primary's PCA-2;
        # skipped for this window when no projection is available.
        xy = _project_one(primary, X_one)
        if xy is not None:
            await publish({
                "type": "embedding",
                "x": float(xy[0]), "y": float(xy[1]),
                "phase": actual_name,
            })
|
|
|
|
|
|
def _project_one(model: BaseModel, X_one: np.ndarray) -> tuple[float, float] | None:
|
|
"""Apply the model's standardize+keep, then project through the
|
|
PCA-2 baked into the checkpoint header (if any). Returns (x, y) in
|
|
[0, 1] using a min-max squash with stats fit on first call."""
|
|
pca = getattr(model, "_pca_proj", None)
|
|
if pca is None:
|
|
return None
|
|
Xk = model.select(X_one[:1])
|
|
if Xk.ndim == 3:
|
|
Xk = Xk.reshape(1, -1)
|
|
if Xk.shape[1] != pca.shape[0]:
|
|
return None
|
|
p = (Xk @ pca).ravel()
|
|
# Tanh-squash with k=0.05 so most points land in (0.2, 0.8). Without
|
|
# train-time min/max it's the cleanest stateless squash.
|
|
return (
|
|
0.5 + 0.5 * float(np.tanh(0.05 * p[0])),
|
|
0.5 + 0.5 * float(np.tanh(0.05 * p[1])),
|
|
)
|
|
|
|
|
|
async def _run(args: argparse.Namespace) -> int:
    """Load models, attach PCA projections, and replay one episode.

    Reads ``artifacts``, ``device``, ``dry_run``, ``publish_url``,
    ``episode``, ``host_id`` and ``speed`` from ``args``. Returns 0 once
    the replay has finished.
    """
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(name)s %(message)s")

    models = load_models(args.artifacts, device=args.device)

    # Hydrate PCA projection from each checkpoint header.
    from training.models._checkpoint import load_header
    paths = sorted(Path(args.artifacts).glob("*.ckpt.json"))
    if len(models) != len(paths):
        # zip() below stops at the shorter sequence, so a count mismatch
        # would otherwise pass unnoticed.
        # NOTE(review): pairing is positional and assumes load_models
        # returns models in sorted-header order — confirm in _models.
        log.warning(
            "loaded %d models but found %d checkpoint headers in %s; "
            "PCA projections are paired by position and may be misassigned",
            len(models), len(paths), args.artifacts,
        )
    for m, p in zip(models, paths):
        header = load_header(p)
        if header.get("pca_proj") is not None:
            m._pca_proj = np.asarray(header["pca_proj"], dtype=np.float32)

    publisher = (null_publisher() if args.dry_run
                 else http_publisher(args.publish_url))
    await replay_episode(
        publish=publisher, episode_path=args.episode,
        host_id=args.host_id, models=models, speed=args.speed,
    )
    return 0
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Parse command-line options and run the replay.

    Args:
        argv: argument list to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The exit status produced by ``_run``.

    A non-positive ``--speed`` is rejected through ``ap.error`` (usage
    message, exit status 2), because window times are divided by it.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--episode", required=True, type=Path)
    ap.add_argument("--host-id", required=True)
    ap.add_argument("--artifacts", type=Path, default=Path("artifacts"))
    ap.add_argument("--publish-url", default="http://127.0.0.1:8447/publish")
    ap.add_argument("--speed", type=float, default=1.0)
    ap.add_argument("--device", default="auto")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args(argv)
    if not args.speed > 0:  # also rejects NaN
        ap.error(f"--speed must be > 0, got {args.speed!r}")
    return asyncio.run(_run(args))
|
|
|
|
|
|
# Script entry point: hand main()'s return value to the interpreter as
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|