CIS490/training/dashboard/producers/metrics.py
Max 1fabd4a246 training: validator, feature/tensor extractors, 6 supervised models, schema-hashed checkpoints, eval suite, dashboard producers
The model layer of the project, built honestly:

  - tools/dataset_validate.py — full-sweep validator over the receiver
    store (sha256, schema, monotonic labels, telemetry-row gate). On the
    current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
    7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
    is committed as the per-episode acceptance index.

  - training/_features.py — channel registry (46 channels across
    proc/guest/qmp/netflow), summary-stat windowing AND channel×time
    tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
    (Unix ns) — tested fix for a real netflow-vs-host clock-base
    inconsistency that was silently dropping every netflow channel.

  - training/_split.py — three held-out recipes (host / sample / time)
    with profile-stratification assertions. held_out_host carries
    untested_profiles for cases like scan-and-dial absent from the test
    host (5 of 6 profiles tested cross-device, never silently averaged).

  - training/models/ — 6 architectures behind a common BaseModel
    interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
    trained twice (realistic / oracle) per the deployment threat model.
    Schema-hashed checkpoints refuse to load if _features.py changed
    since training (silent-input-drift protection, tested).

  - training/trainer/ — unified training loop: class-weighted CE, LR
    warmup + cosine, gradient clipping, mixed precision when CUDA is available,
    early stopping on val macro F1, best-on-val checkpoint. Same loop
    runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
    early_stopping_rounds on val mlogloss.

  - training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
    per-profile and per-host breakdown, paired-bootstrap significance
    for model-vs-model gap. Confusion matrix uses union of seen labels.

  - training/dashboard/producers/ — replay/metrics/perf/profiles
    emitting the six event types the dashboard's awaiting scenes
    consume; on-demand tensor extraction so the Pi can run live
    inference without 65 GB of shards.

  - 17 unit tests (split coverage, features round-trip, schema mismatch,
    determinism, time-base alignment regression).

End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 01:19:00 -05:00

159 lines
6.3 KiB
Python

"""Emit `model_metric` events for the dashboard's accuracy bars.
Loads every checkpoint via the schema-hashed loader, scores each on
the held-out test split (held-out-by-host by default), publishes one
``model_metric`` per model. Re-publishes on a tick so a browser
opening 30s after a one-shot run still sees populated bars.
Note: the dashboard's CSS styles bars by exact name (`rnn|gru|lstm|bert`).
Our names are of the form e.g. `gbt_realistic`, which match none of those
exact names, so our bars render with a default color.
The accuracy reported is **macro-F1** under the realistic-vs-oracle
split that the model was trained for — *not* plain accuracy. We
publish under the existing `accuracy` key so the dashboard JS doesn't
need a frontend change; macro-F1 is the metric we actually care about.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import sys
from pathlib import Path
import numpy as np
import pyarrow.parquet as pq
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
from training._split import (
held_out_host, held_out_sample, held_out_time,
)
from training.dashboard.producers._models import load_models
from training.dashboard.producers._publish import (
PublishFn, http_publisher, null_publisher,
)
from training.eval_._metrics import _macro_f1
from training.models import BaseModel
log = logging.getLogger("cis490.dashboard.producers.metrics")
def _build_test_set(model: BaseModel, *, validation_path: Path,
summary_path: Path | None,
tensors_root: Path | None,
split_recipe: str, train_hosts: list[str]
) -> tuple[np.ndarray, np.ndarray]:
"""Return (X_test, y_test) for the given model's input kind."""
val = pq.read_table(validation_path).to_pylist()
rows = [r for r in val if r["status"] in ("accepted", "degraded")]
profs = [r["profile"] for r in rows]
samples = [r["sample_name"] for r in rows]
hosts = [r["host_id"] for r in rows]
epi_ids = [r["episode_id"] for r in rows]
recv = [r.get("received_at_wall", "") for r in rows]
if split_recipe == "host":
splits = held_out_host(profiles=profs, sample_names=samples,
host_ids=hosts, episode_ids=epi_ids,
train_hosts=train_hosts, seed=0)
elif split_recipe == "sample":
splits = held_out_sample(profiles=profs, sample_names=samples,
host_ids=hosts, seed=0)
else:
splits = held_out_time(profiles=profs, sample_names=samples,
host_ids=hosts, received_at=recv, seed=0)
test_eps = {epi_ids[i] for i in range(len(epi_ids)) if splits.test[i]}
if model.input_kind == "summary":
if summary_path is None:
raise ValueError("--summary required for summary model")
from training.trainer._data import load_summary
# Need schema path; assume sibling
schema_path = summary_path.parent / "feature_schema_v1.json"
d = load_summary(summary_path, schema_path)
m = np.array([e in test_eps for e in d.episode_id], dtype=bool)
return d.X[m], d.y[m]
else:
if tensors_root is None:
raise ValueError("--tensors required for tensor model")
from training.trainer._data import load_tensor
d = load_tensor(tensors_root)
m = np.array([e in test_eps for e in d.episode_id], dtype=bool)
return d.X[m], d.y[m]
async def emit_metrics(*, publish: PublishFn, artifacts_dir: Path,
                       validation_path: Path,
                       summary_path: Path | None,
                       tensors_root: Path | None,
                       split_recipe: str,
                       train_hosts: list[str]) -> int:
    """Score every loaded model on the held-out test set and publish it.

    Models are loaded from ``artifacts_dir``. For each one the test set
    matching its ``input_kind`` is built, predictions are scored with
    macro-F1, and one ``model_metric`` event is awaited on ``publish``.

    A model whose test set cannot be built, or whose test set is empty,
    is skipped with a warning; the remaining models are still scored.

    Returns:
        The number of ``model_metric`` events published (0 when no
        models were found).
    """
    models = load_models(artifacts_dir)
    if not models:
        log.warning("no models found under %s", artifacts_dir)
        return 0

    # The test set depends on the model only through its input_kind, so
    # build it once per kind rather than once per model. A failed build
    # is remembered as the exception so every affected model still gets
    # its own warning without retrying the same load.
    # (assumes input_kind is a hashable value such as a str)
    test_sets: dict[str, tuple[np.ndarray, np.ndarray] | Exception] = {}

    n_published = 0
    for m in models:
        name = m.__model_name__
        kind = m.input_kind
        if kind not in test_sets:
            try:
                test_sets[kind] = _build_test_set(
                    m, validation_path=validation_path,
                    summary_path=summary_path, tensors_root=tensors_root,
                    split_recipe=split_recipe, train_hosts=train_hosts,
                )
            except Exception as e:  # best-effort: skip, don't abort the run
                test_sets[kind] = e

        built = test_sets[kind]
        if isinstance(built, Exception):
            log.warning("test set build failed for %s: %s", name, built)
            continue

        Xte, yte = built
        if len(yte) == 0:
            log.warning("empty test set for %s; skipping", name)
            continue

        y_pred = m.predict(Xte)
        # Built-in float, so the payload does not depend on whatever
        # numeric type _macro_f1 happens to return.
        f1 = float(_macro_f1(yte, y_pred, m.n_classes))
        log.info("%s test_macro_f1=%.4f (n=%d)", name, f1, len(yte))
        # `accuracy` key for the dashboard's existing bar widget; the
        # value is macro-F1 in our project.
        await publish({
            "type": "model_metric",
            "model": name,
            "accuracy": f1,
        })
        n_published += 1
    return n_published
async def _run(args: argparse.Namespace) -> int:
    """Drive the producer from parsed command-line arguments.

    Configures logging, picks the publisher (a null publisher for
    ``--dry-run``, otherwise an HTTP publisher aimed at
    ``args.publish_url``) and emits metrics. With a non-positive
    ``args.interval`` a single pass is made and 0 is returned; otherwise
    the pass repeats after sleeping ``args.interval`` seconds each time.
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=logging.INFO,
    )

    if args.dry_run:
        publisher = null_publisher()
    else:
        publisher = http_publisher(args.publish_url)

    while True:
        await emit_metrics(
            publish=publisher,
            artifacts_dir=args.artifacts,
            validation_path=args.validation,
            summary_path=args.summary,
            tensors_root=args.tensors,
            split_recipe=args.split_recipe,
            train_hosts=args.train_hosts,
        )
        # One-shot mode: stop after the first pass.
        if args.interval <= 0:
            break
        await asyncio.sleep(args.interval)
    return 0
def main(argv: list[str] | None = None) -> int:
    """Parse command-line options and run the metrics producer.

    ``argv`` is the argument list to parse. The default ``None`` makes
    argparse read ``sys.argv[1:]``, so calling ``main()`` from the
    script entry point works as before, while tests and other callers
    can pass an explicit list.

    Returns the exit status produced by ``_run``. Invalid arguments and
    ``--help`` exit through argparse's ``SystemExit``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--validation", required=True, type=Path,
                    help="validation parquet (per-episode index)")
    ap.add_argument("--artifacts", type=Path, default=Path("artifacts"),
                    help="directory the model checkpoints are loaded from")
    ap.add_argument("--summary", type=Path, default=None,
                    help="summary feature file; required for summary models")
    ap.add_argument("--tensors", type=Path, default=None,
                    help="tensor root; required for tensor models")
    ap.add_argument("--split-recipe", choices=["host", "sample", "time"],
                    default="host",
                    help="held-out recipe used to pick the test split")
    ap.add_argument("--train-hosts", nargs="+", default=["elliott-thinkpad"],
                    help="training hosts for the 'host' recipe")
    ap.add_argument("--publish-url", default="http://127.0.0.1:8447/publish",
                    help="endpoint the events are published to")
    ap.add_argument("--interval", type=float, default=20.0,
                    help="seconds between re-publishes; <= 0 runs once")
    ap.add_argument("--dry-run", action="store_true",
                    help="use the null publisher instead of HTTP")
    args = ap.parse_args(argv)
    return asyncio.run(_run(args))
if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter
    # as the process exit status.
    sys.exit(main())