The model layer of the project, built honestly:
- tools/dataset_validate.py — full-sweep validator over the receiver
store (sha256, schema, monotonic labels, telemetry-row gate). On the
current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
is committed as the per-episode acceptance index.
- training/_features.py — channel registry (46 channels across
proc/guest/qmp/netflow), summary-stat windowing AND channel×time
tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
(Unix ns) — tested fix for a real netflow-vs-host clock-base
inconsistency that was silently dropping every netflow channel.
- training/_split.py — three held-out recipes (host / sample / time)
with profile-stratification assertions. held_out_host carries
untested_profiles for cases like scan-and-dial absent from the test
host (5 of 6 profiles tested cross-device, never silently averaged).
- training/models/ — 6 architectures behind a common BaseModel
interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
trained twice (realistic / oracle) per the deployment threat model.
Schema-hashed checkpoints refuse to load if _features.py changed
since training (silent-input-drift protection, tested).
- training/trainer/ — unified training loop: class-weighted CE, LR
warmup + cosine, gradient clipping, mixed precision when CUDA is
available, early stopping on val macro F1, best-on-val checkpoint. The
same loop runs MLP/CNN/GRU/LSTM/Transformer; GBT instead uses XGBoost
early_stopping_rounds on val mlogloss.
- training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
per-profile and per-host breakdown, paired-bootstrap significance
for model-vs-model gap. Confusion matrix uses union of seen labels.
- training/dashboard/producers/ — replay/metrics/perf/profiles
producers that emit the six event types consumed by the dashboard's
awaiting scenes; tensors are extracted on demand so the Pi can run
live inference without 65 GB of shards.
- 17 unit tests (split coverage, features round-trip, schema mismatch,
determinism, time-base alignment regression).
End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
118 lines
4 KiB
Python
118 lines
4 KiB
Python
"""Emit `model_perf` events — accuracy vs inference latency per model.
|
|
|
|
Latency is measured at a production-realistic batch size (default 64 —
|
|
roughly one second of windows from a few hosts at 0.5s stride). Single-
|
|
window timing is reported as `latency_us_b1` for completeness; the
|
|
dashboard's scatter widget uses `latency_us`. Republished on a tick
|
|
for reconnects.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pyarrow.parquet as pq
|
|
|
|
# Prepend the directory three levels above this file's own directory to the
# import path so the absolute `training.*` imports below resolve.
# NOTE(review): presumably this is the repository root, needed when the file
# is executed directly as a script — confirm against the repo layout.
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
|
|
from training._split import held_out_host
|
|
from training.dashboard.producers._models import (
|
|
latency_us_batched, load_models,
|
|
)
|
|
from training.dashboard.producers._publish import (
|
|
PublishFn, http_publisher, null_publisher,
|
|
)
|
|
from training.eval_._metrics import _macro_f1
|
|
|
|
|
|
# Module-level logger. The name is spelled out instead of using __name__, so
# it is the same whether the module is imported or run as a script.
log = logging.getLogger("cis490.dashboard.producers.perf")
|
|
|
|
|
|
async def emit_perf(*, publish: PublishFn, artifacts_dir: Path,
                    validation_path: Path,
                    summary_path: Path | None,
                    tensors_root: Path | None,
                    batch_for_scatter: int = 64) -> int:
    """Benchmark each loadable model and publish one `model_perf` event per model.

    For every model returned by ``load_models(artifacts_dir)`` a test set is
    built with the ``host`` split recipe (``elliott-thinkpad`` as the training
    host), capped at 4096 rows, scored with macro F1 and timed at several
    batch sizes.

    Args:
        publish: Awaitable callback that receives each event dict.
        artifacts_dir: Directory handed to ``load_models``.
        validation_path: Forwarded unchanged to ``_build_test_set``.
        summary_path: Forwarded unchanged to ``_build_test_set``.
        tensors_root: Forwarded unchanged to ``_build_test_set``.
        batch_for_scatter: Batch size whose latency is published as
            ``latency_us``. It is added to the benchmarked batch sizes so a
            non-default value is actually measured. If the benchmark still
            reports no figure for it, the smallest measured batch size is
            used, or ``0.0`` when nothing was measured.

    Returns:
        The number of events published.
    """
    # Function-scope import kept from the original.
    # NOTE(review): presumably avoids an import cycle with the metrics
    # producer — confirm before hoisting to module level.
    from training.dashboard.producers.metrics import _build_test_set

    max_rows = 4096  # upper bound on rows used for scoring and timing
    wanted_sizes = {1, 8, 64, 512}
    if batch_for_scatter > 0:
        # Without this, a batch_for_scatter outside the fixed set would never
        # be timed and `latency_us` would silently report another batch size.
        wanted_sizes.add(batch_for_scatter)
    batch_sizes = tuple(sorted(wanted_sizes))

    models = load_models(artifacts_dir)
    if not models:
        log.warning("no models found under %s", artifacts_dir)
        return 0

    n = 0
    for m in models:
        try:
            Xte, yte = _build_test_set(
                m, validation_path=validation_path,
                summary_path=summary_path, tensors_root=tensors_root,
                split_recipe="host", train_hosts=["elliott-thinkpad"],
            )
        except Exception as e:
            # Best effort per model: one unusable test set must not stop the
            # remaining models from being reported.
            log.warning("test set build failed for %s: %s",
                        m.__model_name__, e)
            continue

        # len() rather than truthiness: yte is sliced like an array below, and
        # array truthiness is ambiguous for more than one element.
        if len(yte) == 0:
            log.warning("empty test set for %s; skipping", m.__model_name__)
            continue

        # Sub-sample to bound runtime on perf bench.
        # NOTE(review): this keeps the first rows rather than a random sample;
        # if the test set is ordered by class or host the macro F1 may be
        # skewed — confirm the ordering produced by _build_test_set.
        if Xte.shape[0] > max_rows:
            Xte = Xte[:max_rows]
            yte = yte[:max_rows]

        y_pred = m.predict(Xte)
        acc = _macro_f1(yte, y_pred, m.n_classes)
        lat = latency_us_batched(m, Xte, batch_sizes=batch_sizes, n_iter=100)

        # Pick the latency shown on the scatter plot.
        if batch_for_scatter in lat:
            primary = lat[batch_for_scatter]
        elif lat:
            primary = lat[min(lat)]
        else:
            primary = 0.0

        log.info("%s acc=%.4f lat[1]=%.1fus lat[64]=%.1fus lat[512]=%.1fus",
                 m.__model_name__, acc,
                 lat.get(1, 0), lat.get(64, 0), lat.get(512, 0))

        event = {
            "type": "model_perf",
            "model": m.__model_name__,
            "latency_us": primary,
            "accuracy": acc,
            "latency_us_by_batch": lat,
        }
        # The module docstring promises single-window timing under
        # `latency_us_b1`; publish it whenever batch size 1 was measured.
        if 1 in lat:
            event["latency_us_b1"] = lat[1]

        await publish(event)
        n += 1
    return n
|
|
|
|
|
|
async def _run(args: argparse.Namespace) -> int:
    """Publish the perf events once, then replay them on a timer.

    Logging is configured at INFO level. Events go to a null publisher when
    ``args.dry_run`` is set, otherwise to an HTTP publisher pointed at
    ``args.publish_url``. Every event produced by ``emit_perf`` is recorded so
    it can be sent again every ``args.interval`` seconds.

    Returns:
        ``0`` when the interval is not positive or nothing was published.
        Otherwise the replay loop runs until the task is cancelled or a
        publish call raises.
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=logging.INFO,
    )

    if args.dry_run:
        sink = null_publisher()
    else:
        sink = http_publisher(args.publish_url)

    # Every event handed to the sink, kept in order for later replays.
    sent: list[dict] = []

    async def record_and_forward(event: dict) -> None:
        sent.append(event)
        await sink(event)

    await emit_perf(
        publish=record_and_forward,
        artifacts_dir=args.artifacts,
        validation_path=args.validation,
        summary_path=args.summary,
        tensors_root=args.tensors,
    )

    nothing_to_replay = not sent
    if nothing_to_replay or args.interval <= 0:
        return 0

    # Replay the recorded events on every tick.
    while True:
        await asyncio.sleep(args.interval)
        for event in sent:
            await sink(event)
|
|
|
|
|
|
def main() -> int:
    """Parse the command-line options and run the producer.

    Returns:
        The value returned by ``_run``, for use as the process exit status.
    """
    # (flag, add_argument keyword arguments), in the order shown by --help.
    options = (
        ("--validation", {"required": True, "type": Path}),
        ("--artifacts", {"type": Path, "default": Path("artifacts")}),
        ("--summary", {"type": Path, "default": None}),
        ("--tensors", {"type": Path, "default": None}),
        ("--publish-url", {"default": "http://127.0.0.1:8447/publish"}),
        ("--interval", {"type": float, "default": 30.0}),
        ("--dry-run", {"action": "store_true"}),
    )

    parser = argparse.ArgumentParser()
    for flag, spec in options:
        parser.add_argument(flag, **spec)

    namespace = parser.parse_args()
    exit_code = asyncio.run(_run(namespace))
    return exit_code
|
|
|
|
|
|
# Script entry point: exit the process with main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
|