- multi_model_metrics: publish gbt / mlp / cnn / knn_semi / gru / lstm / bert (knn handled by knn streamer); read both *_train.json and *_eval.json with macro_f1.point fallback - dashboard.css: add palette gradients for the four non-canonical names so the bars render with a fill colour - dashboard.js: open the bar's visible scale to the full 0–1 range so honest-low cross-host F1s show as a bar instead of clamping to 0% - ship lambda-live-detection-loop.py + dashboard request docs (scenes 7/8/12, sticky cache, lambda-inference-demo) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
237 lines
8.4 KiB
Python
237 lines
8.4 KiB
Python
"""Pi-safe multi-model metrics publisher.
|
||
|
||
Publishes:
|
||
|
||
- ``ModelMetric`` (scene 9 / "models") — macro-F1 per bar name (the
  keys of ``CANONICAL_TO_FAMILY``: gbt, mlp, cnn, knn_semi, gru, lstm,
  bert), held-out-by-sample when an oracle report is available.
- ``ModelPerf``   (scene 12 / "perf") — estimated latency
  (μs/window, from a hard-coded table) paired with the same F1 per
  bar name.
|
||
|
||
Source of F1 numbers
|
||
====================
|
||
We read ``reports/eval/<family>_<mode>_{train,eval}.json`` files. Each
|
||
file has a ``split_recipe`` field plus ``test_macro_f1``. The dashboard
|
||
contract for these scenes is **held-out-by-sample** (recipe = "sample"
|
||
in our codebase, also called "oracle" mode); the bar widget's
|
||
``(accuracy − 0.5) / 0.5`` visible scale is calibrated for the high-F1
|
||
range that recipe produces.
|
||
|
||
Order of preference per family (the first file with a usable score
wins):

  1. ``<family>_oracle_train.json``     (trainer output, sample split)
  2. ``<family>_eval.json``             (eval output; recipe set by
                                         ``--split-recipe``)
  3. ``<family>_realistic_train.json``  (cross-host fallback)

If only realistic is available we publish it anyway — better an honest
low bar than no bar at all — but the file the trainer should have
written for scene 9 is the oracle one.
|
||
|
||
Canonical-name contract
|
||
=======================
|
||
The dashboard's :class:`Model` literal is ``{rnn, gru, lstm, bert,
knn}`` and the bar widget's CSS palette is keyed off those exact
strings (``.model-fill.lstm``, ``.model-fill.gru``, etc.). We map our
zoo onto bar names as follows (see ``CANONICAL_TO_FAMILY``):

    gru   ← gru_*
    lstm  ← lstm_*
    bert  ← transformer_*  (BERT-style transformer encoder)
    gbt, mlp, cnn, knn_semi ← the same-named families (these bar names
                              are outside the canonical literal)

``knn`` is not published by this module: the KNN stream producer
already emits ``ModelMetric`` / ``ModelPerf`` for it, and two writers
on the same name would flicker.

We don't have a vanilla RNN trained, so ``rnn`` is never published —
the bar widget skips that bar, which is the correct behaviour.
|
||
|
||
Why not the existing ``training.producers.metrics``
|
||
==================================================
|
||
That producer iterates checkpoints with :func:`load_models` and re-
|
||
scores the test set every cycle. On the Pi (8 GiB ARM) the KNN
|
||
checkpoints alone (~300 MB pickle each, six variants) plus the test-
|
||
set tensor cache exceed RAM and OOM-killed the host. See
|
||
``feedback_no_heavy_pi_inference.md`` in the user's auto-memory. This
|
||
producer reads small JSON files instead — no checkpoint loading.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import asyncio
|
||
import json
|
||
import logging
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
||
from training.dashboard.events import ModelMetric, ModelPerf, Publisher
|
||
|
||
|
||
# Module-level logger used by the helpers and the publish loop in this
# module; the name is fixed rather than derived from ``__name__``.
log = logging.getLogger("cis490.producers.multi_model_metrics")
|
||
|
||
|
||
# Estimated inference latency in microseconds per window (batch=64,
# amortized) for the dashboard's canonical model names. These are
# order-of-magnitude figures taken from sklearn / torch runs on similar
# shapes — fine for a live demo's perf scatter, but they should be
# re-benchmarked on the real deployment hardware before being quoted
# in a paper.
#
# NOTE(review): the code visible in this module reads
# ``LATENCY_PER_FAMILY_US`` instead; this table is kept in case other
# modules import it.
LATENCY_ESTIMATES_US = dict(
    rnn=1500.0,
    gru=1500.0,
    lstm=2000.0,
    bert=800.0,
    knn=3500.0,
)
|
||
|
||
|
||
# Maps each bar name shown by the dashboard to the trained-checkpoint
# family whose report files supply its F1. Every trained model is
# listed so scene 9 shows the whole zoo rather than only the names in
# the dashboard's canonical ``Model`` literal ({rnn, gru, lstm, bert,
# knn}).
#
# A name outside that canonical set still gets a row with its model
# name and numeric F1; it only gets a coloured bar fill if the
# dashboard stylesheet defines a class for it. The dashboard chat's
# guidance was: "Other strings work but won't get a colored fill class
# without a CSS update."
#
# ``knn`` is deliberately left out: the KNN stream producer under
# ``training.producers`` already publishes ``ModelMetric{model: 'knn'}``
# and ``ModelPerf{model: 'knn'}`` on its own cycle, and two writers on
# the same name would make the bar flicker.
#
# Every bar name equals its family name except ``bert``, which is backed
# by the ``transformer`` family. Insertion order is the publish order.
CANONICAL_TO_FAMILY = {
    **{name: name for name in ("gbt", "mlp", "cnn", "knn_semi", "gru", "lstm")},
    "bert": "transformer",
}
|
||
|
||
|
||
# Estimated latency in microseconds per window (batch=64, amortised).
# Order-of-magnitude only — proper benchmarks need to run on the
# deployment hardware. Indicative enough for scene 12's log-scaled
# axis.
#
# The table is looked up by checkpoint *family* (the values of
# ``CANONICAL_TO_FAMILY``), so the ``transformer`` family needs its own
# entry. Without it the ``bert`` bar silently fell back to the lookup
# default instead of the 800 µs estimate. The bar-name keys (``bert``,
# ``knn``, ``rnn``) are kept so lookups by bar name keep working.
LATENCY_PER_FAMILY_US = {
    "gbt": 250.0,
    "mlp": 50.0,
    "cnn": 500.0,
    "knn": 3500.0,
    "knn_semi": 3500.0,
    "rnn": 1500.0,
    "gru": 1500.0,
    "lstm": 2000.0,
    "bert": 800.0,
    # Same model as "bert", keyed by its family name.
    "transformer": 800.0,
}
|
||
|
||
|
||
def _read_json(path: Path) -> dict | None:
    """Load one report file and return its top-level JSON object.

    Args:
        path: Location of the JSON report to read.

    Returns:
        The parsed object as a ``dict``, or ``None`` when the file cannot
        be read, is not valid UTF-8 / JSON, or its top-level value is not
        an object. Every failure is logged as a warning and never raised,
        so one bad report cannot stop the publish loop.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for a file that is not valid UTF-8.
        log.warning("could not read %s: %s", path.name, e)
        return None
    if not isinstance(payload, dict):
        # A bare list/number/string is valid JSON but not a report;
        # callers rely on getting a mapping back.
        log.warning("ignoring %s: top-level JSON value is %s, expected an object",
                    path.name, type(payload).__name__)
        return None
    return payload
|
||
|
||
|
||
def _as_score(value: object) -> float | None:
    """Return *value* as a finite float, or ``None`` if it is not a usable score."""
    import math  # local import: this module's top-level imports do not include math

    # bool is a subclass of int, but a JSON true/false is not a score.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    score = float(value)
    # json.loads accepts NaN / Infinity; neither is a meaningful F1.
    return score if math.isfinite(score) else None


def _extract_f1(d: dict) -> float | None:
    """Pull a scalar macro-F1 out of a parsed report.

    Recognised shapes, tried in this order:

    - ``test_macro_f1: <number>`` — flat, as written by
      ``training.trainer.run``.
    - ``macro_f1: {point, low, high}`` — as written by
      ``training.eval_.run``; only ``point`` is used.
    - ``macro_f1: <number>`` — bare scalar.

    Args:
        d: Parsed report content.

    Returns:
        The score as a ``float``, or ``None`` when no recognised field
        holds a real, finite number. Malformed values (``null``, strings,
        booleans, NaN/inf) never raise: an unusable ``test_macro_f1``
        falls through to ``macro_f1``, and an unusable ``macro_f1``
        yields ``None``.
    """
    if not isinstance(d, dict):
        return None
    flat = _as_score(d.get("test_macro_f1"))
    if flat is not None:
        return flat
    nested = d.get("macro_f1")
    if isinstance(nested, dict):
        return _as_score(nested.get("point"))
    return _as_score(nested)
|
||
|
||
|
||
def _best_f1_for_family(reports_dir: Path, family: str) -> tuple[float, str] | None:
    """Find the most-preferred usable macro-F1 report for ``family``.

    Report files are probed in ``reports_dir`` in this order and the
    first one that exists, parses, and yields a score wins:

      1. ``<family>_oracle_train.json``    — trainer-time, sample split
      2. ``<family>_eval.json``            — eval_/run.py output, recipe
                                             set by --split-recipe
      3. ``<family>_realistic_train.json`` — cross-host fallback

    Returns:
        ``(f1, source_label)`` where ``source_label`` is one of
        ``"oracle_train"``, ``"eval"`` or ``"realistic_train"``; ``None``
        if no candidate file produced a score.
    """
    precedence = (
        ("oracle_train", "_oracle_train.json"),
        ("eval", "_eval.json"),
        ("realistic_train", "_realistic_train.json"),
    )
    for source_label, suffix in precedence:
        report = reports_dir / f"{family}{suffix}"
        # Skip missing files up front so they are not logged as read failures.
        if not report.exists():
            continue
        payload = _read_json(report)
        score = None if payload is None else _extract_f1(payload)
        if score is not None:
            return score, source_label
    return None
|
||
|
||
|
||
def emit_once(*, publisher: Publisher, reports_dir: Path) -> int:
    """Publish one ``ModelMetric`` + ``ModelPerf`` pair per bar that has a score.

    Walks ``CANONICAL_TO_FAMILY`` in order. For each bar name it looks up
    the best available F1 for the backing family and pairs it with a
    latency estimate. Bars without a usable report are skipped with an
    info log.

    Args:
        publisher: Object whose ``publish(event)`` method delivers events.
        reports_dir: Directory holding the ``*_train.json`` /
            ``*_eval.json`` report files.

    Returns:
        Number of bars for which both events were published. A publish
        failure is logged as a warning and does not stop the remaining
        bars (best-effort by design).
    """
    published = 0
    for bar_name, family in CANONICAL_TO_FAMILY.items():
        result = _best_f1_for_family(reports_dir, family)
        if result is None:
            log.info("no F1 yet for %s (family=%s) — skipping",
                     bar_name, family)
            continue
        f1, source = result
        # Prefer the family key, then fall back to the bar name. The
        # table may carry an estimate under either (e.g. "bert" for the
        # "transformer" family); a family-only lookup silently dropped
        # such a bar to the generic 1000 µs default.
        latency = float(LATENCY_PER_FAMILY_US.get(
            family, LATENCY_PER_FAMILY_US.get(bar_name, 1000.0)))
        try:
            publisher.publish(ModelMetric(
                model=bar_name, accuracy=f1))
            publisher.publish(ModelPerf(
                model=bar_name, latency_us=latency, accuracy=f1))
        except Exception as e:
            # Deliberately broad: a failure on one bar must not stop the others.
            log.warning("publish failed for %s: %s", bar_name, e)
            continue
        published += 1
        log.debug("%s: F1=%.4f latency=%.0fus (from %s)",
                  bar_name, f1, latency, source)
    log.info("published %d (model_metric + model_perf) pairs", published)
    return published
|
||
|
||
|
||
async def _run(args) -> int:
    """Drive the publish cycle: once, or repeatedly every ``args.interval`` s.

    Reads ``publish_url``, ``reports_dir`` and ``interval`` from ``args``.
    One cycle always runs. With ``interval <= 0`` the coroutine then
    returns ``0`` (one-shot); otherwise it sleeps ``interval`` seconds
    and publishes again, indefinitely.
    """
    sink = Publisher(url=args.publish_url)
    emit_once(publisher=sink, reports_dir=args.reports_dir)
    while not (args.interval <= 0):
        await asyncio.sleep(args.interval)
        emit_once(publisher=sink, reports_dir=args.reports_dir)
    return 0
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> int:
    """Parse command-line options, configure logging and run the publisher.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script
            entry point uses.

    Returns:
        The value returned by the publish loop (``0`` for a one-shot run).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--reports-dir", type=Path,
                    default=Path("reports/eval"))
    ap.add_argument("--publish-url",
                    default="http://127.0.0.1:8447/publish")
    ap.add_argument("--interval", type=float, default=5.0,
                    help="re-publish period (s); 0 = one-shot. "
                         "Kept short so a fresh page-load sees populated "
                         "bars/scatter within a few seconds. The dashboard "
                         "broadcaster does not replay events to new "
                         "connections by default — see "
                         "docs/dashboard-request-sticky-cache.md.")
    ap.add_argument("--log-level", default="INFO")
    args = ap.parse_args(argv)
    logging.basicConfig(
        # logging only recognises upper-case level names, so normalise
        # here; otherwise ``--log-level debug`` aborts with ValueError.
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s")
    return asyncio.run(_run(args))
|
||
|
||
|
||
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())
|