CIS490/training/producers/multi_model_metrics.py

"""Pi-safe multi-model metrics publisher.

Publishes:

  - ``ModelMetric`` (scene 9 / "models") — held-out-by-sample macro-F1
    per canonical model name (rnn, gru, lstm, bert, knn).
  - ``ModelPerf`` (scene 12 / "perf") — observed median latency
    (μs/window) paired with the same F1 per canonical name.

Source of F1 numbers
====================
We read ``reports/eval/<family>_<mode>_{train,eval}.json`` files. Each
file has a ``split_recipe`` field plus ``test_macro_f1``. The dashboard
contract for these scenes is **held-out-by-sample** (recipe = "sample"
in our codebase, also called "oracle" mode); the bar widget's
``(accuracy − 0.5) / 0.5`` visible scale is calibrated for the high-F1
range that recipe produces.

Order of preference per family (the first file with a usable score
wins):

  1. ``<family>_oracle_train.json``    (trainer-time, sample split)
  2. ``<family>_eval.json``            (recipe set by --split-recipe)
  3. ``<family>_realistic_train.json`` (cross-host fallback)

If only realistic is available we publish it anyway — better an honest
low bar than no bar at all — but the file the trainer should have
written for scene 9 is the oracle one.

Canonical-name contract
=======================
The dashboard's :class:`Model` literal is ``{rnn, gru, lstm, bert,
knn}`` and the bar widget's CSS palette is keyed off those exact
strings (``.model-fill.lstm``, ``.model-fill.gru``, etc.). We collapse
our zoo as follows:

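    gru   ←  gru_*
    lstm  ←  lstm_*
    bert  ←  transformer_*       (BERT-style transformer encoder)

``gbt``, ``mlp``, ``cnn`` and ``knn_semi`` are published under their
own family names. ``knn`` is not published by this producer — see the
comment on ``CANONICAL_TO_FAMILY``.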

We don't have a vanilla RNN trained, so ``rnn`` is never published —
the bar widget skips that bar, which is the correct behaviour.

Why not the existing ``training.producers.metrics``
==================================================
That producer iterates checkpoints with :func:`load_models` and re-
scores the test set every cycle. On the Pi (8 GiB ARM) the KNN
checkpoints alone (~300 MB pickle each, six variants) plus the test-
set tensor cache exceed RAM and OOM-killed the host. See
``feedback_no_heavy_pi_inference.md`` in the user's auto-memory. This
producer reads small JSON files instead — no checkpoint loading.
"""
from __future__ import annotations

import argparse
import asyncio
import json
import logging
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from training.dashboard.events import ModelMetric, ModelPerf, Publisher


# Module-level logger. The dotted name is spelled out explicitly rather
# than derived from ``__name__``.
log = logging.getLogger("cis490.producers.multi_model_metrics")


# Rough per-window inference cost in microseconds (amortised over a
# batch of 64), keyed by canonical model name. These are order-of-
# magnitude guesses based on sklearn / torch behaviour on similar
# shapes: good enough for a live demo's perf scatter, but they should
# be re-benchmarked on the actual deployment hardware before being
# quoted in a paper.
LATENCY_ESTIMATES_US = dict(
    rnn=1500.0,
    gru=1500.0,
    lstm=2000.0,
    bert=800.0,
    knn=3500.0,
)


# Maps the name shown on the bar widget to the checkpoint family whose
# reports we read. Every trained model gets an entry so scene 9 shows
# the whole zoo rather than only the dashboard's canonical ``Model``
# literal names ({rnn, gru, lstm, bert, knn}). A name outside that set
# still renders as a row with its label and numeric F1; it simply has
# no CSS fill colour, so the bar track stays transparent. Guidance
# from the dashboard chat: "Other strings work but won't get a colored
# fill class without a CSS update."
#
# ``knn`` is deliberately left out: the separate KNN stream producer
# under ``training.producers`` (module presumably ``knn_stream`` —
# TODO confirm the exact name) already emits
# ``ModelMetric{model: 'knn'}`` and ``ModelPerf{model: 'knn'}`` on its
# own cycle, and two writers on the same name would flicker.
CANONICAL_TO_FAMILY = dict(
    gbt="gbt",
    mlp="mlp",
    cnn="cnn",
    knn_semi="knn_semi",
    gru="gru",
    lstm="lstm",
    bert="transformer",
)


# Latency-per-window estimates in microseconds per family, batch=64
# amortised. Order-of-magnitude only — proper benchmarks need to run
# on the deployment hardware. Indicative enough for scene 12's
# log-scaled axis.
#
# ``emit_once`` looks rows up by *family* (the values of
# ``CANONICAL_TO_FAMILY``), so every family needs a row here. The
# ``transformer`` row carries the same figure as ``bert``, the bar name
# that family is published under; without it the lookup for that
# family missed and fell back to the generic default. The rows keyed
# by bar name (``bert``, ``rnn``, ``knn``) are kept for any caller that
# still uses those keys.
LATENCY_PER_FAMILY_US = {
    "gbt":            250.0,
    "mlp":             50.0,
    "cnn":            500.0,
    "knn":           3500.0,
    "knn_semi":      3500.0,
    "rnn":           1500.0,
    "gru":           1500.0,
    "lstm":          2000.0,
    "bert":           800.0,
    "transformer":    800.0,
}


def _read_json(path: Path) -> dict | None:
    """Load one report file and return its top-level JSON object.

    Best-effort: every failure is logged at WARNING with the file name
    and reported as ``None`` so the caller can move on to the next
    candidate file instead of crashing the publish loop.

    Args:
        path: Report file to read. Decoded as UTF-8 regardless of the
            host locale.

    Returns:
        The parsed JSON object, or ``None`` when the file cannot be
        read, is not valid UTF-8 / JSON, or its top-level value is not
        a JSON object.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and
        # UnicodeDecodeError (e.g. a half-written or corrupt file).
        log.warning("could not read %s: %s", path.name, e)
        return None
    if not isinstance(payload, dict):
        # Honour the ``dict | None`` contract: a list or scalar at the
        # top level is not a report we know how to interpret.
        log.warning("could not read %s: %s", path.name,
                    "top-level JSON value is not an object")
        return None
    return payload


def _as_score(value: object) -> float | None:
    """Return *value* as a finite float if it is a real number, else ``None``."""
    # bool is a subclass of int, but a JSON ``true``/``false`` is not a score.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    try:
        score = float(value)
    except OverflowError:
        # An integer too large for a float is not a usable score either.
        return None
    # False for NaN and for +/-inf, which json.loads accepts as numbers.
    if not abs(score) < float("inf"):
        return None
    return score


def _extract_f1(d: dict) -> float | None:
    """Pull a scalar test_macro_f1 from one of two known shapes.

    - ``training.trainer.run`` writes ``test_macro_f1`` flat.
    - ``training.eval_.run`` writes ``macro_f1: {point, low, high}``
      and the family name only (no oracle/realistic suffix), so the
      filename carries the mode if at all.

    A bare numeric ``macro_f1`` is accepted as well. The flat
    ``test_macro_f1`` wins when both are usable.

    Args:
        d: Parsed report payload.

    Returns:
        The score as a float, or ``None`` when no recognised field
        holds a finite number. Malformed payloads (non-dict input,
        ``null`` / string / boolean / NaN values, a ``macro_f1`` dict
        without ``point``) yield ``None`` rather than raising, so one
        bad report cannot take down the publish loop.
    """
    if not isinstance(d, dict):
        return None
    flat = _as_score(d.get("test_macro_f1"))
    if flat is not None:
        return flat
    mf1 = d.get("macro_f1")
    if isinstance(mf1, dict):
        return _as_score(mf1.get("point"))
    return _as_score(mf1)


def _best_f1_for_family(reports_dir: Path, family: str) -> tuple[float, str] | None:
    """Find the most-preferred report for *family* that yields a score.

    Report files are tried in a fixed order and the first one that
    exists, parses, and contains a usable F1 wins:

    1. ``<family>_oracle_train.json``    — trainer-time, sample split
    2. ``<family>_eval.json``            — eval_/run.py output, recipe
                                           set by --split-recipe
    3. ``<family>_realistic_train.json`` — cross-host fallback

    Args:
        reports_dir: Directory holding the report JSON files.
        family: Checkpoint family name used as the filename prefix.

    Returns:
        ``(f1, source_label)`` where ``source_label`` names the file
        kind the score came from (``"oracle_train"``, ``"eval"`` or
        ``"realistic_train"``), or ``None`` if no candidate file has a
        usable score.
    """
    # Insertion order is the precedence order.
    suffix_by_label = {
        "oracle_train": "_oracle_train.json",
        "eval": "_eval.json",
        "realistic_train": "_realistic_train.json",
    }
    for source_label, suffix in suffix_by_label.items():
        report = reports_dir / f"{family}{suffix}"
        if report.exists():
            payload = _read_json(report)
            score = None if payload is None else _extract_f1(payload)
            if score is not None:
                return score, source_label
    return None


def emit_once(*, publisher: Publisher, reports_dir: Path) -> int:
    """Publish one ``ModelMetric`` + ``ModelPerf`` pair per scored bar.

    Walks ``CANONICAL_TO_FAMILY``, looks up the best available F1 for
    each family under *reports_dir*, and publishes both events under
    the bar name. Bars with no usable score are skipped with an INFO
    log line. A publish failure for one bar is logged at WARNING and
    does not stop the remaining bars.

    Args:
        publisher: Object whose ``publish`` method receives each event.
        reports_dir: Directory holding the report JSON files.

    Returns:
        The number of bars for which both events were published.
    """
    n = 0
    for bar_name, family in CANONICAL_TO_FAMILY.items():
        result = _best_f1_for_family(reports_dir, family)
        if result is None:
            log.info("no F1 yet for %s (family=%s) — skipping",
                     bar_name, family)
            continue
        f1, source = result
        # The latency table is nominally keyed by family, but some rows
        # are filed under the bar name instead (e.g. ``bert`` for the
        # ``transformer`` family). Try the family first, then the bar
        # name, and only then the generic default.
        latency = float(LATENCY_PER_FAMILY_US.get(
            family, LATENCY_PER_FAMILY_US.get(bar_name, 1000.0)))
        try:
            publisher.publish(ModelMetric(
                model=bar_name, accuracy=f1))
            publisher.publish(ModelPerf(
                model=bar_name, latency_us=latency, accuracy=f1))
        except Exception as e:
            # Deliberately broad: this is the best-effort boundary of a
            # long-running producer, so one failing bar must not abort
            # the cycle.
            log.warning("publish failed for %s: %s", bar_name, e)
        else:
            n += 1
            log.debug("%s: F1=%.4f latency=%.0fus (from %s)",
                      bar_name, f1, latency, source)
    log.info("published %d (model_metric + model_perf) pairs", n)
    return n


async def _run(args) -> int:
    """Drive the publish loop described by the parsed CLI *args*.

    Publishes once immediately. When ``args.interval`` is zero or
    negative that single pass is all that happens; otherwise the pass
    is repeated forever with ``args.interval`` seconds of sleep between
    passes.

    Returns:
        ``0`` after a one-shot run (the periodic mode does not return).
    """
    sink = Publisher(url=args.publish_url)
    emit_once(publisher=sink, reports_dir=args.reports_dir)
    while not (args.interval <= 0):
        await asyncio.sleep(args.interval)
        emit_once(publisher=sink, reports_dir=args.reports_dir)
    return 0


def main() -> int:
    """Command-line entry point: parse flags, set up logging, run the loop.

    Flags:
        --reports-dir: Directory with the report JSON files
            (default ``reports/eval``).
        --publish-url: Endpoint handed to the ``Publisher``.
        --interval: Re-publish period in seconds; ``0`` means one-shot.
        --log-level: Logging level name; matched case-insensitively.

    Returns:
        The exit status produced by the publish loop.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--reports-dir", type=Path,
                    default=Path("reports/eval"))
    ap.add_argument("--publish-url",
                    default="http://127.0.0.1:8447/publish")
    ap.add_argument("--interval", type=float, default=5.0,
                    help="re-publish period (s); 0 = one-shot. "
                         "Kept short so a fresh page-load sees populated "
                         "bars/scatter within a few seconds. The dashboard "
                         "broadcaster does not replay events to new "
                         "connections by default — see "
                         "docs/dashboard-request-sticky-cache.md.")
    ap.add_argument("--log-level", default="INFO")
    args = ap.parse_args()
    logging.basicConfig(
        # logging only recognises upper-case level names; normalise so
        # ``--log-level debug`` works instead of raising ValueError.
        level=str(args.log_level).upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s")
    return asyncio.run(_run(args))


# Script entry point: run ``main`` and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())