CIS490/training/dashboard/producers/metrics.py

"""Emit `model_metric` events for the dashboard's accuracy bars.

Loads every checkpoint via the schema-hashed loader, scores each on
the held-out test split (held-out-by-host by default), publishes one
``model_metric`` per model. Re-publishes on a tick so a browser
opening 30s after a one-shot run still sees populated bars.

Note: the dashboard's CSS styles bars by exact model name
(`rnn|gru|lstm|bert`). Our names are e.g. `gbt_realistic`, which match
none of those, so our bars render with a default color.
The accuracy reported is **macro-F1** under the realistic-vs-oracle
split that the model was trained for — *not* plain accuracy. We
publish under the existing `accuracy` key so the dashboard JS doesn't
need a frontend change; macro-F1 is the metric we actually care about.
"""
from __future__ import annotations

import argparse
import asyncio
import json
import logging
import sys
from pathlib import Path

import numpy as np
import pyarrow.parquet as pq

sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
from training._split import (
    held_out_host, held_out_sample, held_out_time,
)
from training.dashboard.producers._models import load_models
from training.dashboard.producers._publish import (
    PublishFn, http_publisher, null_publisher,
)
from training.eval_._metrics import _macro_f1
from training.models import BaseModel


log = logging.getLogger("cis490.dashboard.producers.metrics")


def _build_test_set(
    model: BaseModel, *,
    validation_path: Path,
    summary_path: Path | None,
    tensors_root: Path | None,
    split_recipe: str,
    train_hosts: list[str],
) -> tuple[np.ndarray, np.ndarray]:
    """Return ``(X_test, y_test)`` for the dataset matching ``model.input_kind``.

    The validation parquet is read, rows whose ``status`` is neither
    ``"accepted"`` nor ``"degraded"`` are dropped, and the remaining rows
    are handed to one of the ``held_out_*`` helpers chosen by
    ``split_recipe`` (``"host"``, ``"sample"``, anything else -> time).
    Episode ids flagged in the resulting ``.test`` sequence select the
    rows of the summary dataset (``input_kind == "summary"``) or of the
    tensor dataset (any other kind).

    Raises ``ValueError`` when the path needed for the model's input
    kind (``summary_path`` / ``tensors_root``) is ``None``.
    """
    usable_statuses = ("accepted", "degraded")
    kept = [row for row in pq.read_table(validation_path).to_pylist()
            if row["status"] in usable_statuses]

    profiles = [row["profile"] for row in kept]
    sample_names = [row["sample_name"] for row in kept]
    host_ids = [row["host_id"] for row in kept]
    episode_ids = [row["episode_id"] for row in kept]
    received_at = [row.get("received_at_wall", "") for row in kept]

    # Arguments every split recipe takes; the recipe-specific ones are
    # added at the call site.
    shared = dict(profiles=profiles, sample_names=sample_names,
                  host_ids=host_ids, seed=0)
    if split_recipe == "host":
        partition = held_out_host(episode_ids=episode_ids,
                                  train_hosts=train_hosts, **shared)
    elif split_recipe == "sample":
        partition = held_out_sample(**shared)
    else:
        partition = held_out_time(received_at=received_at, **shared)

    test_eps = {ep for idx, ep in enumerate(episode_ids)
                if partition.test[idx]}

    if model.input_kind == "summary":
        if summary_path is None:
            raise ValueError("--summary required for summary model")
        from training.trainer._data import load_summary
        # The feature schema is assumed to sit next to the summary file.
        dataset = load_summary(summary_path,
                               summary_path.parent / "feature_schema_v1.json")
    else:
        if tensors_root is None:
            raise ValueError("--tensors required for tensor model")
        from training.trainer._data import load_tensor
        dataset = load_tensor(tensors_root)

    # Boolean row mask: keep dataset rows whose episode is in the test split.
    in_test = np.array([ep in test_eps for ep in dataset.episode_id],
                       dtype=bool)
    return dataset.X[in_test], dataset.y[in_test]


async def emit_metrics(*, publish: PublishFn, artifacts_dir: Path,
                        validation_path: Path,
                        summary_path: Path | None,
                        tensors_root: Path | None,
                        split_recipe: str,
                        train_hosts: list[str]) -> int:
    """Score every loaded model on its test set and publish one event each.

    Models are loaded from ``artifacts_dir``. For each model the test
    set is built with ``_build_test_set``, predictions are scored with
    ``_macro_f1``, and a ``model_metric`` event is awaited on
    ``publish``. A model whose test set cannot be built, or is empty,
    is logged and skipped; errors from ``predict``, ``_macro_f1`` or
    ``publish`` propagate to the caller.

    Test sets are built once per ``input_kind`` and shared between the
    models of that kind within a single call.
    NOTE(review): sharing assumes ``predict`` does not modify its input
    in place -- confirm for new model types.

    Returns the number of events published (0 when no models are found).
    """
    models = load_models(artifacts_dir)
    if not models:
        log.warning("no models found under %s", artifacts_dir)
        return 0
    # _build_test_set uses the model only through `input_kind`, so one
    # build per kind is enough for the whole pass; rebuilding per model
    # would re-read the validation parquet and the dataset every time.
    # Failed builds are not cached, so each affected model still gets
    # its own warning.
    test_sets: dict[str, tuple[np.ndarray, np.ndarray]] = {}
    n = 0
    for m in models:
        try:
            kind = m.input_kind
            if kind not in test_sets:
                test_sets[kind] = _build_test_set(
                    m, validation_path=validation_path,
                    summary_path=summary_path, tensors_root=tensors_root,
                    split_recipe=split_recipe, train_hosts=train_hosts,
                )
            Xte, yte = test_sets[kind]
        except Exception as e:
            log.warning("test set build failed for %s: %s",
                        m.__model_name__, e)
            continue
        if len(yte) == 0:
            log.warning("empty test set for %s; skipping", m.__model_name__)
            continue
        y_pred = m.predict(Xte)
        # Coerce to a builtin float: a NumPy scalar that is not a `float`
        # subclass (e.g. float32) would not survive JSON encoding in the
        # publisher.
        f1 = float(_macro_f1(yte, y_pred, m.n_classes))
        log.info("%s test_macro_f1=%.4f (n=%d)", m.__model_name__, f1, len(yte))
        # `accuracy` key for the dashboard's existing bar widget; the
        # value is macro-F1 in our project.
        await publish({
            "type": "model_metric",
            "model": m.__model_name__,
            "accuracy": f1,
        })
        n += 1
    return n


async def _run(args: argparse.Namespace) -> int:
    """Configure logging, then publish metrics once or on a fixed tick.

    With ``args.dry_run`` the null publisher is used, otherwise an HTTP
    publisher for ``args.publish_url``. After each ``emit_metrics`` pass
    the coroutine returns 0 if ``args.interval <= 0``; otherwise it
    sleeps ``args.interval`` seconds and runs another pass.
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=logging.INFO,
    )
    if args.dry_run:
        publisher = null_publisher()
    else:
        publisher = http_publisher(args.publish_url)

    emit_kwargs = dict(
        publish=publisher,
        artifacts_dir=args.artifacts,
        validation_path=args.validation,
        summary_path=args.summary,
        tensors_root=args.tensors,
        split_recipe=args.split_recipe,
        train_hosts=args.train_hosts,
    )
    while True:
        await emit_metrics(**emit_kwargs)
        delay = args.interval
        if delay <= 0:
            # Non-positive interval means one-shot mode.
            break
        await asyncio.sleep(delay)
    return 0


def main() -> int:
    """Parse the command line and drive ``_run`` to completion.

    Returns whatever ``_run`` returns, for use as the process exit code.
    """
    cli = argparse.ArgumentParser()
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ("--validation", {"required": True, "type": Path}),
        ("--artifacts", {"type": Path, "default": Path("artifacts")}),
        ("--summary", {"type": Path, "default": None}),
        ("--tensors", {"type": Path, "default": None}),
        ("--split-recipe", {"choices": ["host", "sample", "time"],
                            "default": "host"}),
        ("--train-hosts", {"nargs": "+", "default": ["elliott-thinkpad"]}),
        ("--publish-url", {"default": "http://127.0.0.1:8447/publish"}),
        ("--interval", {"type": float, "default": 20.0}),
        ("--dry-run", {"action": "store_true"}),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return asyncio.run(_run(cli.parse_args()))


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    sys.exit(main())