CIS490/training/producers/metrics.py
Max 697e36a315 training/producers: move out of dashboard/ per ownership boundary
Producers are event *sources* — the renderer is everything inside
training/dashboard/. Sibling layout makes the dependency direction
one-way (producers import from training.dashboard.events; dashboard
never reaches into producers).

  training/dashboard/producers/   →   training/producers/

Internal imports rewritten via sed; eval_/run.py and training/README.md
cross-references updated. CLI entry stays via `python -m training.producers.<sub>`
(replay / metrics / perf / profiles).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 12:06:56 -05:00

159 lines
6.3 KiB
Python

"""Emit `model_metric` events for the dashboard's accuracy bars.
Loads every checkpoint via the schema-hashed loader, scores each on
the held-out test split (held-out-by-host by default), publishes one
``model_metric`` per model. Re-publishes on a tick so a browser
opening 30s after a one-shot run still sees populated bars.
Note: the dashboard's CSS styles bars by exact name (`rnn|gru|lstm|bert`).
Our names are e.g. `gbt_realistic`, so our bars render with a default color.
The accuracy reported is **macro-F1** under the realistic-vs-oracle
split that the model was trained for — *not* plain accuracy. We
publish under the existing `accuracy` key so the dashboard JS doesn't
need a frontend change; macro-F1 is the metric we actually care about.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import sys
from pathlib import Path
import numpy as np
import pyarrow.parquet as pq
# Put the directory that contains the ``training`` package on sys.path so the
# absolute ``training.*`` imports below resolve when this file is run directly.
# This file lives at <root>/training/producers/metrics.py, so <root> is
# parents[2] (producers -> training -> <root>); parents[3] was only correct
# while the file still lived one level deeper, under training/dashboard/.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from training._split import (
held_out_host, held_out_sample, held_out_time,
)
from training.producers._models import load_models
from training.producers._publish import (
PublishFn, http_publisher, null_publisher,
)
from training.eval_._metrics import _macro_f1
from training.models import BaseModel
log = logging.getLogger("cis490.dashboard.producers.metrics")
def _build_test_set(model: BaseModel, *, validation_path: Path,
                    summary_path: Path | None,
                    tensors_root: Path | None,
                    split_recipe: str, train_hosts: list[str]
                    ) -> tuple[np.ndarray, np.ndarray]:
    """Return ``(X_test, y_test)`` for the given model's input kind.

    The validation table is read from ``validation_path`` and reduced to rows
    whose ``status`` is ``"accepted"`` or ``"degraded"``. Those rows are split
    with the helper selected by ``split_recipe``; the episode ids flagged as
    test rows are then used to mask the feature data loaded for the model.

    Args:
        model: Model to build the test set for; only ``model.input_kind`` is
            read (``"summary"`` selects the summary data, anything else the
            tensor data).
        validation_path: Parquet file holding the per-episode validation rows.
        summary_path: Summary feature file; required for summary models. The
            schema is assumed to sit next to it as ``feature_schema_v1.json``.
        tensors_root: Root passed to ``load_tensor``; required for
            non-summary models.
        split_recipe: ``"host"`` or ``"sample"``; any other value falls
            through to the time-based split.
        train_hosts: Hosts passed to the host split as its training hosts
            (only used when ``split_recipe == "host"``).

    Returns:
        The ``X`` and ``y`` arrays of the loaded data, restricted to episodes
        that landed in the test split.

    Raises:
        ValueError: If the path needed for the model's input kind is ``None``.
    """
    # Only episodes that passed validation take part in the split.
    rows = [r for r in pq.read_table(validation_path).to_pylist()
            if r["status"] in ("accepted", "degraded")]
    profs = [r["profile"] for r in rows]
    samples = [r["sample_name"] for r in rows]
    hosts = [r["host_id"] for r in rows]
    epi_ids = [r["episode_id"] for r in rows]

    if split_recipe == "host":
        splits = held_out_host(profiles=profs, sample_names=samples,
                               host_ids=hosts, episode_ids=epi_ids,
                               train_hosts=train_hosts, seed=0)
    elif split_recipe == "sample":
        splits = held_out_sample(profiles=profs, sample_names=samples,
                                 host_ids=hosts, seed=0)
    else:
        # Receive timestamps are only needed by the time-based split; rows
        # without one contribute an empty string.
        recv = [r.get("received_at_wall", "") for r in rows]
        splits = held_out_time(profiles=profs, sample_names=samples,
                               host_ids=hosts, received_at=recv, seed=0)

    # ``splits.test`` is a per-row flag aligned with ``rows``; keep the ids
    # of the episodes it marks as test.
    test_eps = {epi for epi, in_test in zip(epi_ids, splits.test) if in_test}

    if model.input_kind == "summary":
        if summary_path is None:
            raise ValueError("--summary required for summary model")
        from training.trainer._data import load_summary
        # Need schema path; assume sibling
        schema_path = summary_path.parent / "feature_schema_v1.json"
        d = load_summary(summary_path, schema_path)
    else:
        if tensors_root is None:
            raise ValueError("--tensors required for tensor model")
        from training.trainer._data import load_tensor
        d = load_tensor(tensors_root)

    # Both loaders expose ``episode_id``, ``X`` and ``y``; select test rows.
    mask = np.array([e in test_eps for e in d.episode_id], dtype=bool)
    return d.X[mask], d.y[mask]
async def emit_metrics(*, publish: PublishFn, artifacts_dir: Path,
                       validation_path: Path,
                       summary_path: Path | None,
                       tensors_root: Path | None,
                       split_recipe: str,
                       train_hosts: list[str]) -> int:
    """Score every loaded model and publish one ``model_metric`` per model.

    Models are loaded from ``artifacts_dir``. For each one the held-out test
    set is built, predictions are scored with macro-F1, and the result is
    handed to ``publish``. A model whose test set cannot be built, or whose
    test set is empty, is logged and skipped.

    Returns:
        The number of events published (``0`` when no models were found).
    """
    loaded = load_models(artifacts_dir)
    if not loaded:
        log.warning("no models found under %s", artifacts_dir)
        return 0

    # The data-selection arguments are identical for every model.
    build_kwargs = {
        "validation_path": validation_path,
        "summary_path": summary_path,
        "tensors_root": tensors_root,
        "split_recipe": split_recipe,
        "train_hosts": train_hosts,
    }

    published = 0
    for model in loaded:
        name = model.__model_name__
        try:
            features, labels = _build_test_set(model, **build_kwargs)
        except Exception as err:
            # Best effort: one unscorable model must not block the others.
            log.warning("test set build failed for %s: %s",
                        name, err)
            continue

        n_rows = len(labels)
        if n_rows == 0:
            log.warning("empty test set for %s; skipping", name)
            continue

        predictions = model.predict(features)
        score = _macro_f1(labels, predictions, model.n_classes)
        log.info("%s test_macro_f1=%.4f (n=%d)", name, score, n_rows)

        # The dashboard's existing bar widget reads the `accuracy` key; the
        # value sent under it is macro-F1 in our project.
        event = {
            "type": "model_metric",
            "model": name,
            "accuracy": score,
        }
        await publish(event)
        published += 1

    return published
async def _run(args: argparse.Namespace) -> int:
    """Configure logging, pick a publisher, and emit metrics on a tick.

    With ``args.dry_run`` set the null publisher is used, otherwise events go
    to ``args.publish_url`` over HTTP. Metrics are emitted once and then
    again every ``args.interval`` seconds; an interval ``<= 0`` means a
    single pass.

    Returns:
        ``0`` once the single pass has completed (the periodic mode loops
        until interrupted).
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    if args.dry_run:
        publisher = null_publisher()
    else:
        publisher = http_publisher(args.publish_url)

    # Same inputs on every tick, so collect them once.
    emit_kwargs = {
        "publish": publisher,
        "artifacts_dir": args.artifacts,
        "validation_path": args.validation,
        "summary_path": args.summary,
        "tensors_root": args.tensors,
        "split_recipe": args.split_recipe,
        "train_hosts": args.train_hosts,
    }

    while True:
        await emit_metrics(**emit_kwargs)
        if args.interval <= 0:
            # One-shot mode: a single pass, then exit successfully.
            break
        await asyncio.sleep(args.interval)
    return 0
def main() -> int:
ap = argparse.ArgumentParser()
ap.add_argument("--validation", required=True, type=Path)
ap.add_argument("--artifacts", type=Path, default=Path("artifacts"))
ap.add_argument("--summary", type=Path, default=None)
ap.add_argument("--tensors", type=Path, default=None)
ap.add_argument("--split-recipe", choices=["host", "sample", "time"],
default="host")
ap.add_argument("--train-hosts", nargs="+", default=["elliott-thinkpad"])
ap.add_argument("--publish-url", default="http://127.0.0.1:8447/publish")
ap.add_argument("--interval", type=float, default=20.0)
ap.add_argument("--dry-run", action="store_true")
args = ap.parse_args()
return asyncio.run(_run(args))
if __name__ == "__main__":
raise SystemExit(main())