Producers are event *sources* — the renderer is everything inside training/dashboard/. Sibling layout makes the dependency direction one-way (producers import from training.dashboard.events; dashboard never reaches into producers). training/dashboard/producers/ → training/producers/ Internal imports rewritten via sed; eval_/run.py and training/README.md cross-references updated. CLI entry stays via `python -m training.producers.<sub>` (replay / metrics / perf / profiles). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
159 lines
6.3 KiB
Python
159 lines
6.3 KiB
Python
"""Emit `model_metric` events for the dashboard's accuracy bars.
|
|
|
|
Loads every checkpoint via the schema-hashed loader, scores each on
|
|
the held-out test split (held-out-by-host by default), publishes one
|
|
``model_metric`` per model. Re-publishes on a tick so a browser
|
|
opening 30s after a one-shot run still sees populated bars.
|
|
|
|
Note: dashboard's CSS styles bars by exact name (`rnn|gru|lstm|bert`).
|
|
Our names are e.g. `gbt_realistic`. Bars render with a default color.
|
|
The accuracy reported is **macro-F1** under the realistic-vs-oracle
|
|
split that the model was trained for — *not* plain accuracy. We
|
|
publish under the existing `accuracy` key so the dashboard JS doesn't
|
|
need a frontend change; macro-F1 is the metric we actually care about.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pyarrow.parquet as pq
|
|
|
|
# Make the repository root importable when this file is run directly as a
# script. The imports below address this package as `training.producers`,
# i.e. the file sits at <root>/training/producers/<module>.py, which puts the
# root at parents[2]. The previous parents[3] matched the older, one-level
# deeper training/dashboard/producers/ location and now resolves to the
# directory *above* the repository root.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
|
from training._split import (
|
|
held_out_host, held_out_sample, held_out_time,
|
|
)
|
|
from training.producers._models import load_models
|
|
from training.producers._publish import (
|
|
PublishFn, http_publisher, null_publisher,
|
|
)
|
|
from training.eval_._metrics import _macro_f1
|
|
from training.models import BaseModel
|
|
|
|
|
|
# Module-level logger shared by every function in this file.
# NOTE(review): the name still contains "dashboard.producers" while the
# imports above use `training.producers`; presumably kept so existing logging
# configuration keeps matching — confirm before renaming.
log = logging.getLogger("cis490.dashboard.producers.metrics")
|
|
|
|
|
|
def _build_test_set(model: BaseModel, *, validation_path: Path,
                    summary_path: Path | None,
                    tensors_root: Path | None,
                    split_recipe: str, train_hosts: list[str]
                    ) -> tuple[np.ndarray, np.ndarray]:
    """Assemble the held-out ``(X_test, y_test)`` pair for ``model.input_kind``.

    The validation parquet is read in full and only rows whose ``status`` is
    ``"accepted"`` or ``"degraded"`` are kept. Those rows are partitioned by
    ``split_recipe`` (``"host"`` or ``"sample"``; any other value falls
    through to the time-based split), always with ``seed=0``. Episodes the
    split marks as test are then picked out of the summary dataset (for
    ``"summary"`` models) or the tensor dataset (for every other kind).

    Raises:
        ValueError: when the path needed for the model's input kind
            (``summary_path`` or ``tensors_root``) is ``None``.
    """
    usable = [
        row for row in pq.read_table(validation_path).to_pylist()
        if row["status"] in ("accepted", "degraded")
    ]

    # Column-wise views of the surviving rows, in row order.
    profiles: list = []
    sample_names: list = []
    host_ids: list = []
    episode_ids: list = []
    received: list = []
    for row in usable:
        profiles.append(row["profile"])
        sample_names.append(row["sample_name"])
        host_ids.append(row["host_id"])
        episode_ids.append(row["episode_id"])
        received.append(row.get("received_at_wall", ""))

    # Every recipe takes these three; the rest is recipe-specific.
    shared = {
        "profiles": profiles,
        "sample_names": sample_names,
        "host_ids": host_ids,
    }
    if split_recipe == "host":
        split = held_out_host(episode_ids=episode_ids,
                              train_hosts=train_hosts, seed=0, **shared)
    elif split_recipe == "sample":
        split = held_out_sample(seed=0, **shared)
    else:
        split = held_out_time(received_at=received, seed=0, **shared)

    held_out = {
        episode for idx, episode in enumerate(episode_ids) if split.test[idx]
    }

    # The project loaders are imported lazily, and only after the matching
    # path has been validated.
    if model.input_kind == "summary":
        if summary_path is None:
            raise ValueError("--summary required for summary model")
        from training.trainer._data import load_summary
        # The feature schema is expected to sit next to the summary file.
        dataset = load_summary(
            summary_path, summary_path.parent / "feature_schema_v1.json")
    else:
        if tensors_root is None:
            raise ValueError("--tensors required for tensor model")
        from training.trainer._data import load_tensor
        dataset = load_tensor(tensors_root)

    keep = np.array([episode in held_out for episode in dataset.episode_id],
                    dtype=bool)
    return dataset.X[keep], dataset.y[keep]
|
|
|
|
|
|
async def emit_metrics(*, publish: PublishFn, artifacts_dir: Path,
                       validation_path: Path,
                       summary_path: Path | None,
                       tensors_root: Path | None,
                       split_recipe: str,
                       train_hosts: list[str]) -> int:
    """Score each model under ``artifacts_dir`` and publish its metric.

    For every model returned by ``load_models`` the held-out test set is
    built, predictions are scored with macro-F1, and one ``model_metric``
    event is passed to ``publish``. A model is skipped with a warning (the
    remaining models are still processed) when its test set cannot be built,
    when the test set is empty, or when prediction/scoring raises.

    Args:
        publish: Awaitable sink that receives each event dict.
        artifacts_dir: Directory handed to ``load_models``.
        validation_path: Validation parquet used to derive the split.
        summary_path: Summary dataset path, or ``None``.
        tensors_root: Tensor dataset root, or ``None``.
        split_recipe: Split recipe name forwarded to ``_build_test_set``.
        train_hosts: Training hosts forwarded to ``_build_test_set``.

    Returns:
        The number of events handed to ``publish``.
    """
    models = load_models(artifacts_dir)
    if not models:
        log.warning("no models found under %s", artifacts_dir)
        return 0
    n = 0
    for m in models:
        try:
            Xte, yte = _build_test_set(
                m, validation_path=validation_path,
                summary_path=summary_path, tensors_root=tensors_root,
                split_recipe=split_recipe, train_hosts=train_hosts,
            )
        except Exception as e:
            log.warning("test set build failed for %s: %s",
                        m.__model_name__, e)
            continue
        if len(yte) == 0:
            log.warning("empty test set for %s; skipping", m.__model_name__)
            continue
        # Same per-model isolation as the test-set build above: one model
        # that fails to predict must not abort the pass for the others or
        # take down the periodic republish loop that calls this function.
        try:
            y_pred = m.predict(Xte)
            # Coerce to a builtin float so the payload never carries a
            # library-specific scalar type into the publisher.
            f1 = float(_macro_f1(yte, y_pred, m.n_classes))
        except Exception as e:
            log.warning("scoring failed for %s: %s", m.__model_name__, e)
            continue
        log.info("%s test_macro_f1=%.4f (n=%d)", m.__model_name__, f1, len(yte))
        # The dashboard's existing bar widget reads the `accuracy` key; the
        # value placed there is macro-F1.
        await publish({
            "type": "model_metric",
            "model": m.__model_name__,
            "accuracy": f1,
        })
        n += 1
    return n
|
|
|
|
|
|
async def _run(args: argparse.Namespace) -> int:
    """Configure logging, pick a publisher, and emit metrics until told to stop.

    Metrics are emitted at least once. When ``args.interval`` is zero or
    negative the coroutine returns ``0`` right after that first pass;
    otherwise it sleeps ``args.interval`` seconds and emits again, forever.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    # --dry-run swaps the HTTP publisher for the null one.
    if args.dry_run:
        publisher = null_publisher()
    else:
        publisher = http_publisher(args.publish_url)

    emit_kwargs = {
        "artifacts_dir": args.artifacts,
        "validation_path": args.validation,
        "summary_path": args.summary,
        "tensors_root": args.tensors,
        "split_recipe": args.split_recipe,
        "train_hosts": args.train_hosts,
    }

    while True:
        await emit_metrics(publish=publisher, **emit_kwargs)
        if args.interval <= 0:
            break
        await asyncio.sleep(args.interval)
    return 0
|
|
|
|
|
|
def main() -> int:
|
|
ap = argparse.ArgumentParser()
|
|
ap.add_argument("--validation", required=True, type=Path)
|
|
ap.add_argument("--artifacts", type=Path, default=Path("artifacts"))
|
|
ap.add_argument("--summary", type=Path, default=None)
|
|
ap.add_argument("--tensors", type=Path, default=None)
|
|
ap.add_argument("--split-recipe", choices=["host", "sample", "time"],
|
|
default="host")
|
|
ap.add_argument("--train-hosts", nargs="+", default=["elliott-thinkpad"])
|
|
ap.add_argument("--publish-url", default="http://127.0.0.1:8447/publish")
|
|
ap.add_argument("--interval", type=float, default=20.0)
|
|
ap.add_argument("--dry-run", action="store_true")
|
|
args = ap.parse_args()
|
|
return asyncio.run(_run(args))
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|