# CIS490/training/eval_/run.py

"""End-to-end eval driver — load all checkpoints, score on test split,
emit per-model JSON + a comparison markdown.

Outputs to reports/eval/:
  <model>_<mode>_eval.json     full metrics: macro_f1 ± CI, per-phase F1 ± CI,
                               per-profile F1, per-host F1, confusion matrix
  comparison_v2.md             side-by-side table with paired-bootstrap
                               significance
"""
from __future__ import annotations

import argparse
import json
import logging
import sys
from dataclasses import asdict
from pathlib import Path

import numpy as np
import pyarrow.parquet as pq

sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from training._features import PHASES
from training._split import (
    held_out_host, held_out_sample, held_out_time,
)
from training.producers._models import load_models
from training.eval_._metrics import (
    bootstrap_macro_f1, bootstrap_per_class_f1,
    confusion_matrix, paired_bootstrap_macro_f1_diff,
    per_class_pr_f1,
)
from training.eval_.breakdown import by_host, by_profile


# Module-level logger for this driver; main() installs the handler/format
# through logging.basicConfig before any message is emitted.
log = logging.getLogger("cis490.eval.run")


def _load_test(model, *, validation_path: Path,
                summary_path: Path | None, tensors_root: Path | None,
                split_recipe: str, train_hosts: list[str], seed: int = 0
                ) -> dict:
    """Build the held-out test set for ``model``.

    The validation parquet is filtered to rows whose ``status`` is
    ``"accepted"`` or ``"degraded"``, the requested split recipe is applied to
    those rows, and the feature data matching ``model.input_kind`` is then
    restricted to the episodes that landed in the test side of the split.

    Args:
        model: Loaded model; only ``input_kind`` is read here.
        validation_path: Parquet file with one row per validated episode.
        summary_path: Summary-feature file. Required when
            ``model.input_kind == "summary"``; ``feature_schema_v1.json`` is
            looked up in the same directory.
        tensors_root: Passed to ``load_tensor`` for every other input kind.
        split_recipe: ``"host"`` or ``"sample"``; any other value selects the
            time-based split.
        train_hosts: Hosts forwarded to the host-based split (ignored by the
            other recipes).
        seed: Seed forwarded to the split function.

    Returns:
        Dict with ``X`` and ``y`` (test rows), ``profiles`` and ``hosts``
        (per-row metadata for the same rows) and ``splits`` (the object
        returned by the split function).

    Raises:
        ValueError: If the model needs summary features but ``summary_path``
            was not supplied.
    """
    val = pq.read_table(validation_path).to_pylist()
    rows = [r for r in val if r["status"] in ("accepted", "degraded")]
    profs   = [r["profile"] for r in rows]
    samples = [r["sample_name"] for r in rows]
    hosts   = [r["host_id"] for r in rows]
    epi_ids = [r["episode_id"] for r in rows]
    recv    = [r.get("received_at_wall", "") for r in rows]
    if split_recipe == "host":
        s = held_out_host(profiles=profs, sample_names=samples,
                           host_ids=hosts, episode_ids=epi_ids,
                           train_hosts=train_hosts, seed=seed)
    elif split_recipe == "sample":
        s = held_out_sample(profiles=profs, sample_names=samples,
                             host_ids=hosts, seed=seed)
    else:
        # Anything that is not "host"/"sample" is treated as the time recipe.
        s = held_out_time(profiles=profs, sample_names=samples,
                           host_ids=hosts, received_at=recv, seed=seed)
    # s.test is indexed positionally, in the same order as the filtered rows.
    test_eps = {ep for i, ep in enumerate(epi_ids) if s.test[i]}

    if model.input_kind == "summary":
        if summary_path is None:
            # Without this guard the failure is an opaque AttributeError on
            # ``None.parent`` a few lines further down.
            raise ValueError(
                "model expects summary features but no summary_path was given")
        from training.trainer._data import load_summary
        schema = (summary_path.parent / "feature_schema_v1.json")
        d = load_summary(summary_path, schema)
    else:
        from training.trainer._data import load_tensor
        d = load_tensor(tensors_root)
    # Boolean mask over the loaded feature rows: keep test-split episodes only.
    m = np.array([e in test_eps for e in d.episode_id], dtype=bool)
    X = d.X[m]
    y = d.y[m]
    profiles = [p for i, p in enumerate(d.profile) if m[i]]
    hosts_w = [h for i, h in enumerate(d.host_id) if m[i]]
    return {"X": X, "y": y, "profiles": profiles, "hosts": hosts_w,
            "splits": s}


def _eval_one(model, *, validation_path, summary_path, tensors_root,
               split_recipe, train_hosts, n_resamples=1000) -> dict:
    """Score one model on its held-out test set and collect all metrics.

    Loads the test rows via ``_load_test``, predicts them with
    ``model.predict`` and returns a dict holding the overall macro F1 with its
    bootstrap interval, per-class F1 keyed by phase name, the per-profile and
    per-host breakdowns, the confusion matrix, split bookkeeping, and the raw
    predictions/targets (kept for the paired bootstrap in the report step).

    The breakdowns use a reduced bootstrap budget: half of ``n_resamples``,
    but never fewer than 200.
    """
    held_out = _load_test(
        model,
        validation_path=validation_path,
        summary_path=summary_path,
        tensors_root=tensors_root,
        split_recipe=split_recipe,
        train_hosts=train_hosts,
    )
    targets = held_out["y"]
    preds = model.predict(held_out["X"])
    n_cls = model.n_classes
    breakdown_resamples = max(200, n_resamples // 2)

    def _interval(stat) -> dict:
        # Flatten a point/low/high result object into a JSON-friendly dict.
        return {"point": stat.point, "low": stat.low, "high": stat.high}

    # Helpers are invoked in a fixed order: overall, per-class, profile, host,
    # confusion matrix.
    macro = bootstrap_macro_f1(targets, preds, n_cls,
                               n_resamples=n_resamples)
    class_stats = bootstrap_per_class_f1(targets, preds, n_cls,
                                         n_resamples=n_resamples)
    profile_stats = by_profile(y_true=targets, y_pred=preds,
                               profiles=held_out["profiles"],
                               n_classes=n_cls,
                               n_resamples=breakdown_resamples)
    host_stats = by_host(y_true=targets, y_pred=preds,
                         hosts=held_out["hosts"], n_classes=n_cls,
                         n_resamples=breakdown_resamples)
    matrix = confusion_matrix(targets, preds, n_cls)
    splits = held_out["splits"]

    report = {}
    report["model"] = model.__model_name__
    report["n_test"] = int(len(targets))
    report["macro_f1"] = _interval(macro)
    report["per_class_f1"] = {
        PHASES[idx]: _interval(class_stats[idx]) for idx in range(n_cls)
    }
    report["by_profile"] = {
        name: asdict(stats) for name, stats in profile_stats.items()
    }
    report["by_host"] = {
        name: asdict(stats) for name, stats in host_stats.items()
    }
    report["confusion_matrix"] = matrix.tolist()
    report["split_recipe"] = split_recipe
    report["untested_profiles"] = list(splits.untested_profiles)
    report["excluded_profiles"] = list(splits.excluded_profiles)
    report["predictions"] = preds.tolist()  # for paired bootstrap later
    report["targets"] = targets.tolist()
    return report


def _markdown_report(results: list[dict], out_path: Path,
                      *, n_classes: int, n_resamples: int = 1000) -> None:
    """Write the cross-model comparison report as markdown.

    Sections, in order: overall macro F1 (best model first), per-phase F1,
    per-profile and per-host macro F1 for the top model, and — when more than
    one model is given — a paired-bootstrap comparison of the top model
    against every other model.

    Args:
        results: Per-model result dicts as built by ``_eval_one``. The keys
            read here are ``model``, ``n_test``, ``macro_f1``,
            ``per_class_f1``, ``by_profile``, ``by_host``, ``split_recipe``,
            ``untested_profiles``, ``excluded_profiles``, ``predictions`` and
            ``targets``. Split information is taken from the first entry.
        out_path: Destination file; written as UTF-8.
        n_classes: Class count forwarded to the paired bootstrap.
        n_resamples: Resample count forwarded to the paired bootstrap.

    Raises:
        ValueError: If ``results`` is empty.
    """
    if not results:
        raise ValueError("results must contain at least one model")

    lines = ["# Model comparison\n"]
    lines.append(f"Held-out recipe: **{results[0]['split_recipe']}**.  "
                 f"All metrics are macro F1 with bootstrap 95 % CIs.\n")
    if results[0]["untested_profiles"]:
        lines.append(f"⚠ untested profiles (no test cell): "
                     f"{results[0]['untested_profiles']}\n")
    if results[0]["excluded_profiles"]:
        lines.append(f"⚠ excluded profiles (no train data): "
                     f"{results[0]['excluded_profiles']}\n")

    lines.append("## Overall macro F1\n")
    lines.append("| model | n_test | macro F1 (95 % CI) |")
    lines.append("|---|---:|---|")
    sorted_r = sorted(results, key=lambda r: -r["macro_f1"]["point"])
    for r in sorted_r:
        f = r["macro_f1"]
        lines.append(f"| {r['model']} | {r['n_test']} | "
                     f"{f['point']:.3f} [{f['low']:.3f}, {f['high']:.3f}] |")

    lines.append("\n## Per-phase F1\n")
    # Use the union of phases reported by any model, in PHASES order; a model
    # that lacks a phase (e.g. "failed" for models trained on the smoke set)
    # gets an em-dash cell instead of a number.
    phases = sorted({
        p for r in sorted_r for p in r["per_class_f1"].keys()
    }, key=lambda p: PHASES.index(p) if p in PHASES else 99)
    head = "| model | " + " | ".join(phases) + " |"
    lines.append(head)
    lines.append("|---|" + "---:|" * len(phases))
    for r in sorted_r:
        cells = [
            (f"{r['per_class_f1'][p]['point']:.3f}"
             if p in r["per_class_f1"] else "—")
            for p in phases
        ]
        lines.append(f"| {r['model']} | " + " | ".join(cells) + " |")

    lines.append("\n## Per-profile macro F1 (top model only — full table in JSON)\n")
    top = sorted_r[0]
    lines.append(f"Top model: **{top['model']}**\n")
    lines.append("| profile | n | macro F1 (95 % CI) |")
    lines.append("|---|---:|---|")
    for prof, m in sorted(top["by_profile"].items()):
        lines.append(f"| {prof} | {m['n']} | "
                     f"{m['macro_f1']:.3f} [{m['macro_f1_lo']:.3f}, "
                     f"{m['macro_f1_hi']:.3f}] |")

    lines.append("\n## Per-host macro F1 (top model)\n")
    lines.append("| host | n | macro F1 (95 % CI) |")
    lines.append("|---|---:|---|")
    for h, m in sorted(top["by_host"].items()):
        lines.append(f"| {h} | {m['n']} | "
                     f"{m['macro_f1']:.3f} [{m['macro_f1_lo']:.3f}, "
                     f"{m['macro_f1_hi']:.3f}] |")

    # Paired-bootstrap significance: top vs each other
    if len(sorted_r) > 1:
        lines.append("\n## Paired-bootstrap significance vs top model\n")
        lines.append(f"Comparison anchor: **{top['model']}**. "
                     f"95 % CI excludes 0 → significant difference.\n")
        lines.append("| model | Δ macro F1 (anchor − model) (95 % CI) |")
        lines.append("|---|---|")
        y_true = np.asarray(top["targets"])
        y_anchor = np.asarray(top["predictions"])
        for r in sorted_r[1:]:
            y_other = np.asarray(r["predictions"])
            other_targets = r.get("targets")
            # A paired bootstrap is only meaningful when both models were
            # scored on the same test items. Report why a model is not
            # comparable instead of dropping it from the table.
            skip_reason = None
            if len(y_other) != len(y_true):
                skip_reason = (f"n/a (test size {len(y_other)} differs from "
                               f"anchor {len(y_true)})")
            elif (other_targets is not None
                  and not np.array_equal(np.asarray(other_targets), y_true)):
                skip_reason = "n/a (test targets differ from anchor)"
            if skip_reason is not None:
                lines.append(f"| {r['model']} | {skip_reason} |")
                continue
            d = paired_bootstrap_macro_f1_diff(
                y_true, y_anchor, y_other, n_classes,
                n_resamples=n_resamples,
            )
            sig = "*" if (d.low > 0 or d.high < 0) else ""
            lines.append(f"| {r['model']} | "
                         f"{d.point:+.3f} [{d.low:+.3f}, {d.high:+.3f}] {sig} |")

    # The report contains non-ASCII symbols, so never rely on the locale
    # default encoding (which is not UTF-8 on every platform).
    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def main() -> int:
    """Command-line entry point: evaluate every model and write the reports.

    Parses the CLI arguments, loads all models found under ``--artifacts``,
    evaluates each one on the held-out split, writes one
    ``<model>_eval.json`` per model (without the raw predictions/targets)
    into ``--reports-dir`` and finally writes ``comparison_v2.md`` there.

    Returns:
        ``1`` when no models were found, otherwise ``0``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--validation", required=True, type=Path)
    ap.add_argument("--artifacts", type=Path, default=Path("artifacts"))
    ap.add_argument("--summary", type=Path, default=None)
    ap.add_argument("--tensors", type=Path, default=None)
    ap.add_argument("--reports-dir", type=Path, default=Path("reports/eval"))
    ap.add_argument("--split-recipe", choices=["host", "sample", "time"],
                    default="host")
    ap.add_argument("--train-hosts", nargs="+", default=["elliott-thinkpad"])
    ap.add_argument("--n-resamples", type=int, default=1000)
    args = ap.parse_args()

    logging.basicConfig(level="INFO",
                        format="%(asctime)s %(levelname)s %(name)s %(message)s")
    args.reports_dir.mkdir(parents=True, exist_ok=True)

    models = load_models(args.artifacts)
    if not models:
        log.warning("no models found under %s", args.artifacts)
        return 1

    results = []
    for m in models:
        log.info("evaluating %s", m.__model_name__)
        res = _eval_one(m, validation_path=args.validation,
                         summary_path=args.summary, tensors_root=args.tensors,
                         split_recipe=args.split_recipe,
                         train_hosts=args.train_hosts,
                         n_resamples=args.n_resamples)
        out = args.reports_dir / f"{m.__model_name__}_eval.json"
        # Raw predictions/targets are only needed in memory for the paired
        # bootstrap; keep them out of the per-model JSON.
        out.write_text(json.dumps(
            {k: v for k, v in res.items()
              if k not in {"predictions", "targets"}},
            indent=2) + "\n")
        results.append(res)

    if results:
        # The comparison always uses the full phase vocabulary as class count.
        n_classes = len(PHASES)
        _markdown_report(
            results, args.reports_dir / "comparison_v2.md",
            n_classes=n_classes, n_resamples=args.n_resamples,
        )
        log.info("wrote %s", args.reports_dir / "comparison_v2.md")
    return 0


# Script entry point: main()'s integer return value becomes the process exit
# status.
if __name__ == "__main__":
    raise SystemExit(main())