# CIS490/training/dashboard/producers/replay.py

"""Replay an episode at wall-clock time, emitting live dashboard events.

For one episode we emit:
  phase       — ground truth from labels.jsonl, on each transition
  prediction  — per-window predicted vs actual phase from one model
                (the "primary" model, default: first GBT loaded)
  embedding   — 2-D PCA projection of each window for the KNN scatter

Producer is transport-agnostic via _publish.PublishFn. Models are
loaded via the schema-hashed checkpoint format — schema mismatch
between training and inference fails loud, not silent.

Both summary and tensor models are supported. The producer extracts
the right input flavor per model on demand:
  - summary: summary_windows(epi)
  - tensor:  tensor_windows(epi)
"""
from __future__ import annotations

import argparse
import asyncio
import json
import logging
import sys
import time
from pathlib import Path

import numpy as np

# Put the fourth ancestor directory of this file (parents[3]) at the front of
# sys.path — presumably so the absolute `training.*` imports below resolve
# when this file is executed directly rather than imported as a package.
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
from training._episode_io import open_episode
from training._features import (
    PHASE_TO_INT, summary_windows, tensor_windows,
)
from training.dashboard.producers._models import (
    load_models, model_display_name,
)
from training.dashboard.producers._publish import (
    PublishFn, http_publisher, null_publisher,
)
from training.models import BaseModel


# Module-level logger shared by every function in this file; uses a fixed
# dotted name rather than __name__.
log = logging.getLogger("cis490.dashboard.producers.replay")


def _pick_primary(models: list[BaseModel]) -> BaseModel | None:
    """Choose the single model whose predictions feed the chunking widget.

    Selection order:
      1. models whose class name contains "realistic" (case-insensitive)
         win over all others — that is the flavor a deployed system
         would run;
      2. within each group, a fixed family order keyed on
         ``__model_name__`` (gbt, cnn, transformer, gru, lstm, mlp);
         unknown families go last;
      3. remaining ties keep the order of ``models``.

    Returns None when ``models`` is empty.
    """
    if not models:
        return None

    family_order = ("gbt", "cnn", "transformer", "gru", "lstm", "mlp")
    family_rank = {family: position for position, family in enumerate(family_order)}

    def _preference(candidate):
        is_realistic = "realistic" in candidate.__class__.__name__.lower()
        # False sorts before True, so realistic-mode models come first.
        return (not is_realistic, family_rank.get(candidate.__model_name__, 99))

    # min() yields the first of several equally-ranked candidates, which
    # matches taking element 0 of a stable sort.
    return min(models, key=_preference)


async def replay_episode(
    *,
    publish: PublishFn,
    episode_path: Path,
    host_id: str,
    models: list[BaseModel],
    speed: float = 1.0,
) -> None:
    """Replay one episode at wall-clock pace, publishing dashboard events.

    Events passed to ``publish`` (one dict per call):
      {"type": "phase", "phase": ...}            for each label whose
          time offset has been reached by the current window;
      {"type": "prediction", ...}                one per window, from the
          primary model (see ``_pick_primary``);
      {"type": "embedding", "x", "y", "phase"}   one per window when the
          primary carries a PCA projection (see ``_project_one``).

    Args:
        publish: awaitable sink that receives each event dict.
        episode_path: episode location handed to ``open_episode``.
        host_id: host identifier handed to ``open_episode``.
        models: loaded models; may be empty, in which case only phase
            events are emitted.
        speed: playback multiplier; window times are divided by it.

    Raises:
        ValueError: if ``speed`` is not strictly positive.
    """
    # `not speed > 0` also rejects NaN. Validating here avoids a
    # ZeroDivisionError (or a silently instantaneous replay for negative
    # values) deep inside the pacing loop.
    if not speed > 0:
        raise ValueError(f"speed must be > 0, got {speed!r}")

    epi = open_episode(episode_path, host_id=host_id)
    if not epi.labels:
        log.warning("episode %s has no labels — nothing to replay", episode_path)
        return

    primary = _pick_primary(models)

    # Build inputs for each input_kind once. With no models at all we still
    # need a window timeline to pace the phase events, so fall back to the
    # summary windows in that case.
    inputs: dict[str, dict] = {}
    if not models or any(m.input_kind == "summary" for m in models):
        Xs, ys, ts, _ = summary_windows(epi)
        inputs["summary"] = {"X": Xs, "y": ys, "t": ts}
    if any(m.input_kind == "tensor" for m in models):
        Xt, yt, tt, _, _ = tensor_windows(epi)
        inputs["tensor"] = {"X": Xt, "y": yt, "t": tt}

    # The timeline must come from the same window set the primary is fed
    # from: window index w is used to slice the primary's X below, so
    # mixing one kind's t/y with another kind's X would misalign (or
    # produce empty slices) whenever the two window grids differ.
    ref_kind = primary.input_kind if primary is not None else "summary"
    ref = inputs.get(ref_kind)
    if ref is None or ref["X"].shape[0] == 0:
        log.warning("no usable windows for %s", episode_path)
        return
    n_w = ref["X"].shape[0]
    t_centers = ref["t"]
    y_actual = ref["y"]

    # Phase ground-truth events from labels.jsonl, as (seconds since the
    # first label, phase name).
    t0 = int(epi.labels[0]["t_wall_ns"])
    label_events: list[tuple[float, str]] = [
        ((int(lab["t_wall_ns"]) - t0) / 1e9, lab["phase"])
        for lab in epi.labels
    ]

    int_to_phase = {i: p for p, i in PHASE_TO_INT.items()}
    if primary is None:
        log.info("no models loaded; emitting phase events only")

    log.info("replay start: %d windows, %d models, primary=%s",
             n_w, len(models),
             model_display_name(primary) if primary else None)

    start_wall = time.monotonic()
    label_cursor = 0

    for w in range(n_w):
        t_w = float(t_centers[w])

        # Sleep until this window's scaled time offset; if we are already
        # late, continue immediately.
        delay = start_wall + t_w / speed - time.monotonic()
        if delay > 0:
            await asyncio.sleep(delay)

        # Phase events for any label transitions whose time has passed
        while (label_cursor < len(label_events)
               and label_events[label_cursor][0] <= t_w):
            await publish({"type": "phase",
                           "phase": label_events[label_cursor][1]})
            label_cursor += 1

        if primary is None:
            continue

        actual_name = int_to_phase.get(int(y_actual[w]), "clean")
        X_one = ref["X"][w:w + 1]

        # Predictions: only the primary's prediction goes to chunking widget
        try:
            pred = int(primary.predict(X_one)[0])
            pred_name = int_to_phase.get(pred, "clean")
        except Exception as e:
            # Best-effort: a failed prediction must not stop the replay.
            # NOTE(review): falling back to the actual phase makes a failed
            # window look like a correct prediction on the dashboard.
            log.warning("predict failed: %s", e)
            pred_name = actual_name
        await publish({
            "type": "prediction",
            "episode_id": epi.episode_id,
            "window_idx": w,
            "predicted": pred_name,
            "actual": actual_name,
            "model": primary.__model_name__,
        })

        # Embedding: project the primary's window through its PCA-2 if it
        # has one; otherwise skip the embedding event for this window.
        xy = _project_one(primary, X_one)
        if xy is not None:
            await publish({
                "type": "embedding",
                "x": float(xy[0]), "y": float(xy[1]),
                "phase": actual_name,
            })


def _project_one(model: BaseModel, X_one: np.ndarray) -> tuple[float, float] | None:
    """Project one window to a 2-D point for the embedding scatter.

    The first row of ``X_one`` is passed through ``model.select``, flattened
    to a single row if the result is 3-D, and multiplied by the projection
    matrix stored on the model as ``_pca_proj``. Each of the first two
    components is then mapped into (0, 1) with a stateless tanh squash.

    Returns None (no embedding for this window) when:
      - the model has no ``_pca_proj`` attribute (or it is None);
      - ``X_one`` yields no data (e.g. an empty slice);
      - the feature width does not match the projection's row count;
      - the projection yields fewer than two components.
    """
    pca = getattr(model, "_pca_proj", None)
    if pca is None:
        return None
    Xk = model.select(X_one[:1])
    # An empty slice would otherwise fall through to p[0] and raise
    # IndexError, aborting the whole replay.
    if Xk.size == 0:
        return None
    if Xk.ndim == 3:
        Xk = Xk.reshape(1, -1)
    if Xk.shape[1] != pca.shape[0]:
        return None
    p = (Xk @ pca).ravel()
    if p.size < 2:
        return None
    # Tanh-squash with k=0.05 so most points land in (0.2, 0.8). Without
    # train-time min/max it's the cleanest stateless squash.
    return (
        0.5 + 0.5 * float(np.tanh(0.05 * p[0])),
        0.5 + 0.5 * float(np.tanh(0.05 * p[1])),
    )


async def _run(args: argparse.Namespace) -> int:
    """Load models, attach PCA projections, and replay the chosen episode.

    Configures root logging at INFO, loads models from ``args.artifacts``,
    copies any ``pca_proj`` found in a checkpoint header onto the paired
    model as ``_pca_proj``, then runs ``replay_episode`` with either a
    null publisher (``--dry-run``) or an HTTP publisher. Returns 0.
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=logging.INFO,
    )

    loaded = load_models(args.artifacts, device=args.device)

    # Hydrate PCA projection from each checkpoint header
    from training.models._checkpoint import load_header
    ckpt_paths = sorted(Path(args.artifacts).glob("*.ckpt.json"))
    # NOTE(review): models and headers are paired purely by position; this
    # presumes load_models returns one model per *.ckpt.json in the same
    # sorted order — confirm against load_models.
    for model, ckpt_path in zip(loaded, ckpt_paths):
        projection = load_header(ckpt_path).get("pca_proj")
        if projection is None:
            continue
        model._pca_proj = np.asarray(projection, dtype=np.float32)

    if args.dry_run:
        publish_fn = null_publisher()
    else:
        publish_fn = http_publisher(args.publish_url)

    await replay_episode(
        publish=publish_fn,
        episode_path=args.episode,
        host_id=args.host_id,
        models=loaded,
        speed=args.speed,
    )
    return 0


def main() -> int:
    """Parse command-line options and run the replay to completion.

    ``--episode`` and ``--host-id`` are required; the remaining options
    have defaults. Returns the value produced by ``_run``.
    """
    parser = argparse.ArgumentParser()
    # (flag, add_argument keyword options), registered in this order.
    option_table = (
        ("--episode", {"required": True, "type": Path}),
        ("--host-id", {"required": True}),
        ("--artifacts", {"type": Path, "default": Path("artifacts")}),
        ("--publish-url", {"default": "http://127.0.0.1:8447/publish"}),
        ("--speed", {"type": float, "default": 1.0}),
        ("--device", {"default": "auto"}),
        ("--dry-run", {"action": "store_true"}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    namespace = parser.parse_args()
    exit_code = asyncio.run(_run(namespace))
    return exit_code


# Script entry point: run main() and exit the process with its return value
# as the status code.
if __name__ == "__main__":
    raise SystemExit(main())