# CIS490/training/eval_/_metrics.py

"""Metrics with bootstrap confidence intervals.

A test-set scalar reported as ``F1=0.873`` is dishonest — that's a point
estimate from one finite sample. The right honesty bar is ``F1=0.873 ±
0.012`` from N nonparametric bootstraps over the test windows.

For paired comparisons (model A vs model B on the same test set) we
use a *paired* bootstrap: resample row indices and apply the same
indices to both models' predictions. This controls for which test
windows happened to be hard.
"""
from __future__ import annotations

from dataclasses import dataclass
from typing import Iterable

import numpy as np


@dataclass
class CI:
    """A point estimate together with the bounds of its confidence interval.

    ``low`` and ``high`` are the lower and upper interval bounds and
    ``level`` is the nominal confidence level (0.95 unless overridden).
    """
    point: float
    low: float
    high: float
    level: float = 0.95

    def fmt(self, digits: int = 3) -> str:
        """Render as ``point [low, high]`` with ``digits`` decimals each."""
        spec = f".{digits}f"
        centre, lower, upper = (
            format(value, spec) for value in (self.point, self.low, self.high)
        )
        return f"{centre} [{lower}, {upper}]"


def _f1(y_true: np.ndarray, y_pred: np.ndarray, k: int) -> float:
    """One-vs-rest F1 score for class ``k``.

    Returns 0.0 whenever there are no true positives; that single guard
    also covers the cases where precision or recall would otherwise have
    a zero denominator.
    """
    predicted_k = y_pred == k
    actual_k = y_true == k
    hits = int((predicted_k & actual_k).sum())
    false_alarms = int((predicted_k & (y_true != k)).sum())
    misses = int(((y_pred != k) & actual_k).sum())
    if hits == 0:
        return 0.0
    precision = hits / (hits + false_alarms)
    recall = hits / (hits + misses)
    return 2 * precision * recall / (precision + recall)


def _macro_f1(y_true: np.ndarray, y_pred: np.ndarray, n_classes: int) -> float:
    """Unweighted mean of the per-class F1 scores for labels ``0..n_classes-1``.

    Every class contributes equally regardless of its support.
    """
    per_class_scores = [_f1(y_true, y_pred, label) for label in range(n_classes)]
    return float(np.mean(per_class_scores))


def per_class_pr_f1(y_true: np.ndarray, y_pred: np.ndarray, n_classes: int
                     ) -> dict[int, dict[str, float]]:
    """Point-estimate precision, recall, F1 and support for each class.

    Returns a mapping ``label -> {"precision", "recall", "f1", "support"}``
    for labels ``0..n_classes-1``. Any ratio whose denominator is zero is
    reported as 0.0; ``support`` is the number of rows whose true label is
    that class. No confidence intervals are computed here.
    """
    def _safe_ratio(numerator: float, denominator: float) -> float:
        # A zero denominator means the quantity is undefined; report 0.0.
        return numerator / denominator if denominator else 0.0

    report: dict[int, dict[str, float]] = {}
    for label in range(n_classes):
        predicted = y_pred == label
        actual = y_true == label
        tp = int((predicted & actual).sum())
        fp = int((predicted & (y_true != label)).sum())
        fn = int(((y_pred != label) & actual).sum())
        precision = _safe_ratio(tp, tp + fp)
        recall = _safe_ratio(tp, tp + fn)
        f1 = _safe_ratio(2 * precision * recall, precision + recall)
        report[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": int(tp + fn),
        }
    return report


def bootstrap_macro_f1(
    y_true: np.ndarray, y_pred: np.ndarray, n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> CI:
    """Percentile-bootstrap confidence interval for macro F1.

    Rows are resampled with replacement ``n_resamples`` times using a
    generator seeded with ``seed``; the interval bounds are the
    ``(1 - level) / 2`` and ``1 - (1 - level) / 2`` quantiles of the
    resampled scores. ``point`` is the macro F1 on the unresampled data.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(y_true)
    observed = _macro_f1(y_true, y_pred, n_classes)

    replicates = np.empty(n_resamples, dtype=np.float64)
    for draw in range(n_resamples):
        rows = generator.integers(0, n_rows, size=n_rows)
        replicates[draw] = _macro_f1(y_true[rows], y_pred[rows], n_classes)

    tail = (1 - level) / 2
    lower, upper = np.quantile(replicates, [tail, 1 - tail])
    return CI(point=observed, low=float(lower), high=float(upper), level=level)


def bootstrap_per_class_f1(
    y_true: np.ndarray, y_pred: np.ndarray, n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> dict[int, CI]:
    """Percentile-bootstrap confidence interval for each class's F1.

    All classes are scored on the same resampled rows in every iteration,
    so the per-class intervals come from one shared set of resamples.

    Args:
        y_true: True labels, indexable by an integer index array.
        y_pred: Predicted labels, aligned row-for-row with ``y_true``.
        n_classes: Classes ``0..n_classes-1`` each get an interval.
        n_resamples: Number of bootstrap resamples.
        level: Nominal confidence level of the interval.
        seed: Seed for the NumPy random generator.

    Returns:
        Mapping ``label -> CI`` whose ``point`` is the F1 on the original
        (unresampled) data and whose bounds are the ``(1 - level) / 2`` and
        ``1 - (1 - level) / 2`` quantiles of the resampled F1 scores.
    """
    rng = np.random.default_rng(seed)
    n = len(y_true)
    # One row per resample, one column per class.
    scores = np.empty((n_resamples, n_classes), dtype=np.float64)
    for i in range(n_resamples):
        idx = rng.integers(0, n, size=n)
        # Materialise the resample once; it is identical for every class.
        true_sample = y_true[idx]
        pred_sample = y_pred[idx]
        for k in range(n_classes):
            scores[i, k] = _f1(true_sample, pred_sample, k)

    tail = (1 - level) / 2
    cis: dict[int, CI] = {}
    for k in range(n_classes):
        class_scores = scores[:, k]
        cis[k] = CI(
            point=_f1(y_true, y_pred, k),
            low=float(np.quantile(class_scores, tail)),
            high=float(np.quantile(class_scores, 1 - tail)),
            level=level,
        )
    return cis


def paired_bootstrap_macro_f1_diff(
    y_true: np.ndarray,
    y_pred_a: np.ndarray, y_pred_b: np.ndarray,
    n_classes: int,
    *, n_resamples: int = 1000, level: float = 0.95, seed: int = 0,
) -> CI:
    """Paired bootstrap interval for ``macro_f1(A) - macro_f1(B)``.

    Each resample draws one set of row indices and scores both models on
    exactly those rows, so per-row difficulty affects both sides equally.
    An interval that excludes 0 indicates a significant difference at
    ``level``. ``point`` is the difference on the unresampled data.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(y_true)
    deltas = np.empty(n_resamples, dtype=np.float64)
    for draw in range(n_resamples):
        rows = generator.integers(0, n_rows, size=n_rows)
        truth = y_true[rows]
        score_a = _macro_f1(truth, y_pred_a[rows], n_classes)
        score_b = _macro_f1(truth, y_pred_b[rows], n_classes)
        deltas[draw] = score_a - score_b

    tail = (1 - level) / 2
    lower, upper = np.quantile(deltas, [tail, 1 - tail])
    observed = (_macro_f1(y_true, y_pred_a, n_classes)
                - _macro_f1(y_true, y_pred_b, n_classes))
    return CI(point=observed, low=float(lower), high=float(upper), level=level)


def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray,
                      n_classes: int) -> np.ndarray:
    """Count (true, predicted) label pairs over a fixed label set.

    Rows index the true class and columns the predicted class. Both axes
    always span ``0..n_classes-1``, so the shape does not depend on which
    labels happen to occur on either side.

    Args:
        y_true: 1-D sequence of integer true labels.
        y_pred: 1-D sequence of integer predicted labels, paired
            positionally with ``y_true``.
        n_classes: Size of the label set.

    Returns:
        An ``(n_classes, n_classes)`` ``int64`` array of pair counts. Pairs
        in which either label falls outside ``[0, n_classes)`` are skipped.
    """
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)

    # Pair positionally up to the shorter input, matching zip() semantics.
    n_pairs = min(len(true_labels), len(pred_labels))
    true_labels = true_labels[:n_pairs]
    pred_labels = pred_labels[:n_pairs]

    in_range = (
        (true_labels >= 0) & (true_labels < n_classes)
        & (pred_labels >= 0) & (pred_labels < n_classes)
    )
    if in_range.any():
        # np.add.at is unbuffered, so repeated (row, col) pairs accumulate;
        # ``cm[rows, cols] += 1`` would count each distinct pair only once.
        np.add.at(cm, (true_labels[in_range], pred_labels[in_range]), 1)
    return cm