"""Per-profile and per-host metric breakdown. A model with macro F1 = 0.55 might be 0.85 on five profiles and 0.10 on the sixth. The single number hides exactly the kind of failure mode this project cares about (one malware family the model can't see). This module produces the breakdown table. """ from __future__ import annotations from dataclasses import asdict, dataclass import numpy as np from training.eval_._metrics import _f1, _macro_f1, bootstrap_macro_f1 @dataclass class CellMetrics: n: int macro_f1: float macro_f1_lo: float macro_f1_hi: float per_class_f1: dict[int, float] def by_profile( *, y_true: np.ndarray, y_pred: np.ndarray, profiles: list[str], n_classes: int, n_resamples: int = 500, ) -> dict[str, CellMetrics]: """One row per profile observed in test.""" out: dict[str, CellMetrics] = {} profs = np.asarray(profiles) for prof in sorted({p for p in profs if p}): m = profs == prof if not m.any(): continue ci = bootstrap_macro_f1(y_true[m], y_pred[m], n_classes, n_resamples=n_resamples) per_class = {k: _f1(y_true[m], y_pred[m], k) for k in range(n_classes)} out[prof] = CellMetrics( n=int(m.sum()), macro_f1=ci.point, macro_f1_lo=ci.low, macro_f1_hi=ci.high, per_class_f1=per_class, ) return out def by_host( *, y_true: np.ndarray, y_pred: np.ndarray, hosts: list[str], n_classes: int, n_resamples: int = 500, ) -> dict[str, CellMetrics]: out: dict[str, CellMetrics] = {} hs = np.asarray(hosts) for h in sorted({x for x in hs if x}): m = hs == h if not m.any(): continue ci = bootstrap_macro_f1(y_true[m], y_pred[m], n_classes, n_resamples=n_resamples) per_class = {k: _f1(y_true[m], y_pred[m], k) for k in range(n_classes)} out[h] = CellMetrics( n=int(m.sum()), macro_f1=ci.point, macro_f1_lo=ci.low, macro_f1_hi=ci.high, per_class_f1=per_class, ) return out