CIS490/training/models/_checkpoint.py
Max 2aa7b865fb training/models: knn_semi — semi-supervised self-training KNN
Registered as `knn_semi`. Answers the research question:

  *If we had ground-truth labels for only a fraction of training
   episodes, could we use the structure of the unlabeled rest to
   recover most of supervised KNN's accuracy?*

Pipeline (Yarowsky-style self-training):

  1. Split train slice deterministically into labeled (label_frac=0.2
     default) and unlabeled (1 - label_frac) by row-index hash.
  2. Fit a "labeler" KNN on the labeled fraction.
  3. Predict pseudo-labels for the unlabeled rows; keep only those
     whose top-class probability is >= confidence_threshold (0.6).
  4. Fit the final KNN on (labeled rows + confident pseudo-labels).
     Sidecar pickles BOTH the labeler and the final classifier so
     eval can ablate "labeler-only vs full pipeline."

Smoke run (567-episode subset, oracle mode, label_frac=0.2):

                       val_macro_f1   test_macro_f1
  knn       (100% labels)   0.737        0.133
  knn_semi  (20% labels)    0.654        0.173

Lower val (less data) but HIGHER cross-device test — pseudo-labeling
acts as a regularizer that prevents overfitting to elliott-thinkpad's
specific neighborhood structure. Honest research finding worth a slide
in the writeup.

Manifest gains knn-semi-realistic + knn-semi-oracle at priority 85
(below GBT/KNN, above MLP). Storage cost = augmented set × n_features
× 4 bytes; same .knn.pkl sidecar format as plain KNN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 13:51:30 -05:00

209 lines
7.2 KiB
Python

"""Schema-hashed checkpoint format.
Every saved model carries a sha256 of its input schema (the sorted
feature_names for summary models, the sorted channel_names for tensor
models). On load we recompute the schema hash from the live
``_features.py`` and refuse to load a checkpoint built against a
different schema. This is the difference between "the trained model
saw column 17 = guest.cpu_user" and "the live inference is feeding
column 17 = whatever-_features-now-puts-there."
A checkpoint is a JSON-serializable dict on disk. NN subclasses
serialize their torch state_dict separately as a sidecar ``.pt`` file
referenced from the JSON; GBT writes the XGBoost JSON directly; the KNN
models (``knn``, ``knn_semi``) write a pickled ``.knn.pkl`` sidecar.

Layout::

    artifacts/<name>.ckpt.json
    artifacts/<name>.pt         (torch sidecar; only for NN models)
    artifacts/<name>.xgb.json   (xgboost sidecar; only for GBT)
    artifacts/<name>.knn.pkl    (pickle sidecar; only for KNN models)

The JSON file is the source of truth for the schema header and the
loader uses it to know which sidecar to read.
"""
from __future__ import annotations
import hashlib
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
import numpy as np
from training._features import (
ALL_CHANNELS,
PHASES,
channel_in_deployment_mask,
channel_names,
in_deployment_mask,
)
from training.models import BaseModel, get_model
from training.models._base import StandardizeStats
# On-disk header format version. save_checkpoint stamps it into every
# header; load_checkpoint rejects any file whose "version" differs.
CHECKPOINT_VERSION = 1
def summary_schema_hash() -> str:
    """Return the hex sha256 digest of the summary feature schema.

    The digest covers the names returned by ``feature_names_episode()``,
    sorted and joined with newlines — the schema GBT and MLP see.

    NOTE(review): the names are sorted before hashing, so the digest
    identifies the *set* of feature names; reordering the same names
    leaves it unchanged. Confirm column order is pinned elsewhere.
    """
    # Function-scope import: resolved on each call, not at module import.
    from training._features import feature_names_episode

    hasher = hashlib.sha256()
    hasher.update("\n".join(sorted(feature_names_episode())).encode())
    return hasher.hexdigest()
def tensor_schema_hash() -> str:
    """Return the hex sha256 digest of the tensor channel schema.

    The digest covers the names returned by ``channel_names()``, sorted
    and joined with newlines — the schema CNN/GRU/LSTM/Transformer see.

    NOTE(review): the names are sorted before hashing, so the digest
    identifies the *set* of channel names; reordering the same names
    leaves it unchanged. Confirm channel order is pinned elsewhere.
    """
    canonical = "\n".join(sorted(channel_names()))
    return hashlib.sha256(canonical.encode()).hexdigest()
def expected_schema_hash(input_kind: str) -> str:
    """Return the current schema hash for the given input kind.

    Args:
        input_kind: ``"summary"`` (hashed via ``summary_schema_hash``)
            or ``"tensor"`` (hashed via ``tensor_schema_hash``).

    Raises:
        ValueError: if ``input_kind`` is neither of the two.
    """
    # Reject unknown kinds first; the happy path is then a two-way choice.
    if input_kind not in ("summary", "tensor"):
        raise ValueError(f"unknown input_kind: {input_kind}")
    if input_kind == "summary":
        return summary_schema_hash()
    return tensor_schema_hash()
@dataclass
class CheckpointHeader:
    """Generic header — same for every model, written to the JSON file.

    ``save_checkpoint`` builds one instance per saved model and dumps
    ``to_dict()`` as the ``.ckpt.json`` file; ``load_checkpoint`` reads
    the same keys back from that file.
    """

    version: int  # CHECKPOINT_VERSION at save time; compared again on load
    name: str  # registry name: "gbt" | "mlp" | "cnn" | ...
    mode: str  # "realistic" | "oracle"
    input_kind: str  # "summary" | "tensor"
    schema_hash: str  # expected_schema_hash(input_kind) at save time
    n_classes: int
    phases: list[str]  # first n_classes entries of PHASES
    keep_mask: list[bool]  # model.keep_mask converted to plain bools
    standardize: dict  # model.standardize.to_dict()
    sidecar: str  # sidecar filename: .pt, .xgb.json or .knn.pkl
    pca_proj: list[list[float]] | None  # (n_keep_features_or_channels, 2) or None
    config: dict  # model-specific config (depth, hidden, ...)
    train_meta: dict  # split recipe + config + metric on val

    def to_dict(self) -> dict:
        """Return the header as a plain dict via ``dataclasses.asdict``."""
        return asdict(self)
def make_keep_mask(input_kind: str, mode: str) -> np.ndarray:
    """Per-feature or per-channel keep mask for the given mode.

    Args:
        input_kind: ``"summary"`` selects ``in_deployment_mask()``;
            ``"tensor"`` selects ``channel_in_deployment_mask()``.
        mode: ``"realistic"`` returns the deployment mask exactly as the
            helper produced it; ``"oracle"`` returns an all-ones array
            of the same shape and dtype (``np.ones_like``).

    Returns:
        The keep mask as a numpy array.

    Raises:
        ValueError: if ``input_kind`` or ``mode`` is not one of the
            values above.
    """
    # Validate both arguments up front. Previously any input_kind other
    # than "summary" silently fell through to the tensor mask, unlike
    # expected_schema_hash, which rejects unknown kinds.
    if input_kind not in ("summary", "tensor"):
        raise ValueError(f"unknown input_kind: {input_kind}")
    if mode not in ("realistic", "oracle"):
        raise ValueError(f"unknown mode: {mode}")

    if input_kind == "summary":
        full = in_deployment_mask()
    else:
        full = channel_in_deployment_mask()

    if mode == "realistic":
        return full
    return np.ones_like(full)
def save_checkpoint(
    model: BaseModel,
    *,
    path: Path,  # base path; .ckpt.json appended if absent
    name: str,
    mode: str,
    config: dict,
    train_meta: dict,
    pca_proj: np.ndarray | None = None,
) -> Path:
    """Persist a model + its schema header. Returns the JSON path.

    The model-specific weights go to a sidecar file next to the header
    (see ``_write_sidecar``); the header itself is written as indented
    JSON to ``<base>.ckpt.json``.

    Args:
        model: fitted model; must have ``standardize`` and ``keep_mask``
            set, and expose ``input_kind`` and ``n_classes``.
        path: base path, with or without a trailing ``.ckpt.json``.
        name: registry name stored in the header.
        mode: mode string stored in the header.
        config: model-specific config stored in the header.
        train_meta: training metadata stored in the header.
        pca_proj: optional projection matrix, stored via ``tolist()``.

    Returns:
        Path of the written ``.ckpt.json`` file.

    Raises:
        ValueError: if ``model.standardize`` or ``model.keep_mask`` is
            ``None``, or if ``model.input_kind`` is not a known kind.
    """
    # Run every cheap check before touching the filesystem, so a
    # rejected model leaves neither a new directory nor an orphaned
    # sidecar without a header.
    if model.standardize is None:
        raise ValueError("model.standardize must be fit before saving")
    if model.keep_mask is None:
        raise ValueError("model.keep_mask must be set before saving")
    schema_hash = expected_schema_hash(model.input_kind)

    base = Path(str(path).removesuffix(".ckpt.json"))
    base.parent.mkdir(parents=True, exist_ok=True)
    sidecar_filename = _write_sidecar(model, base=base)

    header = CheckpointHeader(
        version=CHECKPOINT_VERSION,
        name=name,
        mode=mode,
        input_kind=model.input_kind,
        schema_hash=schema_hash,
        n_classes=model.n_classes,
        phases=list(PHASES[: model.n_classes]),
        keep_mask=[bool(b) for b in np.asarray(model.keep_mask).tolist()],
        standardize=model.standardize.to_dict(),
        sidecar=sidecar_filename,
        pca_proj=(pca_proj.tolist() if pca_proj is not None else None),
        config=config,
        train_meta=train_meta,
    )

    # NOTE(review): with_suffix replaces any extension already on the
    # base name, so a dotted name such as "run.v2" is saved as
    # "run.ckpt.json". load_checkpoint resolves paths with the same
    # rule, so it is deliberately left unchanged here.
    json_path = base.with_suffix(".ckpt.json")
    json_path.write_text(json.dumps(header.to_dict(), indent=2) + "\n")
    return json_path
def _write_sidecar(model: BaseModel, *, base: Path) -> str:
    """Persist the model-specific weights. Returns the sidecar filename.

    The file format is whatever the model's ``save_sidecar(path)``
    writes; this function only chooses the extension from
    ``model.__model_name__``: ``.xgb.json`` for ``gbt``, ``.knn.pkl``
    for ``knn`` / ``knn_semi``, and ``.pt`` for everything else.

    Returns:
        The bare filename (no directory) of the sidecar.
    """
    # Extensions for models that do not use the default torch sidecar.
    suffix_by_model = {
        "gbt": ".xgb.json",
        "knn": ".knn.pkl",
        "knn_semi": ".knn.pkl",
    }
    extension = suffix_by_model.get(model.__model_name__, ".pt")
    sidecar_path = base.with_suffix(extension)
    model.save_sidecar(sidecar_path)
    return sidecar_path.name
def load_checkpoint(path: Path, *, device: str = "auto") -> BaseModel:
    """Load a checkpoint with schema verification.

    Reads the JSON header, checks its version and schema hash against
    the current code, then hands the header plus a model-specific
    payload to the registered class's ``from_checkpoint``.

    Args:
        path: header path; if its suffix is not ``.json`` the suffix is
            replaced with ``.ckpt.json``.
        device: for torch-backed models, ``"auto"`` resolves to
            ``"cuda"`` when available, else ``"cpu"``. For ``gbt`` /
            ``knn`` / ``knn_semi`` the value is passed through as given.

    Raises:
        ValueError: if the header version differs from
            ``CHECKPOINT_VERSION`` or the stored schema hash differs
            from what ``_features.py`` currently produces. This is the
            guarantee that a model only ever sees inputs in the layout
            it was trained on.
    """
    header_file = Path(str(path))
    if header_file.suffix != ".json":
        header_file = header_file.with_suffix(".ckpt.json")
    header = json.loads(header_file.read_text())

    file_version = header.get("version")
    if file_version != CHECKPOINT_VERSION:
        raise ValueError(
            f"checkpoint version mismatch: file={file_version} "
            f"expected={CHECKPOINT_VERSION}"
        )

    expected = expected_schema_hash(header["input_kind"])
    stored = header["schema_hash"]
    if stored != expected:
        raise ValueError(
            f"schema hash mismatch for {header_file}: "
            f"\n file: {stored}"
            f"\n current: {expected}"
            "\nThe channel/feature registry has changed since this model "
            "was trained. Retrain or pin the registry."
        )

    model_name = header["name"]
    model_cls = get_model(model_name)
    # The sidecar is resolved next to the header file by bare filename.
    sidecar_file = header_file.with_name(header["sidecar"])

    payload: dict[str, Any]
    if model_name not in ("gbt", "knn", "knn_semi"):
        import torch

        if device == "auto":
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # NOTE(review): weights_only=False lets torch.load unpickle
        # arbitrary objects; only load sidecars from trusted sources.
        payload = torch.load(sidecar_file, map_location=device, weights_only=False)
        payload["_device"] = device
    else:
        # File-path loaders (XGBoost JSON, sklearn pickle); they open
        # the sidecar themselves rather than receiving torch tensors.
        payload = {"sidecar_path": str(sidecar_file)}
    return model_cls.from_checkpoint(header, payload, device=device)
def load_header(path: Path) -> dict:
    """Read just the JSON header (no weights). For inventories / registries.

    Args:
        path: header path; if its suffix is not ``.json`` the suffix is
            replaced with ``.ckpt.json`` before reading.

    Returns:
        The parsed JSON content of the header file.
    """
    candidate = Path(str(path))
    if candidate.suffix == ".json":
        header_file = candidate
    else:
        header_file = candidate.with_suffix(".ckpt.json")
    with header_file.open() as handle:
        return json.load(handle)