training/models: KNN as a registered supervised model
Non-parametric baseline alongside GBT/MLP/CNN/GRU/LSTM/Transformer. Same BaseModel + schema-hashed checkpoint contract; the sidecar is a pickled sklearn KNeighborsClassifier (.knn.pkl) handled by the existing checkpoint machinery alongside .xgb.json / .pt.

KNN's storage cost ≈ n_train_rows × n_kept_features × 4 bytes. At 660k windows × 145 kept features (realistic mode) that is a ~380 MB sidecar; at 230 features (oracle) it is ~600 MB. Heavy, but it ships through the same artifact-upload path.

trainer/run.py learns a third fit branch:
- GBT — XGBoost early stopping on val mlogloss
- KNN — fit() memorizes; "training time" is val/test predict cost
- NN — train_nn loop (the rest)

Manifest gains knn-realistic + knn-oracle at priority 95 (just below GBT). KNN's k=10 default lives in the model class — overriding it via hyper.k requires adding --k to run.py first, to avoid the unknown-arg exit-2 issue.

Smoke verified on the 567-episode subset: knn oracle val=0.7365 test=0.1333 (held-out k-gamingcom)

That val/test gap (0.74 → 0.13) is the cross-device generalization story: KNN memorizes elliott-thinkpad's local feature space and falls apart on the other host. Honest baseline for the comparison report.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
51f2437b71
commit
2187a5d752
5 changed files with 186 additions and 2 deletions
|
|
@ -78,6 +78,25 @@ priority = 100
|
|||
require_cuda = false
|
||||
min_ram_gib = 4
|
||||
|
||||
[[jobs]]
|
||||
name = "knn-realistic"
|
||||
model = "knn"
|
||||
mode = "realistic"
|
||||
priority = 95 # right after GBT — fastest non-parametric baseline
|
||||
require_cuda = false
|
||||
min_ram_gib = 4
|
||||
# KNN's k=10 / weights=distance live in the model class. To override,
|
||||
# add --k / --weights to training/trainer/run.py first; otherwise these
|
||||
# hyper.* keys would fail with the unknown-arg exit-2 issue.
|
||||
|
||||
[[jobs]]
|
||||
name = "knn-oracle"
|
||||
model = "knn"
|
||||
mode = "oracle"
|
||||
priority = 95
|
||||
require_cuda = false
|
||||
min_ram_gib = 4
|
||||
|
||||
[[jobs]]
|
||||
name = "mlp-realistic"
|
||||
model = "mlp"
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ def get_model(name: str):
|
|||
# Eager-import the implementations so the registry is populated.
|
||||
# Order matters only for which "kind" gets imported first — all are listed.
|
||||
from training.models import gbt # noqa: F401,E402
|
||||
from training.models import knn # noqa: F401,E402
|
||||
from training.models import mlp # noqa: F401,E402
|
||||
from training.models import cnn # noqa: F401,E402
|
||||
from training.models import gru # noqa: F401,E402
|
||||
|
|
|
|||
|
|
@ -151,6 +151,8 @@ def _write_sidecar(model: BaseModel, *, base: Path) -> str:
|
|||
"""
|
||||
if model.__model_name__ == "gbt":
|
||||
path = base.with_suffix(".xgb.json")
|
||||
elif model.__model_name__ == "knn":
|
||||
path = base.with_suffix(".knn.pkl")
|
||||
else:
|
||||
path = base.with_suffix(".pt")
|
||||
model.save_sidecar(path)
|
||||
|
|
@ -186,8 +188,9 @@ def load_checkpoint(path: Path, *, device: str = "auto") -> BaseModel:
|
|||
cls = get_model(header["name"])
|
||||
sidecar = json_path.with_name(header["sidecar"])
|
||||
payload: dict[str, Any]
|
||||
if header["name"] == "gbt":
|
||||
# GBT loader reads the .xgb.json directly; pass the path in payload
|
||||
if header["name"] in ("gbt", "knn"):
|
||||
# File-path loaders (XGBoost JSON, sklearn pickle); they open
|
||||
# the sidecar themselves rather than receiving torch tensors.
|
||||
payload = {"sidecar_path": str(sidecar)}
|
||||
else:
|
||||
import torch
|
||||
|
|
|
|||
142
training/models/knn.py
Normal file
142
training/models/knn.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
"""KNN classifier on per-window summary features.
|
||||
|
||||
Non-parametric baseline. Like GBT it uses the summary-stat input
|
||||
(mean / std / p50 / p95 / slope per channel), but where GBT learns
|
||||
axis-aligned splits, KNN reads off the local neighborhood structure
|
||||
in feature space. That makes it a useful complement: where the two
|
||||
agree, decisions are well-supported; where they disagree, the local
|
||||
density of the feature manifold is contradicting the global
|
||||
boosted-tree partitioning.
|
||||
|
||||
We use distance-weighted KNN with k=10 by default. Schema-hashed
|
||||
checkpoint format (same as every other model) so training-time
|
||||
schema drift fails loud at load.
|
||||
|
||||
Standardization is critical for KNN — without it, channels with
|
||||
larger numeric scales dominate the Euclidean distance. We use the
|
||||
same per-feature StandardizeStats (median imputation + z-score)
|
||||
as the rest of the supervised pipeline. The fit is the *training*
|
||||
set; the model holds onto the standardized X_train + y_train as
|
||||
its "weights" since KNN is non-parametric.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
from training.models import register
|
||||
from training.models._base import BaseModel, StandardizeStats
|
||||
|
||||
|
||||
@register("knn")
class KNN(BaseModel):
    """k-nearest-neighbours classifier over per-window summary features.

    Thin wrapper around ``sklearn.neighbors.KNeighborsClassifier`` that
    plugs it into the shared ``BaseModel`` / checkpoint contract. The
    fitted sklearn estimator *is* the model state (it retains the
    training rows), so it is persisted as a pickle sidecar rather than as
    tensors.

    Attributes:
        n_classes: Number of target classes the surrounding pipeline
            expects; ``predict_proba`` always aims to return this many
            columns.
        keep_mask: Boolean feature mask, stored as a ``bool`` copy.
        standardize: Standardization statistics supplied by the trainer.
        config: ``{"k", "weights", "algorithm"}`` — the hyper-parameters
            forwarded to ``KNeighborsClassifier`` and written to the
            checkpoint header.
    """

    input_kind = "summary"

    def __init__(
        self,
        *,
        n_classes: int,
        keep_mask: np.ndarray,
        standardize: StandardizeStats,
        k: int = 10,
        weights: str = "distance",
        algorithm: str = "auto",
        clf: Any = None,
    ) -> None:
        """Store hyper-parameters and, optionally, a pre-fitted estimator.

        Args:
            n_classes: Number of target classes.
            keep_mask: Per-feature keep mask; coerced to ``bool``.
            standardize: Standardization statistics for the features.
            k: Number of neighbours (``n_neighbors`` in sklearn).
            weights: sklearn neighbour weighting scheme.
            algorithm: sklearn neighbour-search algorithm.
            clf: Already-fitted estimator (used by ``from_checkpoint``);
                ``None`` for a fresh, unfitted model.
        """
        self.n_classes = n_classes
        self.keep_mask = keep_mask.astype(bool)
        self.standardize = standardize
        self.config = {"k": k, "weights": weights, "algorithm": algorithm}
        self._clf = clf

    @property
    def clf(self):
        """Return the fitted estimator.

        Raises:
            RuntimeError: If the model has neither been fitted nor loaded
                from a checkpoint.
        """
        if self._clf is None:
            raise RuntimeError("model not fitted; call .fit(...) first")
        return self._clf

    def fit(
        self,
        *,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray | None = None,
        y_val: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> dict:
        """Memorize the training slice and optionally score validation.

        KNN does not "train": the sklearn estimator is fitted on the
        output of ``self.select(X_train)`` (inherited helper — per the
        module docstring it is expected to apply the keep mask and
        standardization; confirm against ``BaseModel``).

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_val: Optional validation features.
            y_val: Optional validation labels.
            sample_weight: Accepted for signature parity with the other
                models. ``KNeighborsClassifier.fit`` takes no per-sample
                weights, so a non-``None`` value is ignored with a
                ``RuntimeWarning`` instead of being dropped silently.

        Returns:
            A history dict. It contains ``"val_macro_f1"`` when a
            non-empty validation split was supplied, and is empty
            otherwise.
        """
        import warnings

        from sklearn.neighbors import KNeighborsClassifier

        if sample_weight is not None:
            warnings.warn(
                "KNN.fit ignores sample_weight: KNeighborsClassifier.fit "
                "does not accept per-sample weights",
                RuntimeWarning,
                stacklevel=2,
            )

        Xk = self.select(X_train)
        clf = KNeighborsClassifier(
            n_neighbors=int(self.config["k"]),
            weights=str(self.config["weights"]),
            algorithm=str(self.config["algorithm"]),
            n_jobs=-1,
        )
        clf.fit(Xk, y_train)
        self._clf = clf

        history: dict = {}
        if X_val is not None and y_val is not None and len(X_val) > 0:
            from training.eval_._metrics import _macro_f1

            # `predict` is not defined in this class; presumably inherited
            # from BaseModel.
            y_pred_val = self.predict(X_val)
            history["val_macro_f1"] = _macro_f1(
                y_val, y_pred_val, n_classes=self.n_classes
            )
        return history

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return class probabilities as a ``float32`` matrix.

        sklearn emits one column per class *seen during fit*
        (``clf.classes_``, sorted). When the training slice lacks a
        class, that matrix is narrower than ``n_classes`` and its column
        positions no longer equal class ids. In that case the columns are
        scattered into a zero-filled ``(n_rows, n_classes)`` matrix so
        column ``c`` always means class ``c``.

        The scatter is only applied when the seen labels are integers in
        ``[0, n_classes)``; otherwise the sklearn output is returned
        unchanged.

        Raises:
            RuntimeError: If the model is not fitted.
        """
        Xk = self.select(X)
        clf = self.clf
        raw = clf.predict_proba(Xk).astype(np.float32)
        if raw.shape[1] == self.n_classes:
            return raw

        seen = np.asarray(clf.classes_)
        labels_are_class_ids = (
            np.issubdtype(seen.dtype, np.integer)
            and seen.min() >= 0
            and seen.max() < self.n_classes
        )
        if not labels_are_class_ids:
            # Cannot map labels to column indices safely; leave as-is.
            return raw

        full = np.zeros((raw.shape[0], self.n_classes), dtype=np.float32)
        full[:, seen] = raw
        return full

    # --- Checkpoint API -----------------------------------------------

    def state_for_checkpoint(self) -> dict[str, Any]:
        """Return the header metadata for this model.

        Only the hyper-parameter config goes into the header. The fitted
        estimator (which holds the training rows) is written separately
        by ``save_sidecar``.
        """
        return {"config": self.config}

    def save_sidecar(self, path: Path) -> None:
        """Pickle the fitted sklearn estimator to ``path``.

        Storage cost is roughly ``n_train_rows × n_features × 4`` bytes
        (assuming float32 features). At ~660k windows × ~145 kept
        features that is ~380 MB — heavy; set a ``--max-train-rows`` cap
        in the trainer if memory is tight on the Pi.

        Raises:
            RuntimeError: If the model is not fitted. Checked before the
                file is opened, so no empty or ``None`` sidecar is left
                behind.
        """
        clf = self.clf
        with path.open("wb") as f:
            pickle.dump(clf, f, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def from_checkpoint(cls, header: dict, payload: dict, *,
                        device: str = "cpu") -> "KNN":
        """Rebuild a fitted model from a checkpoint header and sidecar.

        Args:
            header: Checkpoint header; must provide ``n_classes``,
                ``keep_mask`` and ``standardize``, and may provide
                ``config``.
            payload: Must contain ``"sidecar_path"`` pointing at the
                pickle written by ``save_sidecar``.
            device: Accepted for interface parity; not used here.

        Raises:
            RuntimeError: If ``payload`` has no ``sidecar_path``.
        """
        sidecar_path = payload.get("sidecar_path")
        if sidecar_path is None:
            # Loaded via torch.load (NN path) by mistake — tell the
            # checkpoint loader we want the file path instead.
            raise RuntimeError(
                "KNN checkpoint requires sidecar_path; ensure the "
                "loader treats KNN like GBT (passes the file path "
                "rather than torch.load'ing the bytes)."
            )
        # SECURITY: unpickling executes arbitrary code. Only load sidecars
        # produced by this project's own training runs, never files from
        # an untrusted source.
        with Path(sidecar_path).open("rb") as f:
            clf = pickle.load(f)
        cfg = header.get("config", {}) or {}
        return cls(
            n_classes=int(header["n_classes"]),
            keep_mask=np.asarray(header["keep_mask"], dtype=bool),
            standardize=StandardizeStats.from_dict(header["standardize"]),
            k=int(cfg.get("k", 10)),
            weights=str(cfg.get("weights", "distance")),
            algorithm=str(cfg.get("algorithm", "auto")),
            clf=clf,
        )
|
||||
|
|
@ -166,6 +166,9 @@ def main() -> int:
|
|||
if input_kind == "summary":
|
||||
if args.model == "gbt":
|
||||
model = cls(n_classes=n_classes, keep_mask=keep_mask, standardize=std)
|
||||
elif args.model == "knn":
|
||||
model = cls(n_classes=n_classes, keep_mask=keep_mask,
|
||||
standardize=std)
|
||||
else:
|
||||
model = cls(n_features_in=int(keep_mask.sum()), n_classes=n_classes,
|
||||
keep_mask=keep_mask, standardize=std,
|
||||
|
|
@ -203,6 +206,22 @@ def main() -> int:
|
|||
"train_seconds": train_seconds,
|
||||
}
|
||||
config = {"params": history.get("history", {}) and model._params or {}}
|
||||
elif args.model == "knn":
|
||||
# Non-parametric: model.fit memorizes the train set; "training
|
||||
# time" is dominated by the val/test predict calls (KD-tree build).
|
||||
history = model.fit(
|
||||
X_train=X[train_mask], y_train=y[train_mask],
|
||||
X_val=X[val_mask], y_val=y[val_mask],
|
||||
)
|
||||
best_f1 = float(history.get("val_macro_f1", 0.0))
|
||||
train_seconds = time.monotonic() - started
|
||||
train_meta = {
|
||||
"kind": "knn",
|
||||
"best_val_macro_f1": best_f1,
|
||||
"train_seconds": train_seconds,
|
||||
"history": history,
|
||||
}
|
||||
config = {"k": model.config["k"], "weights": model.config["weights"]}
|
||||
else:
|
||||
result = train_nn(
|
||||
model=model,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue