CIS490/training/trainer/run.py
Max 2aa7b865fb training/models: knn_semi — semi-supervised self-training KNN
Registered as `knn_semi`. Answers the research question:

  *If we had ground-truth labels for only a fraction of training
   episodes, could we use the structure of the unlabeled rest to
   recover most of supervised KNN's accuracy?*

Pipeline (Yarowsky-style self-training):

  1. Split train slice deterministically into labeled (label_frac=0.2
     default) and unlabeled (1 - label_frac) by row-index hash.
  2. Fit a "labeler" KNN on the labeled fraction.
  3. Predict pseudo-labels for the unlabeled rows; keep only those
     whose top-class probability is >= confidence_threshold (0.6).
  4. Fit the final KNN on (labeled rows + confident pseudo-labels).
     Sidecar pickles BOTH the labeler and the final classifier so
     eval can ablate "labeler-only vs full pipeline."

Smoke run (567-episode subset, oracle mode, label_frac=0.2):

                       val_macro_f1   test_macro_f1
  knn       (100% labels)   0.737        0.133
  knn_semi  (20% labels)    0.654        0.173

Lower val (less data) but HIGHER cross-device test — pseudo-labeling
acts as a regularizer that prevents overfitting to elliott-thinkpad's
specific neighborhood structure. Honest research finding worth a slide
in the writeup.

Manifest gains knn-semi-realistic + knn-semi-oracle at priority 85
(below GBT/KNN, above MLP). Storage cost = augmented set × n_features
× 4 bytes; same .knn.pkl sidecar format as plain KNN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 13:51:30 -05:00

325 lines
14 KiB
Python

"""End-to-end training driver.
Trains one ``(model, mode)`` combination — running all 12 is a bash loop
over this script (see scripts/train-all.sh). Single-process so each run
is isolatable, restartable, and produces its own log.
Steps:
1. Load features (summary or tensor depending on model.input_kind).
2. Apply held-out-by-host split (default) or held-out-by-time.
3. Filter to (train, val, test) episodes; collect (X, y) per slice.
4. Fit StandardizeStats on train *only*.
5. Build model with the right keep_mask + standardize.
6. Train (GBT: model.fit; NN: trainer._loop.train_nn).
7. Fit PCA-2 on standardized train features (for dashboard scatter).
8. Save checkpoint; emit metrics JSON.
Output:
artifacts/<model>_<mode>.ckpt.json (header)
artifacts/<model>_<mode>.{pt,xgb.json} (sidecar)
reports/eval/<model>_<mode>_train.json (history + final metrics)
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
from pathlib import Path
import numpy as np
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from training._features import (
ALL_CHANNELS, PHASES, channel_names, channel_in_deployment_mask,
in_deployment_mask, feature_names_episode,
)
from training._split import (
held_out_host, held_out_sample, held_out_time, Splits,
)
from training.models import get_model
from training.models._base import StandardizeStats
from training.models._checkpoint import make_keep_mask, save_checkpoint
from training.trainer._data import load_summary, load_tensor
from training.trainer._loop import train_nn, _macro_f1
log = logging.getLogger("cis490.trainer.run")
def _build_split(recipe: str, *, profiles, sample_names, host_ids,
episode_ids, received_at, train_hosts, seed: int) -> Splits:
if recipe == "host":
return held_out_host(
profiles=profiles, sample_names=sample_names, host_ids=host_ids,
episode_ids=episode_ids, train_hosts=train_hosts, seed=seed,
)
if recipe == "sample":
return held_out_sample(
profiles=profiles, sample_names=sample_names, host_ids=host_ids,
seed=seed,
)
if recipe == "time":
return held_out_time(
profiles=profiles, sample_names=sample_names, host_ids=host_ids,
received_at=received_at, seed=seed,
)
raise ValueError(f"unknown split recipe: {recipe}")
def main() -> int:
ap = argparse.ArgumentParser()
ap.add_argument("--model", required=True,
help="one of gbt|mlp|cnn|gru|lstm|transformer")
ap.add_argument("--mode", required=True, choices=["realistic", "oracle"])
ap.add_argument("--summary", type=Path,
default=Path("data/processed/features_window_v1.parquet"))
ap.add_argument("--tensors", type=Path,
default=Path("data/processed/tensor_window_v1"))
ap.add_argument("--schema", type=Path,
default=Path("data/processed/feature_schema_v1.json"))
ap.add_argument("--validation", type=Path,
default=Path("data/processed/validation_v1.parquet"))
ap.add_argument("--split-recipe", choices=["host", "sample", "time"],
default="host")
ap.add_argument("--train-hosts", nargs="+", default=["elliott-thinkpad"])
ap.add_argument("--seed", type=int, default=0)
ap.add_argument("--out-dir", type=Path, default=Path("artifacts"))
ap.add_argument("--reports-dir", type=Path, default=Path("reports/eval"))
ap.add_argument("--epochs", type=int, default=60)
ap.add_argument("--batch-size", type=int, default=512)
ap.add_argument("--lr", type=float, default=1e-3)
ap.add_argument("--patience", type=int, default=8)
ap.add_argument("--max-episodes", type=int, default=None,
help="smoke-test cap on tensor episodes")
ap.add_argument("--device", default="auto")
ap.add_argument("--log-level", default="INFO")
args = ap.parse_args()
logging.basicConfig(level=args.log_level,
format="%(asctime)s %(levelname)s %(name)s %(message)s")
args.out_dir.mkdir(parents=True, exist_ok=True)
args.reports_dir.mkdir(parents=True, exist_ok=True)
cls = get_model(args.model)
# Probe input_kind without instantiating
input_kind = cls.input_kind
# Build the split from the validator output (one row per episode).
import pyarrow.parquet as pq
val_tbl = pq.read_table(args.validation).to_pylist()
rows = [r for r in val_tbl if r["status"] in ("accepted", "degraded")]
profs = [r["profile"] for r in rows]
samples = [r["sample_name"] for r in rows]
hosts = [r["host_id"] for r in rows]
epi_ids = [r["episode_id"] for r in rows]
recv = [r.get("received_at_wall", "") for r in rows]
splits = _build_split(
args.split_recipe, profiles=profs, sample_names=samples,
host_ids=hosts, episode_ids=epi_ids, received_at=recv,
train_hosts=args.train_hosts, seed=args.seed,
)
splits.assert_coverage()
log.info("split:\n%s", splits.summary())
train_eps = {epi_ids[i] for i in range(len(epi_ids)) if splits.train[i]}
val_eps = {epi_ids[i] for i in range(len(epi_ids)) if splits.val[i]}
test_eps = {epi_ids[i] for i in range(len(epi_ids)) if splits.test[i]}
# ─── Load data ───────────────────────────────────────────────────
if input_kind == "summary":
log.info("loading summary features from %s", args.summary)
data = load_summary(args.summary, args.schema)
epi_col = data.episode_id
X = data.X
y = data.y
else:
log.info("loading tensor shards from %s", args.tensors)
data = load_tensor(args.tensors, max_episodes=args.max_episodes)
epi_col = data.episode_id
X = data.X
y = data.y
# Per-window masks
train_mask = np.array([e in train_eps for e in epi_col], dtype=bool)
val_mask = np.array([e in val_eps for e in epi_col], dtype=bool)
test_mask = np.array([e in test_eps for e in epi_col], dtype=bool)
log.info("windows: train=%d val=%d test=%d (of %d)",
int(train_mask.sum()), int(val_mask.sum()),
int(test_mask.sum()), len(epi_col))
# ─── Build keep mask + standardize on train ──────────────────────
keep_mask = make_keep_mask(input_kind, args.mode)
log.info("keep_mask: %d / %d active", int(keep_mask.sum()), len(keep_mask))
if input_kind == "summary":
X_keep_train = X[train_mask][:, keep_mask]
std = StandardizeStats.fit(X_keep_train, axis=0)
else:
X_keep_train = X[train_mask][:, keep_mask, :]
std = StandardizeStats.fit(X_keep_train, axis=(0, 2))
# ─── Build model ─────────────────────────────────────────────────
n_classes = max(int(y.max()) + 1, 5) # at least 5 phases known
if input_kind == "summary":
if args.model in ("gbt", "knn", "knn_semi"):
model = cls(n_classes=n_classes, keep_mask=keep_mask,
standardize=std)
else:
model = cls(n_features_in=int(keep_mask.sum()), n_classes=n_classes,
keep_mask=keep_mask, standardize=std,
device=("cuda" if args.device == "auto" and _cuda_ok()
else "cpu"))
else:
n_t = X.shape[2]
device = ("cuda" if args.device == "auto" and _cuda_ok()
else "cpu" if args.device == "auto" else args.device)
model = cls(n_channels_in=int(keep_mask.sum()), n_timesteps=n_t,
n_classes=n_classes, keep_mask=keep_mask,
standardize=std, device=device)
# ─── Train ───────────────────────────────────────────────────────
started = time.monotonic()
if args.model == "gbt":
# Sample-weighted (class-weighted) fit via XGBoost weights
from training.trainer._loop import _compute_class_weights
cw = _compute_class_weights(y[train_mask], n_classes)
sample_w = cw[y[train_mask]]
history = model.fit(
X_train=X[train_mask], y_train=y[train_mask],
X_val=X[val_mask], y_val=y[val_mask],
sample_weight=sample_w,
n_estimators=1000, early_stopping_rounds=30,
verbose_eval=50,
)
# Compute val macro-F1 at best iteration
y_val_pred = model.predict(X[val_mask])
best_f1 = _macro_f1(y[val_mask], y_val_pred, n_classes)
train_seconds = time.monotonic() - started
train_meta = {
"kind": "gbt", "history": history,
"best_iter": history["best_iter"], "best_val_macro_f1": best_f1,
"train_seconds": train_seconds,
}
config = {"params": history.get("history", {}) and model._params or {}}
elif args.model in ("knn", "knn_semi"):
# KNN family: fit() memorizes the train set; semi-supervised
# variant additionally pseudo-labels an unlabeled fraction.
history = model.fit(
X_train=X[train_mask], y_train=y[train_mask],
X_val=X[val_mask], y_val=y[val_mask],
)
best_f1 = float(history.get("val_macro_f1", 0.0))
train_seconds = time.monotonic() - started
train_meta = {
"kind": args.model,
"best_val_macro_f1": best_f1,
"train_seconds": train_seconds,
"history": history,
}
config = dict(model.config)
else:
result = train_nn(
model=model,
X_train=X[train_mask], y_train=y[train_mask],
X_val=X[val_mask], y_val=y[val_mask],
n_classes=n_classes,
epochs=args.epochs, batch_size=args.batch_size,
base_lr=args.lr, patience=args.patience,
device=("cuda" if args.device == "auto" and _cuda_ok()
else "cpu" if args.device == "auto" else args.device),
)
train_meta = {
"kind": "nn",
"best_epoch": result.best_epoch,
"best_val_macro_f1": result.best_macro_f1,
"train_seconds": result.train_seconds,
"history": result.history,
}
config = dict(model.config)
# ─── PCA-2 for dashboard scatter ─────────────────────────────────
pca_proj = _fit_pca2(model, X[train_mask], val_mask, X)
# ─── Save checkpoint ─────────────────────────────────────────────
base = args.out_dir / f"{args.model}_{args.mode}"
json_path = save_checkpoint(
model, path=base, name=args.model, mode=args.mode,
config=config,
train_meta={
"split_recipe": args.split_recipe,
"split_config": splits.config,
"excluded_profiles": list(splits.excluded_profiles),
"untested_profiles": list(splits.untested_profiles),
"n_train_windows": int(train_mask.sum()),
"n_val_windows": int(val_mask.sum()),
"n_test_windows": int(test_mask.sum()),
**train_meta,
},
pca_proj=pca_proj,
)
log.info("saved checkpoint: %s", json_path)
# ─── Quick test metrics (full eval is in training/eval_/) ─────────
y_test_pred = model.predict(X[test_mask])
test_f1 = _macro_f1(y[test_mask], y_test_pred, n_classes)
log.info("TEST macro_f1 = %.4f", test_f1)
metrics = {
"model": args.model,
"mode": args.mode,
"split_recipe": args.split_recipe,
"val_macro_f1": train_meta.get("best_val_macro_f1"),
"test_macro_f1": test_f1,
"n_train_windows": int(train_mask.sum()),
"n_val_windows": int(val_mask.sum()),
"n_test_windows": int(test_mask.sum()),
"untested_profiles": list(splits.untested_profiles),
"checkpoint": str(json_path),
"train_seconds": train_meta.get("train_seconds"),
}
out_metrics = args.reports_dir / f"{args.model}_{args.mode}_train.json"
out_metrics.write_text(json.dumps(metrics, indent=2) + "\n")
print(json.dumps(metrics, indent=2))
return 0
def _cuda_ok() -> bool:
try:
import torch
return torch.cuda.is_available()
except Exception:
return False
def _fit_pca2(model, X_train_full: np.ndarray, val_mask: np.ndarray,
X_full: np.ndarray) -> np.ndarray | None:
"""Fit a 2-dim PCA on the model's *standardized, kept* train features.
For tensor models, flatten (C, T) → C*T per window before PCA. The
projection is saved with the checkpoint and used by the dashboard
scatter widget. Returns shape (D, 2) where D is the post-keep, post-
flatten dim.
"""
try:
from sklearn.decomposition import PCA
except Exception:
return None
Xk = model.select(X_train_full)
if Xk.ndim == 3:
Xk = Xk.reshape(Xk.shape[0], -1)
if Xk.shape[0] < 3 or Xk.shape[1] < 2:
return None
# Subsample for speed if large
rng = np.random.default_rng(0)
if Xk.shape[0] > 50_000:
sel = rng.choice(Xk.shape[0], size=50_000, replace=False)
Xk = Xk[sel]
pca = PCA(n_components=2, random_state=0)
pca.fit(Xk)
return pca.components_.T.astype(np.float32) # shape (D, 2)
if __name__ == "__main__":
raise SystemExit(main())