The model layer of the project, built honestly:
- tools/dataset_validate.py — full-sweep validator over the receiver
store (sha256, schema, monotonic labels, telemetry-row gate). On the
current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
is committed as the per-episode acceptance index.
- training/_features.py — channel registry (46 channels across
proc/guest/qmp/netflow), summary-stat windowing AND channel×time
tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
(Unix ns) — tested fix for a real netflow-vs-host clock-base
inconsistency that was silently dropping every netflow channel.
- training/_split.py — three held-out recipes (host / sample / time)
with profile-stratification assertions. held_out_host carries
untested_profiles for cases like scan-and-dial absent from the test
host (5 of 6 profiles tested cross-device, never silently averaged).
- training/models/ — 6 architectures behind a common BaseModel
interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
trained twice (realistic / oracle) per the deployment threat model.
Schema-hashed checkpoints refuse to load if _features.py changed
since training (silent-input-drift protection, tested).
- training/trainer/ — unified training loop: class-weighted CE, LR
warmup + cosine, gradient clipping, mixed precision when CUDA,
early stopping on val macro F1, best-on-val checkpoint. Same loop
runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
early_stopping_rounds on val mlogloss.
- training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
per-profile and per-host breakdown, paired-bootstrap significance
for model-vs-model gap. Confusion matrix uses union of seen labels.
- training/dashboard/producers/ — replay/metrics/perf/profiles
emitting the six event types the dashboard's awaiting scenes
consume; on-demand tensor extraction so the Pi can run live
inference without 65 GB of shards.
- 17 unit tests (split coverage, features round-trip, schema mismatch,
determinism, time-base alignment regression).
All six models were smoke-trained end-to-end on a 567-episode subset;
held-out test macro F1 is reported with paired-bootstrap significance.
The methodology now reports honest cross-device generalization rather
than in-distribution validation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
127 lines
4.4 KiB
Python
127 lines
4.4 KiB
Python
"""Dataset loaders for the trainer.
|
|
|
|
Two flavors matching the model input kinds:

    load_summary(...) → SummaryData (X[N,F] float32, y[N] int64,
                        per-window metadata)
                        from features_window_v1.parquet
    load_tensor(...)  → TensorData (X[N,C,T] float32, mask[N,C,T] bool,
                        y[N] int64, per-window metadata)
                        from tensor_window shards (one .npz per episode)

Both return episode-level metadata (episode_id, host_id, profile,
sample_name) that the split machinery needs.

Tensor data can be huge (~12 GB at the full dataset). For this reason:

    - load_tensor() always materializes everything in RAM; lazy loading
      (a generator over batches) is meant to go through
      load_tensor_lazy(), which load_tensor's docstring refers to
    - load_tensor(..., max_episodes=N) for smoke tests
    - the trainer can choose RAM vs disk based on data size
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pyarrow.parquet as pq
|
|
|
|
|
|
@dataclass
|
|
class SummaryData:
|
|
X: np.ndarray # (N, F) float32
|
|
y: np.ndarray # (N,) int64
|
|
feature_names: list[str]
|
|
episode_id: list[str]
|
|
host_id: list[str]
|
|
profile: list[str]
|
|
sample_name: list[str]
|
|
t_center: np.ndarray | None = None
|
|
|
|
|
|
@dataclass
|
|
class TensorData:
|
|
X: np.ndarray # (N, C, T) float32
|
|
mask: np.ndarray # (N, C, T) bool
|
|
y: np.ndarray # (N,) int64
|
|
channel_names: list[str]
|
|
episode_id: list[str]
|
|
host_id: list[str]
|
|
profile: list[str]
|
|
sample_name: list[str]
|
|
t_center: np.ndarray | None = None
|
|
|
|
|
|
def load_summary(window_parquet: Path, schema_path: Path) -> SummaryData:
    """Read the entire features_window_v1.parquet into RAM.

    For a multi-GB parquet on a small box, pass column subsets via
    pyarrow's dataset.dataset(...) instead. For this project we expect
    < 5 GB summary parquet which fits in 32 GB workstation RAM.

    Args:
        window_parquet: Path of the window-level parquet file to read.
        schema_path: Path of a JSON file whose ``"feature_names"`` entry
            lists the feature columns; their order becomes the column
            order of ``X``.

    Returns:
        A SummaryData whose ``X`` is the float32 stack of the feature
        columns, ``y`` is the ``phase`` column as int64, ``t_center`` is
        the ``t_center_s`` column as float64, and whose metadata lists
        come from the ``episode_id`` / ``host_id`` / ``profile`` /
        ``sample_name`` columns.

    Raises:
        ValueError: If the schema lists no feature names.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    schema = json.loads(schema_path.read_text(encoding="utf-8"))
    feat_names = schema["feature_names"]
    if not feat_names:
        # np.column_stack([]) would fail later with an opaque message,
        # after the parquet had already been read.
        raise ValueError(f"schema {schema_path} lists no feature_names")

    meta_names = ["phase", "episode_id", "host_id",
                  "profile", "sample_name", "t_center_s"]
    columns = feat_names + meta_names

    # Read only the columns that are actually used.
    tbl = pq.read_table(window_parquet, columns=columns)
    cols = {n: tbl.column(n).to_numpy(zero_copy_only=False) for n in columns}

    X = np.column_stack([cols[n] for n in feat_names]).astype(np.float32)
    y = cols["phase"].astype(np.int64)
    return SummaryData(
        X=X,
        y=y,
        feature_names=feat_names,
        episode_id=list(cols["episode_id"]),
        host_id=list(cols["host_id"]),
        profile=list(cols["profile"]),
        sample_name=list(cols["sample_name"]),
        t_center=cols["t_center_s"].astype(np.float64),
    )
|
|
|
|
|
|
def load_tensor(shards_root: Path, *, max_episodes: int | None = None
                ) -> TensorData:
    """Load all tensor shards into RAM as one big (N, C, T) array.

    Each shard is a .npz with keys:
        X, mask, y, t_center, episode_id, host_id, profile, sample_name,
        channel_names (only stored once per shard)

    Shards are discovered recursively under ``shards_root`` and processed
    in sorted path order. Shards with zero windows are skipped. The
    channel names are taken from the first shard opened.

    For datasets larger than RAM, use load_tensor_lazy() instead.
    NOTE(review): load_tensor_lazy() is not defined in the part of the
    file reviewed here; confirm it exists.

    Args:
        shards_root: Directory searched recursively for ``*.npz`` shards.
        max_episodes: If given, only the first ``max_episodes`` shard
            paths (after sorting) are loaded; intended for smoke tests.

    Returns:
        A TensorData with the shards' windows concatenated along axis 0.
        Per-shard metadata is repeated once per window of that shard.

    Raises:
        FileNotFoundError: If no shard paths are selected.
        ValueError: If every selected shard contains zero windows.
    """
    paths = sorted(Path(shards_root).rglob("*.npz"))
    if max_episodes is not None:
        paths = paths[:max_episodes]
    if not paths:
        raise FileNotFoundError(f"no tensor shards under {shards_root}")

    Xs, Ms, ys, centers = [], [], [], []
    epi_ids, hosts, profs, samples = [], [], [], []
    channel_names: list[str] | None = None

    for p in paths:
        # NOTE(security): allow_pickle=True lets a shard execute arbitrary
        # code while being loaded. Only point this at shards produced by
        # the project's own pipeline, never at untrusted files.
        with np.load(p, allow_pickle=True) as f:
            if channel_names is None:
                # .tolist() yields plain Python values instead of NumPy
                # scalar strings.
                channel_names = f["channel_names"].tolist()

            # NpzFile reads the member from disk on every access, so
            # fetch X once and reuse it for both the shape and the data.
            x = f["X"]
            n_w = x.shape[0]
            if n_w == 0:
                continue

            Xs.append(x)
            Ms.append(f["mask"])
            ys.append(f["y"])
            centers.append(f["t_center"])

            # Each shard's metadata is per-episode (1 value broadcast over
            # its n_w windows).
            epi_ids.extend([str(f["episode_id"])] * n_w)
            hosts.extend([str(f["host_id"])] * n_w)
            profs.extend([str(f["profile"])] * n_w)
            samples.extend([str(f["sample_name"])] * n_w)

    if not Xs:
        # np.concatenate([]) would raise a ValueError with no context.
        raise ValueError(
            f"all {len(paths)} tensor shards under {shards_root} "
            "contain zero windows"
        )

    X = np.concatenate(Xs, axis=0)
    M = np.concatenate(Ms, axis=0)
    y = np.concatenate(ys, axis=0).astype(np.int64)
    t = np.concatenate(centers, axis=0).astype(np.float64)
    return TensorData(
        X=X.astype(np.float32, copy=False),
        mask=M,
        y=y,
        channel_names=channel_names or [],
        episode_id=epi_ids,
        host_id=hosts,
        profile=profs,
        sample_name=samples,
        t_center=t,
    )
|