The model layer of the project, built honestly:
- tools/dataset_validate.py — full-sweep validator over the receiver
store (sha256, schema, monotonic labels, telemetry-row gate). On the
current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
is committed as the per-episode acceptance index.
- training/_features.py — channel registry (46 channels across
proc/guest/qmp/netflow), summary-stat windowing AND channel×time
tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
(Unix ns) — tested fix for a real netflow-vs-host clock-base
inconsistency that was silently dropping every netflow channel.
- training/_split.py — three held-out recipes (host / sample / time)
with profile-stratification assertions. held_out_host carries
untested_profiles for cases like scan-and-dial absent from the test
host (5 of 6 profiles tested cross-device, never silently averaged).
- training/models/ — 6 architectures behind a common BaseModel
interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
trained twice (realistic / oracle) per the deployment threat model.
Schema-hashed checkpoints refuse to load if _features.py changed
since training (silent-input-drift protection, tested).
- training/trainer/ — unified training loop: class-weighted CE, LR
warmup + cosine, gradient clipping, mixed precision when CUDA,
early stopping on val macro F1, best-on-val checkpoint. Same loop
runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
early_stopping_rounds on val mlogloss.
- training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
per-profile and per-host breakdown, paired-bootstrap significance
for model-vs-model gap. Confusion matrix uses union of seen labels.
- training/dashboard/producers/ — replay/metrics/perf/profiles
emitting the six event types the dashboard's awaiting scenes
consume; on-demand tensor extraction so the Pi can run live
inference without 65 GB of shards.
- 17 unit tests (split coverage, features round-trip, schema mismatch,
determinism, time-base alignment regression).
All six models were smoke-trained end to end on a 567-episode subset;
held-out test macro F1 is reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
186 lines
7.9 KiB
Python
186 lines
7.9 KiB
Python
"""Tests for training/_features.py — windowing + tensor extraction.
|
|
|
|
The feature extractor decides what every model sees. Bugs here are
|
|
the kind that are invisible until the model is wrong in production.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from training._features import (
|
|
ALL_CHANNELS, DEFAULT_STRIDE_S, DEFAULT_WINDOW_S, PHASE_TO_INT,
|
|
TENSOR_HZ, TENSOR_TIMESTEPS,
|
|
channel_arrays, episode_t0_wall_ns, summary_windows, tensor_windows,
|
|
)
|
|
|
|
|
|
class _FakeEpi:
    """Hand-built episode minimal enough to drive the extractor.

    Builds, entirely in memory, the attributes the tests in this module hand
    to ``summary_windows`` / ``tensor_windows``: phase ``labels``, the four
    telemetry row lists (``proc``, ``guest``, ``qmp``, ``netflow``), a
    ``meta`` dict and a few identifying fields.

    Timestamps:
        Every row carries ``t_wall_ns = _T0_WALL_NS + offset`` where the
        offset is the sample time in whole nanoseconds. The sum is done in
        integer arithmetic: at ~1.8e18 a float64 can only represent
        multiples of 256 ns, so adding the offset as a float would silently
        quantize the stamps for sample rates whose period is not a multiple
        of 256 ns.
        ``t_mono_ns`` is the same offset on ``labels`` and ``proc`` rows and
        is fixed at 0 on ``guest`` / ``qmp`` / ``netflow`` rows.

    Args:
        n_seconds: Episode length in seconds; each source gets
            ``int(n_seconds * hz)`` rows.
        hz_proc: Sample rate of the ``proc`` rows.
        hz_guest: Sample rate of the ``guest`` rows.
        hz_qmp: Sample rate of the ``qmp`` rows.
        hz_netflow: Sample rate of the ``netflow`` rows.
        phases: ``(start_seconds, phase_name)`` pairs. Defaults to clean,
            then ``infected_running`` at 10 s, then clean again at 25 s.
        cpu_user_constant: Amount added to the cumulative
            ``cpu_user_jiffies`` counter per second of episode time.
    """

    # Wall-clock origin of the synthetic episode in Unix nanoseconds
    # (~2026-04-30), so t_wall_ns has a realistic present-day magnitude.
    _T0_WALL_NS = 1_777_583_279_000_000_000

    def __init__(self, *, n_seconds: float = 30.0,
                 hz_proc: float = 10.0, hz_guest: float = 10.0,
                 hz_qmp: float = 1.0, hz_netflow: float = 10.0,
                 phases: list[tuple[float, str]] | None = None,
                 cpu_user_constant: float = 100.0):
        # Phases default: clean -> infected_running at 10s -> clean at 25s.
        if phases is None:
            phases = [(0.0, "clean"), (10.0, "infected_running"), (25.0, "clean")]

        self.episode_id = "test-episode"
        self.host_id = "test-host"
        self.has_done_marker = True
        self.has_pcap = False
        self.raw_files = []

        self.labels = [
            {"phase": name, "prev": None, "reason": "scheduled",
             "t_mono_ns": self._offset_ns(start_s),
             "t_wall_ns": self._wall_ns(start_s)}
            for start_s, name in phases
        ]
        self.events = []
        self.meta = {
            "result": {"duration_observed_s": n_seconds,
                       "phases_observed": [name for _, name in phases],
                       "rows_proc": self._n_rows(n_seconds, hz_proc),
                       "rows_guest": self._n_rows(n_seconds, hz_guest),
                       "rows_qmp": self._n_rows(n_seconds, hz_qmp),
                       "rows_netflow": self._n_rows(n_seconds, hz_netflow)},
            "sample": {"profile": "test", "name": "test-sample",
                       "kind": "synth", "sha256": None},
        }

        # proc rows: cpu_user_jiffies is a cumulative counter that grows by
        # cpu_user_constant per second (cpu_user_constant / hz_proc per row);
        # every other field is held constant.
        self.proc = []
        cum = 0.0
        for k in range(self._n_rows(n_seconds, hz_proc)):
            t_s = k / hz_proc
            cum += cpu_user_constant / hz_proc
            self.proc.append({
                "t_mono_ns": self._offset_ns(t_s),
                "t_wall_ns": self._wall_ns(t_s),
                "source": "host_proc", "available_in_deployment": False,
                "cpu_user_jiffies": cum, "cpu_sys_jiffies": 0,
                "rss_bytes": 1_000_000, "vsize_bytes": 2_000_000,
                "io_read_bytes": 0, "io_write_bytes": 0,
                "voluntary_ctxsw": 0, "involuntary_ctxsw": 0,
                "minor_faults": 0, "major_faults": 0,
            })

        # guest, qmp and netflow rows carry constant placeholder payloads
        # (apart from the guest "user" counter, which equals the row index).
        # Their t_mono_ns is left at 0; only t_wall_ns encodes sample time.
        self.guest = []
        for k in range(self._n_rows(n_seconds, hz_guest)):
            self.guest.append({
                "t_mono_ns": 0, "t_wall_ns": self._wall_ns(k / hz_guest),
                "source": "guest_agent", "available_in_deployment": True,
                "cpu_total_jiffies": {"user": k, "system": 0, "idle": 0,
                                      "iowait": 0, "softirq": 0},
                "load_1m_5m_15m": [0.1, 0.0, 0.0],
                "mem_total_bytes": 1, "mem_available_bytes": 1,
                "mem_buffers_bytes": 1, "mem_cached_bytes": 1, "swap_used_bytes": 0,
                "net": {"eth0": {"rx_bytes": 0, "tx_bytes": 0,
                                 "rx_pkts": 0, "tx_pkts": 0}},
                "listen_ports": [], "top_procs": [],
            })

        self.qmp = []
        for k in range(self._n_rows(n_seconds, hz_qmp)):
            self.qmp.append({
                "t_mono_ns": 0, "t_wall_ns": self._wall_ns(k / hz_qmp),
                "source": "host_qmp", "available_in_deployment": False,
                "vm_status": "running", "vm_running": True,
                "blockstats": {"virtio0": {"rd_ops": 0, "wr_ops": 0,
                                           "rd_bytes": 0, "wr_bytes": 0}},
                "kvm_stats": {"remote_tlb_flush": 0, "pages_4k": 0, "pages_2m": 0},
            })

        self.netflow = []
        for k in range(self._n_rows(n_seconds, hz_netflow)):
            self.netflow.append({
                "t_mono_ns": 0, "t_wall_ns": self._wall_ns(k / hz_netflow),
                "source": "bridge_pcap", "available_in_deployment": True,
                "bucket_ms": 100, "pkts_in": 0, "pkts_out": 0,
                "bytes_in": 0, "bytes_out": 0, "syn_count": 0, "fin_count": 0,
                "rst_count": 0, "udp_count": 0, "tcp_count": 0,
                "dns_query_count": 0, "unique_dst_ips": 0, "unique_dst_ports": 0,
                "tcp_new_flows": 0,
            })

    @staticmethod
    def _n_rows(n_seconds: float, hz: float) -> int:
        """Return how many rows a source sampled at ``hz`` gets."""
        return int(n_seconds * hz)

    @staticmethod
    def _offset_ns(t_s: float) -> int:
        """Convert an episode-relative time in seconds to whole nanoseconds."""
        # round() rather than int(): k / hz * 1e9 can land a hair below the
        # intended integer and truncation would then lose a nanosecond.
        return round(t_s * 1e9)

    @classmethod
    def _wall_ns(cls, t_s: float) -> int:
        """Return the Unix-ns wall-clock stamp for episode-relative ``t_s``."""
        # Integer addition keeps the result exact; see the class docstring.
        return cls._T0_WALL_NS + cls._offset_ns(t_s)
|
|
|
|
|
|
def test_summary_windows_shape():
    """Summary extraction on a 30 s episode yields five aligned windows."""
    episode = _FakeEpi(n_seconds=30.0)
    features, labels, times, info = summary_windows(episode)

    # 30 s of data cut into 10 s windows every 5 s: starts at 0, 5, 10, 15, 20.
    n_windows = 5
    # Five feature columns per registered channel (presumably one per
    # summary statistic; confirm against training/_features.py).
    n_columns = len(ALL_CHANNELS) * 5

    assert features.shape[:2] == (n_windows, n_columns)
    assert labels.shape == (n_windows,)
    assert times.shape == (n_windows,)
    assert info["episode_id"] == "test-episode"
|
|
|
|
|
|
def test_tensor_windows_shape():
    """Tensor extraction returns (windows, channels, timesteps) plus a mask."""
    episode = _FakeEpi(n_seconds=30.0)
    tensor, _labels, _times, mask, _info = tensor_windows(episode)

    expected_shape = (5, len(ALL_CHANNELS), TENSOR_TIMESTEPS)
    assert tensor.shape == expected_shape
    # The validity mask is element-wise aligned with the tensor.
    assert mask.shape == tensor.shape
    # The fake episode fills every source for its whole duration, so nearly
    # every mask entry is expected to be set.
    coverage = mask.mean()
    assert coverage > 0.95
|
|
|
|
|
|
def test_phase_label_at_window_center():
    """Each window is labelled with the phase active at its center."""
    schedule = [(0.0, "clean"), (10.0, "infected_running"), (25.0, "clean")]
    episode = _FakeEpi(n_seconds=30.0, phases=schedule)
    _, y, t, _ = summary_windows(episode)

    # Window centers fall at 5, 10, 15, 20 and 25 s. A center that lands
    # exactly on a transition (10 s, 25 s) takes the phase that starts there.
    expected_phases = [
        "clean",             # center  5 s
        "infected_running",  # center 10 s
        "infected_running",  # center 15 s
        "infected_running",  # center 20 s
        "clean",             # center 25 s, the second 'clean' phase
    ]
    for window, phase in enumerate(expected_phases):
        assert y[window] == PHASE_TO_INT[phase]
|
|
|
|
|
|
def test_counter_to_rate_constant_signal():
    """A cumulative counter growing by 100 jiffies per second should come
    out of the tensor extractor as a per-second rate of about 100."""
    episode = _FakeEpi(n_seconds=30.0, cpu_user_constant=100.0)
    tensor, _, _, mask, _ = tensor_windows(episode)

    target = "proc.cpu_user_jiffies"
    ch_idx = next(i for i, c in enumerate(ALL_CHANNELS) if c.name == target)

    # Keep only the timesteps the extractor marked valid for this channel.
    channel_values = tensor[:, ch_idx, :]
    channel_valid = mask[:, ch_idx, :]
    rates = channel_values[channel_valid]

    # Constant input rate, so the mean of the valid samples should sit
    # within 10% of 100.
    mean_rate = rates.mean()
    assert 90.0 < mean_rate < 110.0
|
|
|
|
|
|
def test_t_wall_ns_alignment_not_t_mono_ns():
    """Regression: netflow rows had different t_mono_ns semantics from
    proc/guest/qmp, so aligned output requires using t_wall_ns.

    Give every netflow row a bogus t_mono_ns while leaving t_wall_ns
    correct, then confirm the netflow channel is still mostly marked valid.
    """
    episode = _FakeEpi(n_seconds=30.0)

    # A value unrelated to the episode-relative offsets used by other rows.
    bogus_mono_ns = 1_777_543_932_511_943_778
    for row in episode.netflow:
        row["t_mono_ns"] = bogus_mono_ns

    _, _, _, mask, _ = tensor_windows(episode)

    # If alignment relied on t_mono_ns these rows would miss every window;
    # with t_wall_ns the channel keeps its coverage.
    target = "netflow.pkts_in"
    ch_idx = next(i for i, c in enumerate(ALL_CHANNELS) if c.name == target)
    coverage = mask[:, ch_idx, :].mean()
    assert coverage > 0.5
|
|
|
|
|
|
def test_no_labels_returns_empty():
    """An episode without phase labels produces zero windows in both modes."""
    episode = _FakeEpi()
    episode.labels = []

    summary_x, summary_y, _summary_t, _summary_info = summary_windows(episode)
    tensor_x, _tensor_y, _tensor_t, _tensor_mask, _tensor_info = tensor_windows(episode)

    # Summary path: neither features nor labels contain any rows.
    assert summary_x.shape[0] == 0
    assert summary_y.shape[0] == 0
    # Tensor path: no windows either.
    assert tensor_x.shape[0] == 0
|