CIS490/tests/test_training_features.py
Max 1fabd4a246 training: validator, feature/tensor extractors, 6 supervised models, schema-hashed checkpoints, eval suite, dashboard producers
The model layer of the project, built honestly:

  - tools/dataset_validate.py — full-sweep validator over the receiver
    store (sha256, schema, monotonic labels, telemetry-row gate). On the
    current corpus: 64,798 accepted + 8,154 degraded + 3,701 rejected +
    7 errored across 76,660 shipped episodes. data/processed/validation_v1.parquet
    is committed as the per-episode acceptance index.

  - training/_features.py — channel registry (46 channels across
    proc/guest/qmp/netflow), summary-stat windowing AND channel×time
    tensor extraction at 10s/5s windowing. Time alignment uses t_wall_ns
    (Unix ns) — tested fix for a real netflow-vs-host clock-base
    inconsistency that was silently dropping every netflow channel.

  - training/_split.py — three held-out recipes (host / sample / time)
    with profile-stratification assertions. held_out_host carries
    untested_profiles for cases like scan-and-dial absent from the test
    host (5 of 6 profiles tested cross-device, never silently averaged).

  - training/models/ — 6 architectures behind a common BaseModel
    interface: gbt (XGBoost), mlp, cnn, gru, lstm, transformer. Each
    trained twice (realistic / oracle) per the deployment threat model.
    Schema-hashed checkpoints refuse to load if _features.py changed
    since training (silent-input-drift protection, tested).

  - training/trainer/ — unified training loop: class-weighted CE, LR
    warmup + cosine, gradient clipping, mixed precision when CUDA,
    early stopping on val macro F1, best-on-val checkpoint. Same loop
    runs MLP/CNN/GRU/LSTM/Transformer; GBT uses XGBoost
    early_stopping_rounds on val mlogloss.

  - training/eval_/ — bootstrap 95% CIs on macro F1, per-class F1,
    per-profile and per-host breakdown, paired-bootstrap significance
    for model-vs-model gap. Confusion matrix uses union of seen labels.

  - training/dashboard/producers/ — replay/metrics/perf/profiles
    emitting the six event types the dashboard's awaiting scenes
    consume; on-demand tensor extraction so the Pi can run live
    inference without 65 GB of shards.

  - 17 unit tests (split coverage, features round-trip, schema mismatch,
    determinism, time-base alignment regression).

End-to-end smoke-trained all six on a 567-episode subset; held-out
test macro F1 reported with paired-bootstrap significance. The
methodology now reports honest cross-device generalization, not
in-distribution validation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 01:19:00 -05:00

186 lines
7.9 KiB
Python

"""Tests for training/_features.py — windowing + tensor extraction.
The feature extractor decides what every model sees. Bugs here are
the kind that are invisible until the model is wrong in production.
"""
from __future__ import annotations
import json
import numpy as np
import pytest
from training._features import (
ALL_CHANNELS, DEFAULT_STRIDE_S, DEFAULT_WINDOW_S, PHASE_TO_INT,
TENSOR_HZ, TENSOR_TIMESTEPS,
channel_arrays, episode_t0_wall_ns, summary_windows, tensor_windows,
)
class _FakeEpi:
    """Hand-built episode minimal enough to drive the extractor.

    Populates ``labels``, ``proc``, ``guest``, ``qmp`` and ``netflow`` row
    lists plus a ``meta`` dict, all stamped relative to one fixed wall-clock
    origin. Only ``proc`` rows and labels carry a meaningful ``t_mono_ns``;
    guest/qmp/netflow rows set it to 0, so ``t_wall_ns`` is the only time
    base shared by every stream.

    Args:
        n_seconds: Episode duration in seconds. Each stream receives
            ``int(n_seconds * hz)`` rows.
        hz_proc: Sample rate (rows per second) of the ``proc`` stream.
        hz_guest: Sample rate of the ``guest`` stream.
        hz_qmp: Sample rate of the ``qmp`` stream.
        hz_netflow: Sample rate of the ``netflow`` stream.
        phases: ``(offset_seconds, phase_name)`` transitions. Defaults to
            clean -> infected_running at 10 s -> clean at 25 s.
        cpu_user_constant: Amount the cumulative ``cpu_user_jiffies``
            counter grows per second of episode time.
    """

    @staticmethod
    def _offset_ns(t_s: float) -> int:
        """Return the offset ``t_s`` (seconds) as whole nanoseconds, rounded to nearest."""
        return round(t_s * 1e9)

    def __init__(self, *, n_seconds: float = 30.0,
                 hz_proc: float = 10.0, hz_guest: float = 10.0,
                 hz_qmp: float = 1.0, hz_netflow: float = 10.0,
                 phases: list[tuple[float, str]] | None = None,
                 cpu_user_constant: float = 100.0):
        # Phases default: clean → infected_running at 10s → clean at 25s
        if phases is None:
            phases = [(0.0, "clean"), (10.0, "infected_running"), (25.0, "clean")]

        self.episode_id = "test-episode"
        self.host_id = "test-host"
        self.has_done_marker = True
        self.has_pcap = False
        self.raw_files = []

        # Recent wall-clock origin in Unix nanoseconds (~2026-04-30).
        t0_wall = 1_777_583_279_000_000_000
        ns = self._offset_ns

        def wall_ns(t_s: float) -> int:
            # Add the offset in integer arithmetic: at ~1.78e18 adjacent
            # float64 values are 256 ns apart, so a float sum would snap
            # every timestamp onto a 256 ns grid for rates whose period is
            # not a multiple of 256 ns.
            return t0_wall + ns(t_s)

        self.labels = [
            {"phase": p, "prev": None, "reason": "scheduled",
             "t_mono_ns": ns(t), "t_wall_ns": wall_ns(t)}
            for t, p in phases
        ]
        self.events = []
        self.meta = {
            "result": {"duration_observed_s": n_seconds,
                       "phases_observed": [p for _, p in phases],
                       "rows_proc": int(n_seconds * hz_proc),
                       "rows_guest": int(n_seconds * hz_guest),
                       "rows_qmp": int(n_seconds * hz_qmp),
                       "rows_netflow": int(n_seconds * hz_netflow)},
            "sample": {"profile": "test", "name": "test-sample",
                       "kind": "synth", "sha256": None},
        }

        # proc rows: cpu_user_jiffies is a cumulative counter that grows by
        # cpu_user_constant per second; every other field is constant.
        self.proc = []
        cum = 0.0
        for k in range(int(n_seconds * hz_proc)):
            t_s = k / hz_proc
            cum += cpu_user_constant / hz_proc
            self.proc.append({
                "t_mono_ns": ns(t_s), "t_wall_ns": wall_ns(t_s),
                "source": "host_proc", "available_in_deployment": False,
                "cpu_user_jiffies": cum, "cpu_sys_jiffies": 0,
                "rss_bytes": 1_000_000, "vsize_bytes": 2_000_000,
                "io_read_bytes": 0, "io_write_bytes": 0,
                "voluntary_ctxsw": 0, "involuntary_ctxsw": 0,
                "minor_faults": 0, "major_faults": 0,
            })

        # guest, qmp and netflow rows carry constant/zero payloads and
        # t_mono_ns == 0; only their t_wall_ns places them in time.
        self.guest = []
        for k in range(int(n_seconds * hz_guest)):
            t_s = k / hz_guest
            self.guest.append({
                "t_mono_ns": 0, "t_wall_ns": wall_ns(t_s),
                "source": "guest_agent", "available_in_deployment": True,
                "cpu_total_jiffies": {"user": k, "system": 0, "idle": 0,
                                      "iowait": 0, "softirq": 0},
                "load_1m_5m_15m": [0.1, 0.0, 0.0],
                "mem_total_bytes": 1, "mem_available_bytes": 1,
                "mem_buffers_bytes": 1, "mem_cached_bytes": 1, "swap_used_bytes": 0,
                "net": {"eth0": {"rx_bytes": 0, "tx_bytes": 0,
                                 "rx_pkts": 0, "tx_pkts": 0}},
                "listen_ports": [], "top_procs": [],
            })

        self.qmp = []
        for k in range(int(n_seconds * hz_qmp)):
            t_s = k / hz_qmp
            self.qmp.append({
                "t_mono_ns": 0, "t_wall_ns": wall_ns(t_s),
                "source": "host_qmp", "available_in_deployment": False,
                "vm_status": "running", "vm_running": True,
                "blockstats": {"virtio0": {"rd_ops": 0, "wr_ops": 0,
                                           "rd_bytes": 0, "wr_bytes": 0}},
                "kvm_stats": {"remote_tlb_flush": 0, "pages_4k": 0, "pages_2m": 0},
            })

        self.netflow = []
        for k in range(int(n_seconds * hz_netflow)):
            t_s = k / hz_netflow
            self.netflow.append({
                "t_mono_ns": 0, "t_wall_ns": wall_ns(t_s),
                "source": "bridge_pcap", "available_in_deployment": True,
                "bucket_ms": 100, "pkts_in": 0, "pkts_out": 0,
                "bytes_in": 0, "bytes_out": 0, "syn_count": 0, "fin_count": 0,
                "rst_count": 0, "udp_count": 0, "tcp_count": 0,
                "dns_query_count": 0, "unique_dst_ips": 0, "unique_dst_ports": 0,
                "tcp_new_flows": 0,
            })
def test_summary_windows_shape():
    """Summary extraction on a 30 s episode returns five consistently shaped windows."""
    episode = _FakeEpi(n_seconds=30.0)
    features, labels, times, meta = summary_windows(episode)

    # 30 s of data, 10 s windows, 5 s stride → window starts 0, 5, 10, 15, 20.
    n_windows = 5
    # Five columns per channel — presumably one per summary statistic;
    # confirm against training/_features.py.
    n_columns = len(ALL_CHANNELS) * 5

    assert features.shape[0] == n_windows
    assert features.shape[1] == n_columns
    assert labels.shape == (n_windows,)
    assert times.shape == (n_windows,)
    assert meta["episode_id"] == "test-episode"
def test_tensor_windows_shape():
    """Tensor extraction yields (windows, channels, timesteps) with a matching mask."""
    episode = _FakeEpi(n_seconds=30.0)
    tensor, _labels, _times, mask, _meta = tensor_windows(episode)

    expected_shape = (5, len(ALL_CHANNELS), TENSOR_TIMESTEPS)
    assert tensor.shape == expected_shape
    assert mask.shape == tensor.shape

    # The fake episode populates every stream, so nearly every mask entry
    # is expected to be True.
    valid_fraction = mask.mean()
    assert valid_fraction > 0.95
def test_phase_label_at_window_center():
    """Each window is labelled with the phase active at its center.

    With transitions at 0 s (clean), 10 s (infected_running) and 25 s
    (clean), the windows centered at 5, 10, 15, 20 and 25 s are expected to
    read clean, infected_running x3, then clean again.
    """
    schedule = [(0.0, "clean"), (10.0, "infected_running"), (25.0, "clean")]
    episode = _FakeEpi(n_seconds=30.0, phases=schedule)
    _, labels, _centers, _ = summary_windows(episode)

    expected_phases = (
        "clean",             # center 5 s
        "infected_running",  # center 10 s
        "infected_running",  # center 15 s
        "infected_running",  # center 20 s
        "clean",             # center 25 s — the second 'clean'
    )
    for window, phase in enumerate(expected_phases):
        assert labels[window] == PHASE_TO_INT[phase]
def test_counter_to_rate_constant_signal():
    """A counter growing by 100 jiffies per second should come out of the
    tensor extractor as a per-second rate of roughly 100."""
    episode = _FakeEpi(n_seconds=30.0, cpu_user_constant=100.0)
    tensor, _, _, mask, _ = tensor_windows(episode)

    cpu_user = next(
        pos for pos, channel in enumerate(ALL_CHANNELS)
        if channel.name == "proc.cpu_user_jiffies"
    )

    # Average only over timesteps the mask marks as valid.
    channel_mask = mask[:, cpu_user, :]
    channel_values = tensor[:, cpu_user, :]
    mean_rate = channel_values[channel_mask].mean()
    assert 90.0 < mean_rate < 110.0
def test_t_wall_ns_alignment_not_t_mono_ns():
    """Regression: netflow rows used a different t_mono_ns base than
    proc/guest/qmp, so alignment has to rely on t_wall_ns.

    Every netflow row gets a nonsensical t_mono_ns while keeping its correct
    t_wall_ns; the netflow channel must still land in the right windows.
    """
    episode = _FakeEpi(n_seconds=30.0)

    # Deliberately garbage monotonic stamp (boot-uptime-ish magnitude).
    bogus_mono_ns = 1_777_543_932_511_943_778
    for row in episode.netflow:
        row["t_mono_ns"] = bogus_mono_ns

    _, _, _, mask, _ = tensor_windows(episode)

    pkts_in = next(
        pos for pos, channel in enumerate(ALL_CHANNELS)
        if channel.name == "netflow.pkts_in"
    )
    # Most timesteps should remain valid because alignment uses t_wall_ns.
    valid_fraction = mask[:, pkts_in, :].mean()
    assert valid_fraction > 0.5
def test_no_labels_returns_empty():
    """An episode without labels produces zero windows from both extractors."""
    episode = _FakeEpi()
    episode.labels = []

    # Run both extractors before asserting anything, so a crash in either
    # one surfaces ahead of any shape check.
    summary_X, summary_y, _, _ = summary_windows(episode)
    tensor_X, _, _, _, _ = tensor_windows(episode)

    assert summary_X.shape[0] == 0
    assert summary_y.shape[0] == 0
    assert tensor_X.shape[0] == 0