"""Tests for training/_features.py — windowing + tensor extraction. The feature extractor decides what every model sees. Bugs here are the kind that are invisible until the model is wrong in production. """ from __future__ import annotations import json import numpy as np import pytest from training._features import ( ALL_CHANNELS, DEFAULT_STRIDE_S, DEFAULT_WINDOW_S, PHASE_TO_INT, TENSOR_HZ, TENSOR_TIMESTEPS, channel_arrays, episode_t0_wall_ns, summary_windows, tensor_windows, ) class _FakeEpi: """Hand-built episode minimal enough to drive the extractor.""" def __init__(self, *, n_seconds: float = 30.0, hz_proc: float = 10.0, hz_guest: float = 10.0, hz_qmp: float = 1.0, hz_netflow: float = 10.0, phases: list[tuple[float, str]] | None = None, cpu_user_constant: float = 100.0): # Phases default: clean → infected_running at 10s → clean at 25s if phases is None: phases = [(0.0, "clean"), (10.0, "infected_running"), (25.0, "clean")] self.episode_id = "test-episode" self.host_id = "test-host" self.has_done_marker = True self.has_pcap = False self.raw_files = [] # Choose a recent t0 so the wall_ns values don't overflow assumptions t0_wall = 1_777_583_279_000_000_000 # ~2026-04-30 self.labels = [ {"phase": p, "prev": None, "reason": "scheduled", "t_mono_ns": int(t * 1e9), "t_wall_ns": int(t0_wall + t * 1e9)} for t, p in phases ] self.events = [] self.meta = { "result": {"duration_observed_s": n_seconds, "phases_observed": [p for _, p in phases], "rows_proc": int(n_seconds * hz_proc), "rows_guest": int(n_seconds * hz_guest), "rows_qmp": int(n_seconds * hz_qmp), "rows_netflow": int(n_seconds * hz_netflow)}, "sample": {"profile": "test", "name": "test-sample", "kind": "synth", "sha256": None}, } # Build proc rows (counter for cpu_user; instantaneous values # would be cumulative jiffies) self.proc = [] cum = 0.0 for k in range(int(n_seconds * hz_proc)): t_s = k / hz_proc cum += cpu_user_constant / hz_proc self.proc.append({ "t_mono_ns": int(t_s * 1e9), "t_wall_ns": int(t0_wall + t_s * 1e9), "source": "host_proc", "available_in_deployment": False, "cpu_user_jiffies": cum, "cpu_sys_jiffies": 0, "rss_bytes": 1_000_000, "vsize_bytes": 2_000_000, "io_read_bytes": 0, "io_write_bytes": 0, "voluntary_ctxsw": 0, "involuntary_ctxsw": 0, "minor_faults": 0, "major_faults": 0, }) # guest, qmp, netflow rows — empty bodies are fine, every getter returns None self.guest = [] for k in range(int(n_seconds * hz_guest)): t_s = k / hz_guest self.guest.append({ "t_mono_ns": 0, "t_wall_ns": int(t0_wall + t_s * 1e9), "source": "guest_agent", "available_in_deployment": True, "cpu_total_jiffies": {"user": k, "system": 0, "idle": 0, "iowait": 0, "softirq": 0}, "load_1m_5m_15m": [0.1, 0.0, 0.0], "mem_total_bytes": 1, "mem_available_bytes": 1, "mem_buffers_bytes": 1, "mem_cached_bytes": 1, "swap_used_bytes": 0, "net": {"eth0": {"rx_bytes": 0, "tx_bytes": 0, "rx_pkts": 0, "tx_pkts": 0}}, "listen_ports": [], "top_procs": [], }) self.qmp = [] for k in range(int(n_seconds * hz_qmp)): t_s = k / hz_qmp self.qmp.append({ "t_mono_ns": 0, "t_wall_ns": int(t0_wall + t_s * 1e9), "source": "host_qmp", "available_in_deployment": False, "vm_status": "running", "vm_running": True, "blockstats": {"virtio0": {"rd_ops": 0, "wr_ops": 0, "rd_bytes": 0, "wr_bytes": 0}}, "kvm_stats": {"remote_tlb_flush": 0, "pages_4k": 0, "pages_2m": 0}, }) self.netflow = [] for k in range(int(n_seconds * hz_netflow)): t_s = k / hz_netflow self.netflow.append({ "t_mono_ns": 0, "t_wall_ns": int(t0_wall + t_s * 1e9), "source": "bridge_pcap", "available_in_deployment": True, "bucket_ms": 100, "pkts_in": 0, "pkts_out": 0, "bytes_in": 0, "bytes_out": 0, "syn_count": 0, "fin_count": 0, "rst_count": 0, "udp_count": 0, "tcp_count": 0, "dns_query_count": 0, "unique_dst_ips": 0, "unique_dst_ports": 0, "tcp_new_flows": 0, }) def test_summary_windows_shape(): epi = _FakeEpi(n_seconds=30.0) X, y, t, info = summary_windows(epi) # 30s episode, 10s window, 5s stride → 5 windows starting at 0,5,10,15,20 assert X.shape[0] == 5 assert X.shape[1] == len(ALL_CHANNELS) * 5 assert y.shape == (5,) assert t.shape == (5,) assert info["episode_id"] == "test-episode" def test_tensor_windows_shape(): epi = _FakeEpi(n_seconds=30.0) X, y, t, M, info = tensor_windows(epi) assert X.shape == (5, len(ALL_CHANNELS), TENSOR_TIMESTEPS) assert M.shape == X.shape # All host-side channels should have data; mask should be ~all-True assert M.mean() > 0.95 def test_phase_label_at_window_center(): """Window centered on infected_running gets that label, not 'clean'.""" epi = _FakeEpi(n_seconds=30.0, phases=[(0.0, "clean"), (10.0, "infected_running"), (25.0, "clean")]) _, y, t, _ = summary_windows(epi) # Window centers: 5, 10, 15, 20, 25 # phase_at(t=5) → clean (idx 0) # phase_at(t=10) → infected_running (idx 3) # phase_at(t=15) → infected_running (idx 3) # phase_at(t=20) → infected_running (idx 3) # phase_at(t=25) → clean (idx 0) — second 'clean' assert y[0] == PHASE_TO_INT["clean"] assert y[1] == PHASE_TO_INT["infected_running"] assert y[2] == PHASE_TO_INT["infected_running"] assert y[3] == PHASE_TO_INT["infected_running"] assert y[4] == PHASE_TO_INT["clean"] def test_counter_to_rate_constant_signal(): """A counter incrementing by 100 jiffies per second should yield a per-second rate of 100 in the resulting tensor.""" epi = _FakeEpi(n_seconds=30.0, cpu_user_constant=100.0) X, _, _, M, _ = tensor_windows(epi) ch_idx = next(i for i, c in enumerate(ALL_CHANNELS) if c.name == "proc.cpu_user_jiffies") valid = M[:, ch_idx, :] # Mean of valid points should be ~100 (constant rate) rates = X[:, ch_idx, :][valid] assert 90.0 < rates.mean() < 110.0 def test_t_wall_ns_alignment_not_t_mono_ns(): """Regression: netflow rows had different t_mono_ns semantics from proc/guest/qmp. Producing aligned output requires using t_wall_ns. Inject a netflow row with bogus t_mono_ns but correct t_wall_ns; confirm it shows up at the right window.""" epi = _FakeEpi(n_seconds=30.0) # Override the netflow rows to have intentionally garbage t_mono_ns for r in epi.netflow: r["t_mono_ns"] = 1_777_543_932_511_943_778 # boot-uptime-ish X, _, _, M, _ = tensor_windows(epi) # netflow channels should still be valid for most timesteps because # the extractor uses t_wall_ns ch_idx = next(i for i, c in enumerate(ALL_CHANNELS) if c.name == "netflow.pkts_in") assert M[:, ch_idx, :].mean() > 0.5 def test_no_labels_returns_empty(): epi = _FakeEpi() epi.labels = [] Xs, ys, ts, info = summary_windows(epi) Xt, yt, tt, mt, infot = tensor_windows(epi) assert Xs.shape[0] == 0 and ys.shape[0] == 0 assert Xt.shape[0] == 0