# CIS490/tests/test_training_features.py

"""Tests for training/_features.py — windowing + tensor extraction.

The feature extractor decides what every model sees. Bugs here are
the kind that are invisible until the model is wrong in production.
"""
from __future__ import annotations

import json

import numpy as np
import pytest

from training._features import (
    ALL_CHANNELS, DEFAULT_STRIDE_S, DEFAULT_WINDOW_S, PHASE_TO_INT,
    TENSOR_HZ, TENSOR_TIMESTEPS,
    channel_arrays, episode_t0_wall_ns, summary_windows, tensor_windows,
)


class _FakeEpi:
    """Synthetic in-memory episode, just detailed enough to feed the extractor.

    All data is generated in ``__init__`` and stored as plain attributes:
    ``labels`` (phase transitions), ``proc`` / ``guest`` / ``qmp`` /
    ``netflow`` (lists of per-sample row dicts), ``meta`` (row counts and
    sample description) plus a handful of fixed identity fields.

    Keyword-only parameters:
        n_seconds: simulated episode length in seconds.
        hz_proc, hz_guest, hz_qmp, hz_netflow: samples per second for each
            stream; each stream gets ``int(n_seconds * hz)`` rows.
        phases: ``(start_second, phase_name)`` pairs; ``None`` selects the
            default clean -> infected_running -> clean timeline.
        cpu_user_constant: growth of the proc ``cpu_user_jiffies`` counter
            per simulated second.
    """
    def __init__(self, *, n_seconds: float = 30.0,
                 hz_proc: float = 10.0, hz_guest: float = 10.0,
                 hz_qmp: float = 1.0, hz_netflow: float = 10.0,
                 phases: list[tuple[float, str]] | None = None,
                 cpu_user_constant: float = 100.0):
        if phases is None:
            # Default timeline: clean, infected_running from 10 s,
            # back to clean from 25 s.
            phases = [(0.0, "clean"), (10.0, "infected_running"), (25.0, "clean")]

        # Wall-clock origin in nanoseconds; a recent date (~2026-04-30).
        t0_wall = 1_777_583_279_000_000_000

        def wall_ns(offset_s):
            # Absolute wall-clock nanoseconds for an offset given in seconds.
            return int(t0_wall + offset_s * 1e9)

        # Row counts are shared between ``meta`` and the generators below.
        n_proc = int(n_seconds * hz_proc)
        n_guest = int(n_seconds * hz_guest)
        n_qmp = int(n_seconds * hz_qmp)
        n_netflow = int(n_seconds * hz_netflow)

        self.episode_id = "test-episode"
        self.host_id = "test-host"
        self.has_done_marker = True
        self.has_pcap = False
        self.raw_files = []
        self.labels = [
            {"phase": phase_name, "prev": None, "reason": "scheduled",
             "t_mono_ns": int(start_s * 1e9), "t_wall_ns": wall_ns(start_s)}
            for start_s, phase_name in phases
        ]
        self.events = []
        self.meta = {
            "result": {"duration_observed_s": n_seconds,
                       "phases_observed": [name for _, name in phases],
                       "rows_proc": n_proc,
                       "rows_guest": n_guest,
                       "rows_qmp": n_qmp,
                       "rows_netflow": n_netflow},
            "sample": {"profile": "test", "name": "test-sample",
                       "kind": "synth", "sha256": None},
        }

        # proc: cpu_user_jiffies is a running total that grows by
        # cpu_user_constant per simulated second; every other field is
        # constant. These are the only rows with a meaningful t_mono_ns.
        proc_rows = []
        user_jiffies = 0.0
        for tick in range(n_proc):
            offset_s = tick / hz_proc
            user_jiffies += cpu_user_constant / hz_proc
            proc_rows.append({
                "t_mono_ns": int(offset_s * 1e9), "t_wall_ns": wall_ns(offset_s),
                "source": "host_proc", "available_in_deployment": False,
                "cpu_user_jiffies": user_jiffies, "cpu_sys_jiffies": 0,
                "rss_bytes": 1_000_000, "vsize_bytes": 2_000_000,
                "io_read_bytes": 0, "io_write_bytes": 0,
                "voluntary_ctxsw": 0, "involuntary_ctxsw": 0,
                "minor_faults": 0, "major_faults": 0,
            })
        self.proc = proc_rows

        # guest / qmp / netflow: t_mono_ns is pinned to 0 and only
        # t_wall_ns advances. Payloads are constant, except the guest
        # "user" jiffies which equal the sample index.
        self.guest = [
            {
                "t_mono_ns": 0, "t_wall_ns": wall_ns(tick / hz_guest),
                "source": "guest_agent", "available_in_deployment": True,
                "cpu_total_jiffies": {"user": tick, "system": 0, "idle": 0,
                                       "iowait": 0, "softirq": 0},
                "load_1m_5m_15m": [0.1, 0.0, 0.0],
                "mem_total_bytes": 1, "mem_available_bytes": 1,
                "mem_buffers_bytes": 1, "mem_cached_bytes": 1, "swap_used_bytes": 0,
                "net": {"eth0": {"rx_bytes": 0, "tx_bytes": 0,
                                  "rx_pkts": 0, "tx_pkts": 0}},
                "listen_ports": [], "top_procs": [],
            }
            for tick in range(n_guest)
        ]
        self.qmp = [
            {
                "t_mono_ns": 0, "t_wall_ns": wall_ns(tick / hz_qmp),
                "source": "host_qmp", "available_in_deployment": False,
                "vm_status": "running", "vm_running": True,
                "blockstats": {"virtio0": {"rd_ops": 0, "wr_ops": 0,
                                            "rd_bytes": 0, "wr_bytes": 0}},
                "kvm_stats": {"remote_tlb_flush": 0, "pages_4k": 0, "pages_2m": 0},
            }
            for tick in range(n_qmp)
        ]
        self.netflow = [
            {
                "t_mono_ns": 0, "t_wall_ns": wall_ns(tick / hz_netflow),
                "source": "bridge_pcap", "available_in_deployment": True,
                "bucket_ms": 100, "pkts_in": 0, "pkts_out": 0,
                "bytes_in": 0, "bytes_out": 0, "syn_count": 0, "fin_count": 0,
                "rst_count": 0, "udp_count": 0, "tcp_count": 0,
                "dns_query_count": 0, "unique_dst_ips": 0, "unique_dst_ports": 0,
                "tcp_new_flows": 0,
            }
            for tick in range(n_netflow)
        ]


def test_summary_windows_shape():
    """summary_windows returns one row per window and five columns per channel."""
    fake_episode = _FakeEpi(n_seconds=30.0)
    features, labels, times, info = summary_windows(fake_episode)
    # Expectation encoded by this test: a 30 s episode cut into 10 s windows
    # at a 5 s stride gives windows starting at 0, 5, 10, 15, 20.
    # NOTE(review): assumes those are the extractor defaults — confirm
    # against training/_features.py.
    expected_windows = 5
    # Five summary values per channel (presumably per-window statistics).
    expected_columns = len(ALL_CHANNELS) * 5
    assert features.shape[0] == expected_windows
    assert features.shape[1] == expected_columns
    assert labels.shape == (expected_windows,)
    assert times.shape == (expected_windows,)
    assert info["episode_id"] == "test-episode"


def test_tensor_windows_shape():
    """tensor_windows returns a (windows, channels, timesteps) tensor and a same-shaped mask."""
    fake_episode = _FakeEpi(n_seconds=30.0)
    tensor, _, _, mask, _ = tensor_windows(fake_episode)
    expected_shape = (5, len(ALL_CHANNELS), TENSOR_TIMESTEPS)
    assert tensor.shape == expected_shape
    assert mask.shape == tensor.shape
    # The fake episode populates every stream, so nearly all of the mask
    # is expected to be set.
    coverage = mask.mean()
    assert coverage > 0.95


def test_phase_label_at_window_center():
    """Each window is labelled with the phase in effect at its centre.

    A window centred inside infected_running must carry that label, not
    'clean'.
    """
    timeline = [(0.0, "clean"), (10.0, "infected_running"), (25.0, "clean")]
    epi = _FakeEpi(n_seconds=30.0, phases=timeline)
    _, y, _, _ = summary_windows(epi)
    # Expected window centres are 5, 10, 15, 20 and 25 s. A centre that sits
    # exactly on a transition (10 s, 25 s) takes the phase that starts there.
    expected_phases = (
        "clean",             # centre  5 s
        "infected_running",  # centre 10 s
        "infected_running",  # centre 15 s
        "infected_running",  # centre 20 s
        "clean",             # centre 25 s, the second 'clean' phase
    )
    for window_idx, phase_name in enumerate(expected_phases):
        assert y[window_idx] == PHASE_TO_INT[phase_name]


def test_counter_to_rate_constant_signal():
    """A counter growing by 100 jiffies per second must come out of the
    tensor extractor as a per-second rate of roughly 100."""
    fake_episode = _FakeEpi(n_seconds=30.0, cpu_user_constant=100.0)
    tensor, _, _, mask, _ = tensor_windows(fake_episode)
    cpu_user_idx = next(
        pos for pos, channel in enumerate(ALL_CHANNELS)
        if channel.name == "proc.cpu_user_jiffies"
    )
    channel_mask = mask[:, cpu_user_idx, :]
    channel_values = tensor[:, cpu_user_idx, :]
    # Average only over the entries the mask selects; the constant input
    # rate should put that mean close to 100.
    mean_rate = channel_values[channel_mask].mean()
    assert 90.0 < mean_rate < 110.0


def test_t_wall_ns_alignment_not_t_mono_ns():
    """Regression: netflow rows had different t_mono_ns semantics from
    proc/guest/qmp, so aligned output requires using t_wall_ns.

    Every netflow row is given a bogus t_mono_ns while its t_wall_ns stays
    correct; the netflow channel must still be populated in the windows."""
    bogus_mono_ns = 1_777_543_932_511_943_778   # boot-uptime-ish
    fake_episode = _FakeEpi(n_seconds=30.0)
    # Corrupt only the monotonic timestamps; t_wall_ns is left untouched.
    for row in fake_episode.netflow:
        row["t_mono_ns"] = bogus_mono_ns
    _, _, _, mask, _ = tensor_windows(fake_episode)
    pkts_in_idx = next(
        pos for pos, channel in enumerate(ALL_CHANNELS)
        if channel.name == "netflow.pkts_in"
    )
    # If the extractor aligns on t_wall_ns, most timesteps of the netflow
    # channel remain valid despite the garbage t_mono_ns.
    valid_fraction = mask[:, pkts_in_idx, :].mean()
    assert valid_fraction > 0.5


def test_no_labels_returns_empty():
    """With no phase labels, both extractors must return zero windows."""
    unlabeled = _FakeEpi()
    unlabeled.labels = []
    summary_X, summary_y, _, _ = summary_windows(unlabeled)
    tensor_X, _, _, _, _ = tensor_windows(unlabeled)
    assert summary_X.shape[0] == 0
    assert summary_y.shape[0] == 0
    assert tensor_X.shape[0] == 0