# CIS490/training/trainer/_data.py

"""Dataset loaders for the trainer.

Two flavors matching the model input kinds:

  load_summary(...)   → SummaryData: X[N,F] float32, y[N] int64,
                        plus per-window metadata lists
                        from features_window_v1.parquet
  load_tensor(...)    → TensorData: X[N,C,T] float32, mask[N,C,T] bool,
                        y[N] int64, plus per-window metadata lists
                        from tensor_window shards (one .npz per episode)

Both return episode-level metadata (episode_id, host_id, profile,
sample_name) that the split machinery needs.

Tensor data can be huge (~12 GB at the full dataset). For this reason:

  - load_tensor_lazy() is the lazy mode (returns a generator over batches)
  - load_tensor(..., max_episodes=N) for smoke tests
  - the trainer can choose RAM vs disk based on data size
"""
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pyarrow.parquet as pq


@dataclass
class SummaryData:
    """Window-level summary features with labels and per-window metadata.

    This is the container load_summary() returns. The list-valued metadata
    fields hold one entry per row of ``X``, in the same row order.
    """
    X: np.ndarray              # (N, F) float32 feature matrix
    y: np.ndarray              # (N,) int64 labels, read from the "phase" column
    feature_names: list[str]   # names of the F columns of X, in column order
    episode_id: list[str]      # per-row values of the "episode_id" column
    host_id: list[str]         # per-row values of the "host_id" column
    profile: list[str]         # per-row values of the "profile" column
    sample_name: list[str]     # per-row values of the "sample_name" column
    # load_summary() fills this from the "t_center_s" column as float64;
    # it defaults to None for callers that build the container without it.
    t_center: np.ndarray | None = None


@dataclass
class TensorData:
    """Windowed tensor data with labels and per-window metadata.

    This is the container load_tensor() returns. Arrays are the per-shard
    arrays concatenated along axis 0; the list-valued metadata fields hold
    one entry per window, in the same order as the rows of ``X``.
    """
    X: np.ndarray              # (N, C, T) float32
    mask: np.ndarray           # (N, C, T) bool, aligned element-wise with X
    y: np.ndarray              # (N,) int64
    channel_names: list[str]   # taken from the first shard that is read
    episode_id: list[str]      # shard-level value repeated for each window
    host_id: list[str]         # shard-level value repeated for each window
    profile: list[str]         # shard-level value repeated for each window
    sample_name: list[str]     # shard-level value repeated for each window
    # load_tensor() fills this with the concatenated shard "t_center" arrays
    # cast to float64; it defaults to None when not provided.
    t_center: np.ndarray | None = None


def load_summary(window_parquet: Path, schema_path: Path) -> SummaryData:
    """Read the entire features_window_v1.parquet into RAM.

    For a multi-GB parquet on a small box, pass column subsets via
    pyarrow's dataset.dataset(...) instead. For this project we expect
    < 5 GB summary parquet which fits in 32 GB workstation RAM.

    Args:
        window_parquet: Path to the window-level feature parquet file.
        schema_path: Path to a JSON file whose ``"feature_names"`` entry
            lists the feature columns; their order defines the column
            order of ``X``.

    Returns:
        A SummaryData whose ``X`` is (N, F) float32, ``y`` is the
        ``phase`` column as int64, ``t_center`` is the ``t_center_s``
        column as float64, and whose metadata lists have one entry per row.

    Raises:
        KeyError: If the schema JSON has no ``"feature_names"`` entry.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    schema = json.loads(schema_path.read_text(encoding="utf-8"))
    feat_names = schema["feature_names"]
    meta_names = ["phase", "episode_id", "host_id",
                  "profile", "sample_name", "t_center_s"]
    tbl = pq.read_table(window_parquet, columns=feat_names + meta_names)

    def _column(name: str) -> np.ndarray:
        return tbl.column(name).to_numpy(zero_copy_only=False)

    # Fill a preallocated float32 matrix one column at a time. Stacking all
    # columns first and casting afterwards would materialise a full (N, F)
    # float64 matrix in addition to the float32 result.
    X = np.empty((tbl.num_rows, len(feat_names)), dtype=np.float32)
    for j, name in enumerate(feat_names):
        X[:, j] = _column(name)

    return SummaryData(
        X=X,
        y=_column("phase").astype(np.int64),
        feature_names=feat_names,
        episode_id=list(_column("episode_id")),
        host_id=list(_column("host_id")),
        profile=list(_column("profile")),
        sample_name=list(_column("sample_name")),
        t_center=_column("t_center_s").astype(np.float64),
    )


def load_tensor(shards_root: Path, *, max_episodes: int | None = None
                 ) -> TensorData:
    """Load all tensor shards into RAM as one big (N, C, T) array.

    Each shard is a .npz with keys:
      X, mask, y, t_center, episode_id, host_id, profile, sample_name,
      channel_names (only stored once per shard)

    For datasets larger than RAM, use load_tensor_lazy() instead.
    """
    paths = sorted(Path(shards_root).rglob("*.npz"))
    if max_episodes is not None:
        paths = paths[:max_episodes]
    if not paths:
        raise FileNotFoundError(f"no tensor shards under {shards_root}")

    Xs, Ms, ys = [], [], []
    epi_ids, hosts, profs, samples, centers = [], [], [], [], []
    channel_names: list[str] | None = None

    for p in paths:
        with np.load(p, allow_pickle=True) as f:
            if channel_names is None:
                channel_names = list(f["channel_names"])
            n_w = f["X"].shape[0]
            if n_w == 0:
                continue
            Xs.append(f["X"])
            Ms.append(f["mask"])
            ys.append(f["y"])
            centers.append(f["t_center"])
            # Each shard's metadata is per-episode (1 value broadcast over its
            # n_w windows).
            epi_ids.extend([str(f["episode_id"])] * n_w)
            hosts.extend([str(f["host_id"])] * n_w)
            profs.extend([str(f["profile"])] * n_w)
            samples.extend([str(f["sample_name"])] * n_w)

    X = np.concatenate(Xs, axis=0)
    M = np.concatenate(Ms, axis=0)
    y = np.concatenate(ys, axis=0).astype(np.int64)
    t = np.concatenate(centers, axis=0).astype(np.float64)
    return TensorData(
        X=X.astype(np.float32, copy=False),
        mask=M, y=y, channel_names=channel_names or [],
        episode_id=epi_ids, host_id=hosts, profile=profs, sample_name=samples,
        t_center=t,
    )