# CIS490/training/models/gru.py

"""Gated Recurrent Unit over channel × time windows.

Sees the window one timestep at a time and accumulates state. Cheaper
than LSTM, often comparable on short sequences. Last-step output → linear.
"""
from __future__ import annotations

from training.models import register
from training.models._torch_seq import _SeqBase


@register("gru")
class GRU(_SeqBase):
    """GRU sequence model, registered under the name ``"gru"``."""

    def _build_module(self, *, n_channels_in: int, n_timesteps: int,
                      n_classes: int, hidden: int = 128, n_layers: int = 2,
                      dropout: float = 0.1, bidirectional: bool = False):
        """Build the ``_GRUClassifier`` network for this model.

        Args:
            n_channels_in: Number of input channels (the GRU's input size).
            n_timesteps: Accepted but not used here; it is not forwarded
                to the classifier.
            n_classes: Number of output logits.
            hidden: Hidden size of each GRU layer.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout probability handed to the classifier.
            bidirectional: Whether to run the GRU in both time directions.

        Returns:
            A freshly constructed ``_GRUClassifier``.
        """
        # No function-local torch import: ``nn`` is already imported at module
        # level (it has to be, to define ``_GRUClassifier``), and this method
        # does not reference it.
        return _GRUClassifier(
            n_channels_in=n_channels_in,
            n_classes=n_classes,
            hidden=hidden,
            n_layers=n_layers,
            dropout=dropout,
            bidirectional=bidirectional,
        )


from torch import nn  # noqa: E402


class _GRUClassifier(nn.Module):
    """Stacked GRU followed by a dropout + linear classification head.

    Input is ``(B, C, T)``; output is ``(B, n_classes)`` logits.
    """

    def __init__(self, *, n_channels_in: int, n_classes: int, hidden: int,
                 n_layers: int, dropout: float, bidirectional: bool):
        """Create the recurrent encoder and the classification head.

        Args:
            n_channels_in: Feature size of each timestep (GRU input size).
            n_classes: Number of output logits.
            hidden: Hidden size per GRU layer and direction.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout probability. Used between GRU layers only when
                ``n_layers > 1`` (0.0 is passed otherwise), and always in
                front of the linear head.
            bidirectional: Run the GRU in both time directions; doubles the
                feature width fed to the head.
        """
        super().__init__()
        self.gru = nn.GRU(
            input_size=n_channels_in, hidden_size=hidden,
            num_layers=n_layers, dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True, bidirectional=bidirectional,
        )
        d_out = hidden * (2 if bidirectional else 1)
        self.head = nn.Sequential(nn.Dropout(dropout), nn.Linear(d_out, n_classes))

    def forward(self, x):                       # x: (B, C, T)
        """Encode the window and return class logits of shape ``(B, n_classes)``."""
        x = x.transpose(1, 2)                   # → (B, T, C)
        out, h_n = self.gru(x)                  # out: (B, T, hidden*dirs)
        if self.gru.bidirectional:
            # The backward direction finishes at t=0, so out[:, -1] would pair
            # the forward final state with a backward state that has seen only
            # one timestep. h_n is (layers*dirs, B, hidden); its last two
            # entries are the top layer's forward and backward final states.
            feats = h_n[-2:].transpose(0, 1).flatten(1)   # (B, 2*hidden)
        else:
            feats = out[:, -1, :]               # last timestep
        return self.head(feats)