diff --git a/references/links.md b/references/links.md
index 14a2e5a..a7f393e 100644
--- a/references/links.md
+++ b/references/links.md
@@ -1,6 +1,10 @@
 # Reference Links
 
-- https://github.com/mitre/caldera?tab=security-ov-file
-- https://github.com/PiyushxJangid/DLHIDS
-- https://github.com/ArpanDFrank/Host-Intrusion-Detection-System-using-Hybrid-CNN-LSTM-Models-and-RL-Actor-Critic-Models
+- https://github.com/mitre/caldera?tab=security-ov-file — adversary emulation framework (red-team/blue-team)
+- https://github.com/PiyushxJangid/DLHIDS — deep-learning HIDS reference implementation
+- https://github.com/ArpanDFrank/Host-Intrusion-Detection-System-using-Hybrid-CNN-LSTM-Models-and-RL-Actor-Critic-Models — Hybrid CNN-LSTM + RL HIDS prior art
 - https://ieeexplore.ieee.org/document/9881803 — per-device trust establishment from network behaviour (cited on motivation scene)
+- https://pytorch.org/docs/stable/index.html — PyTorch reference (LSTM / GRU / CNN / Transformer module APIs used by the model zoo)
+- https://xgboost.readthedocs.io/en/stable/ — XGBoost reference (gradient-boosted-trees baseline; Chen & Guestrin, KDD 2016)
+- https://scikit-learn.org/stable/ — scikit-learn reference (KNN, KMeans, PCA, evaluation metrics)
+- https://man7.org/linux/man-pages/man5/proc.5.html — proc(5) — the Linux kernel interface this project's telemetry comes from
diff --git a/training/dashboard/static/dashboard.css b/training/dashboard/static/dashboard.css
index 2b0a7f6..3bd22ab 100644
--- a/training/dashboard/static/dashboard.css
+++ b/training/dashboard/static/dashboard.css
@@ -1067,6 +1067,205 @@ html, body { overflow-anchor: none; }
   color: var(--fg-dim);
 }
 
+/* ─── Problem statement (scene: problem-statement) ─────────────────── */
+.problem-claim {
+  padding: clamp(16px, 2vh, 28px) clamp(18px, 2vw, 28px);
+  background: var(--bg-elev, rgba(255, 255, 255, 0.03));
+  border: 1px solid var(--line);
+  border-left: 4px solid var(--accent);
+  border-radius: 4px;
+}
+.problem-claim-text {
+  font-size: clamp(16px, 1.5vw, 22px);
+  line-height: 1.45;
+  color: var(--fg);
+  font-weight: 500;
+}
+.problem-stats {
+  display: grid;
+  grid-template-columns: repeat(3, 1fr);
+  gap: clamp(10px, 1.4vw, 18px);
+}
+.problem-stat {
+  padding: clamp(14px, 1.8vh, 22px);
+  background: var(--bg-elev, rgba(255, 255, 255, 0.03));
+  border: 1px solid var(--line);
+  border-radius: 4px;
+  display: flex; flex-direction: column; gap: 4px;
+  align-items: flex-start;
+}
+.problem-stat-num {
+  font: 700 clamp(28px, 3.4vw, 44px)
+    ui-monospace, SFMono-Regular, Menlo, monospace;
+  color: var(--accent);
+  line-height: 1;
+}
+.problem-stat-lbl {
+  font-size: clamp(12px, 0.95vw, 14px);
+  color: var(--fg-dim);
+  line-height: 1.35;
+}
+.problem-task {
+  padding: 12px 16px;
+  background: var(--bg);
+  border: 1px solid var(--line);
+  border-radius: 4px;
+  font-size: clamp(13px, 1vw, 15px);
+  color: var(--fg-dim);
+  line-height: 1.5;
+}
+.problem-task-label { color: var(--fg-mute); margin-right: 6px; }
+.problem-task-value { color: var(--fg); font-weight: 600; }
+.problem-task-detail { color: var(--fg-dim); }
+
+/* ─── Research questions (scene: research-questions) ───────────────── */
+.research-grid {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: clamp(12px, 1.6vw, 22px);
+}
+.research-col {
+  padding: clamp(14px, 1.8vh, 22px);
+  background: var(--bg-elev, rgba(255, 255, 255, 0.03));
+  border: 1px solid var(--line);
+  border-radius: 4px;
+}
+.research-col-title {
+  font: 600 clamp(13px, 1.05vw, 15px)
+    ui-monospace, SFMono-Regular, Menlo, monospace;
+  color: var(--accent);
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  margin-bottom: 12px;
+}
+.research-list {
+  list-style: none; padding: 0; margin: 0;
+  display: flex; flex-direction: column; gap: 10px;
+  font-size: clamp(13px, 1vw, 15px);
+  line-height: 1.45;
+  color: var(--fg-dim);
+}
+.research-list li::before {
+  content: '·'; color: var(--accent); margin-right: 8px;
+}
+.research-list strong { color: var(--fg); }
+
+/* ─── Solution overview (scene: solution-overview) ─────────────────── */
+.pipeline-svg {
+  width: 100%;
+  height: clamp(360px, 60vh, 640px);
+  background: var(--bg-elev, rgba(255, 255, 255, 0.03));
+  border: 1px solid var(--line);
+  border-radius: 4px;
+  padding: 12px;
+  box-sizing: border-box;
+}
+.pipeline-stage rect {
+  fill: var(--bg);
+  stroke: var(--accent);
+  stroke-width: 1.5;
+}
+.pipeline-stage-models rect {
+  fill: var(--accent-soft, rgba(80, 140, 220, 0.08));
+  stroke-width: 2;
+}
+.pipeline-stage-final rect {
+  stroke: var(--phase-clean);
+}
+.pipeline-stage text {
+  fill: var(--fg);
+  font: 600 14px ui-monospace, SFMono-Regular, Menlo, monospace;
+}
+.pipeline-stage .pipeline-stage-title { /* two-class selector (0,2,0) outranks `.pipeline-stage text` (0,1,1) */
+  font-size: 16px;
+}
+.pipeline-stage .pipeline-detail { /* secondary caption line inside a stage box */
+  fill: var(--fg-dim);
+  font-weight: 400;
+  font-size: 11px;
+}
+.pipeline-stage .pipeline-detail-mini { /* smallest, most muted caption line */
+  fill: var(--fg-mute);
+  font-weight: 400;
+  font-size: 10px;
+}
+.pipeline-arrow path {
+  stroke: var(--fg-mute);
+  stroke-width: 1.5;
+  stroke-linecap: round;
+  marker-end: url(#pipe-arrow);
+}
+
+/* ─── Evaluation setup (scene: evaluation-setup) ───────────────────── */
+.eval-blocks {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: clamp(10px, 1.4vw, 18px);
+}
+.eval-block {
+  padding: clamp(12px, 1.6vh, 18px);
+  background: var(--bg-elev, rgba(255, 255, 255, 0.03));
+  border: 1px solid var(--line);
+  border-radius: 4px;
+  display: flex; flex-direction: column; gap: 8px;
+}
+.eval-block-title {
+  font: 600 clamp(12px, 0.95vw, 14px)
+    ui-monospace, SFMono-Regular, Menlo, monospace;
+  color: var(--accent);
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+}
+.eval-block-body {
+  display: flex; flex-direction: column; gap: 6px;
+  font-size: clamp(13px, 1vw, 15px);
+  color: var(--fg-dim);
+  line-height: 1.45;
+}
+.eval-block-body strong { color: var(--fg); }
+.eval-detail {
+  margin-top: 4px;
+  color: var(--fg-mute);
+  font-size: clamp(12px, 0.9vw, 13px);
+  font-style: italic;
+}
+
+/* ─── Conclusion + future (scene: conclusion-future) ───────────────── */
+.conclusion-grid {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: clamp(12px, 1.6vw, 22px);
+}
+.conclusion-col {
+  padding: clamp(14px, 1.8vh, 22px);
+  background: var(--bg-elev, rgba(255, 255, 255, 0.03));
+  border: 1px solid var(--line);
+  border-radius: 4px;
+}
+.conclusion-col-title {
+  font: 600 clamp(13px, 1.05vw, 15px)
+    ui-monospace, SFMono-Regular, Menlo, monospace;
+  color: var(--accent);
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  margin-bottom: 12px;
+}
+.conclusion-list {
+  list-style: none; padding: 0; margin: 0;
+  display: flex; flex-direction: column; gap: 10px;
+  font-size: clamp(13px, 1vw, 15px);
+  line-height: 1.45;
+  color: var(--fg-dim);
+}
+.conclusion-list li::before {
+  content: '·'; color: var(--accent); margin-right: 8px;
+}
+.conclusion-list strong { color: var(--fg); }
+
+/* ─── Limitations card uses the motivation-card pattern with an
+   armed-phase marker for the "warning" feel.                      ─── */
+.motivation-card-marker.mc-armed { background: var(--phase-armed); }
+
 /* ─── Live detections (scene: live) ────────────────────────────────── */
 .live-stack { gap: clamp(10px, 1.6vh, 20px); }
 
diff --git a/training/dashboard/static/index.html b/training/dashboard/static/index.html
index b5b356d..3e6abd0 100644
--- a/training/dashboard/static/index.html
+++ b/training/dashboard/static/index.html
@@ -4,7 +4,7 @@
   <meta charset="utf-8">
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <title>CIS490 — live</title>
-  <link rel="stylesheet" href="/static/dashboard.css?v=8675cea9">
+  <link rel="stylesheet" href="/static/dashboard.css?v=0ef6cb6d">
 </head>
 <body>
   <!-- SVG filter defs for the lava-lamp goo effect. Width/height 0
@@ -203,7 +203,143 @@
           </div>
         </div>
 
-        <!-- 3. stack — Python stack & libraries used in the project -->
+        <!-- 3. problem-statement — what we're solving + task type -->
+        <div class="stage-view" data-view="problem-statement">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">the problem · single sentence + numbers</div>
+            <div class="problem-claim">
+              <div class="problem-claim-text">Classify each ten-second window of fleet
+                <code>/proc</code> telemetry into one of five workload phases —
+                accurately enough to drive automated containment.</div>
+            </div>
+            <div class="problem-stats">
+              <div class="problem-stat">
+                <div class="problem-stat-num">5</div>
+                <div class="problem-stat-lbl">phase classes<br><code>clean</code> → <code>infected_running</code></div>
+              </div>
+              <div class="problem-stat">
+                <div class="problem-stat-num">12</div>
+                <div class="problem-stat-lbl"><code>/proc</code> channels<br>no syscalls, no kernel hooks</div>
+              </div>
+              <div class="problem-stat">
+                <div class="problem-stat-num">10s</div>
+                <div class="problem-stat-lbl">classification window<br>100 samples × 12 channels</div>
+              </div>
+            </div>
+            <div class="problem-task">
+              <span class="problem-task-label">task type:</span>
+              <span class="problem-task-value">multi-class classification</span>
+              <span class="problem-task-detail">— five mutually-exclusive
+                phase labels, balanced via class-weighted cross-entropy.
+                Not regression (no continuous target), not ranking
+                (downstream policy is a categorical containment decision).</span>
+            </div>
+          </div>
+        </div>
+
+        <!-- 4. research-questions — literature gaps and questions -->
+        <div class="stage-view" data-view="research-questions">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">literature gaps · positioning the work</div>
+            <div class="research-grid">
+              <div class="research-col">
+                <div class="research-col-title">what prior work covers</div>
+                <ul class="research-list">
+                  <li><strong>LSTM on syscall traces</strong> in VMs —
+                    deeper telemetry than <code>/proc</code></li>
+                  <li><strong>Transformer on per-process resource metrics</strong>
+                    — related signal, single-host eval</li>
+                  <li><strong>BERT on system logs</strong> (LogBERT) —
+                    text-form telemetry, not numeric channels</li>
+                  <li><strong>Insider-threat LSTM on event logs</strong>
+                    (DANTE) — categorical events, not continuous</li>
+                  <li><strong>Network-behaviour trust establishment</strong>
+                    (IEEE 9881803) — cross-device aggregation,
+                    not per-host classifier</li>
+                </ul>
+              </div>
+              <div class="research-col">
+                <div class="research-col-title">what's missing</div>
+                <ul class="research-list">
+                  <li><strong>/proc-only signal</strong> — most work
+                    assumes syscalls or kernel hooks</li>
+                  <li><strong>Cross-host generalization</strong> — eval
+                    splits often hide it (held-out by sample, not host)</li>
+                  <li><strong>Real-time per-window classification</strong>
+                    for containment, not post-hoc batch labelling</li>
+                  <li><strong>Side-by-side cell-choice comparison</strong>
+                    (RNN/GRU/LSTM/CNN/Transformer) on one dataset</li>
+                  <li><strong>Direct integration</strong> with a
+                    fleet-wide trust score, not standalone output</li>
+                </ul>
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <!-- 5. solution-overview — pipeline block diagram -->
+        <div class="stage-view" data-view="solution-overview">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">pipeline · what each stage produces</div>
+            <svg class="pipeline-svg" viewBox="0 0 800 480" xmlns="http://www.w3.org/2000/svg"
+                 preserveAspectRatio="xMidYMid meet" role="img" aria-label="Detection pipeline: fleet hosts feed a receiver, an episode store and a windowing stage; the model zoo then outputs a per-window phase that drives the trust score and containment">
+              <defs><!-- arrowhead for .pipeline-arrow path (marker-end: url(#pipe-arrow)) --><marker id="pipe-arrow" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="7" markerHeight="7" orient="auto"><path d="M0 0 L10 5 L0 10 z" fill="currentColor"/></marker></defs>
+              <g class="pipeline-stage">
+                <rect x="20" y="40" width="140" height="60" rx="4"/>
+                <text x="90" y="68" text-anchor="middle">fleet hosts</text>
+                <text x="90" y="86" text-anchor="middle" class="pipeline-detail">/proc · 10 Hz</text>
+              </g>
+              <g class="pipeline-stage">
+                <rect x="200" y="40" width="140" height="60" rx="4"/>
+                <text x="270" y="68" text-anchor="middle">receiver (Pi)</text>
+                <text x="270" y="86" text-anchor="middle" class="pipeline-detail">bearer auth</text>
+              </g>
+              <g class="pipeline-stage">
+                <rect x="380" y="40" width="140" height="60" rx="4"/>
+                <text x="450" y="68" text-anchor="middle">episode store</text>
+                <text x="450" y="86" text-anchor="middle" class="pipeline-detail">zstd · tar</text>
+              </g>
+              <g class="pipeline-stage">
+                <rect x="560" y="40" width="220" height="60" rx="4"/>
+                <text x="670" y="68" text-anchor="middle">windowing + features</text>
+                <text x="670" y="86" text-anchor="middle" class="pipeline-detail">10 s · 100 samples × 12 ch</text>
+              </g>
+              <g class="pipeline-stage pipeline-stage-models">
+                <rect x="180" y="170" width="440" height="120" rx="4"/>
+                <text x="400" y="198" text-anchor="middle" class="pipeline-stage-title">model zoo</text>
+                <text x="400" y="226" text-anchor="middle" class="pipeline-detail">KNN · GBT · MLP · CNN · RNN · GRU · LSTM · Transformer</text>
+                <text x="400" y="252" text-anchor="middle" class="pipeline-detail">trained per (model × split-recipe)</text>
+                <text x="400" y="276" text-anchor="middle" class="pipeline-detail-mini">cross-host eval · class-weighted CE · early stop on val macro-F1</text>
+              </g>
+              <g class="pipeline-stage">
+                <rect x="60" y="350" width="200" height="60" rx="4"/>
+                <text x="160" y="378" text-anchor="middle">per-window phase</text>
+                <text x="160" y="396" text-anchor="middle" class="pipeline-detail">5-class softmax</text>
+              </g>
+              <g class="pipeline-stage pipeline-stage-final">
+                <rect x="300" y="350" width="200" height="60" rx="4"/>
+                <text x="400" y="378" text-anchor="middle">trust score</text>
+                <text x="400" y="396" text-anchor="middle" class="pipeline-detail">+ network signals (9881803)</text>
+              </g>
+              <g class="pipeline-stage pipeline-stage-final">
+                <rect x="540" y="350" width="220" height="60" rx="4"/>
+                <text x="650" y="378" text-anchor="middle">containment + reset</text>
+                <text x="650" y="396" text-anchor="middle" class="pipeline-detail">snapshot rollback</text>
+              </g>
+              <g class="pipeline-arrow" fill="none">
+                <path d="M160 70 L200 70" />
+                <path d="M340 70 L380 70" />
+                <path d="M520 70 L560 70" />
+                <path d="M670 100 L670 130 L400 130 L400 170" />
+                <path d="M400 290 L400 320 L160 320 L160 350" />
+                <path d="M260 380 L300 380" />
+                <path d="M500 380 L540 380" />
+              </g>
+            </svg>
+          </div>
+        </div>
+
+        <!-- 6. stack — Python stack & libraries used in the project -->
         <div class="stage-view" data-view="stack">
           <div class="metric-stack metric-stack-wide">
             <div class="metric-eyebrow">the stack behind the live data on the right</div>
@@ -316,7 +452,60 @@
           </div>
         </div>
 
-        <!-- 9. models -->
+        <!-- 9. evaluation-setup — splits, metrics, baselines -->
+        <div class="stage-view" data-view="evaluation-setup">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">evaluation setup · how the numbers get made</div>
+            <div class="eval-blocks">
+              <div class="eval-block">
+                <div class="eval-block-title">split recipe</div>
+                <div class="eval-block-body">
+                  <div><strong>train ∪ val:</strong> elliott-thinkpad</div>
+                  <div><strong>test:</strong> k-gamingcom</div>
+                  <div class="eval-detail">held-out by host so the test set
+                    measures cross-device generalization, not in-distribution
+                    self-prediction. A 90 % accuracy that comes from
+                    recognising the host's idle profile is worthless for
+                    a fleet detector.</div>
+                </div>
+              </div>
+              <div class="eval-block">
+                <div class="eval-block-title">primary metric</div>
+                <div class="eval-block-body">
+                  <div><strong>macro-F1</strong> averaged across the five phases</div>
+                  <div class="eval-detail">accuracy lies under class
+                    imbalance — ~50 % <code>infected_running</code>,
+                    ~5 % <code>armed</code>. A constant majority predictor
+                    hits 0.5 accuracy. macro-F1 averages per-class F1,
+                    so rare phases actually count toward the score.</div>
+                </div>
+              </div>
+              <div class="eval-block">
+                <div class="eval-block-title">baselines compared</div>
+                <div class="eval-block-body">
+                  <div><strong>KNN</strong> — non-parametric, instance-based</div>
+                  <div><strong>GBT (XGBoost)</strong> — tabular non-NN</div>
+                  <div><strong>MLP</strong> — feedforward ablation</div>
+                  <div><strong>CNN</strong> — local-pattern ablation</div>
+                  <div><strong>RNN / GRU / LSTM</strong> — recurrent family</div>
+                  <div><strong>Transformer</strong> — attention</div>
+                </div>
+              </div>
+              <div class="eval-block">
+                <div class="eval-block-title">reported alongside macro-F1</div>
+                <div class="eval-block-body">
+                  <div><strong>μs / window</strong> — inference cost at batch=64</div>
+                  <div><strong>cross-host gap</strong> — val − test macro-F1</div>
+                  <div class="eval-detail">latency translates to containment
+                    lag; the gap is the honest measure of generalization.
+                    Both are plotted on the perf scene.</div>
+                </div>
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <!-- 10. models -->
         <div class="stage-view" data-view="models">
           <div class="metric-stack">
             <div class="metric-eyebrow">sequence models · accuracy on held-out samples</div>
@@ -386,7 +575,7 @@
           </div>
         </div>
 
-        <!-- 13. live — fleet-wide live detections feed -->
+        <!-- 14. live — fleet-wide live detections feed -->
         <div class="stage-view" data-view="live">
           <div class="metric-stack metric-stack-wide live-stack">
             <div class="live-stats">
@@ -403,6 +592,228 @@
           </div>
         </div>
 
+        <!-- 15. theoretical — theoretical contributions -->
+        <div class="stage-view" data-view="theoretical">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">theoretical contributions · what's new methodologically</div>
+            <div class="motivation-cards">
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-trust"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">window-centre labelling</div>
+                  <div class="motivation-card-text">A 10-second
+                    classification window is labelled by the phase that
+                    occupies its centre, not by majority vote across the
+                    window. Cleaner training signal at phase boundaries,
+                    and avoids the spurious "ambiguous" class.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-contain"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">schema-hashed checkpoints</div>
+                  <div class="motivation-card-text">Each checkpoint
+                    embeds a hash of the feature schema; loading a model
+                    against the wrong schema fails fast instead of
+                    silently scoring on misaligned columns. Makes
+                    retroactive comparison reproducible.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-recover"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">cross-host as the eval axis</div>
+                  <div class="motivation-card-text">Held-out-by-host
+                    is reported as a first-class number alongside
+                    held-out-by-sample. The two often disagree by 0.4
+                    macro-F1, and only the cross-host number predicts
+                    fleet behaviour.</div>
+                </div>
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <!-- 16. practical — practical contributions -->
+        <div class="stage-view" data-view="practical">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">practical contributions · what others can use</div>
+            <div class="motivation-cards">
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-trust"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">/proc-only deployment</div>
+                  <div class="motivation-card-text">No syscall hooks, no
+                    eBPF, no kernel module — runs on hosts that don't
+                    permit deep instrumentation. The detector is one
+                    Python service plus a model file.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-contain"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">producer-agnostic dashboard</div>
+                  <div class="motivation-card-text">The deck consumes
+                    typed events; the inference loop runs anywhere
+                    (Pi, A100, cloud) and just POSTs back. Same UI for
+                    a lab demo and an operational console.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-recover"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">labelled dataset on disk</div>
+                  <div class="motivation-card-text">78,000+ episodes,
+                    five phases, two hosts, six attack profiles —
+                    archived in zstd-compressed tarballs with a
+                    schema-versioned format. Ready for downstream
+                    work without re-running the orchestrator.</div>
+                </div>
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <!-- 17. design-principles -->
+        <div class="stage-view" data-view="design-principles">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">design principles · patterns that emerged</div>
+            <div class="motivation-cards">
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-trust"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">one loop, many models</div>
+                  <div class="motivation-card-text">Every NN architecture
+                    plugs into the same training loop — class weights,
+                    AMP, cosine LR, early stop. Architecture changes
+                    don't ripple into orchestration.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-contain"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">typed events as contract</div>
+                  <div class="motivation-card-text">Producers and
+                    consumers agree on dataclasses
+                    (<code>events.py</code>), not free-form dicts.
+                    Adding a new scene means adding a new dataclass;
+                    adding a new producer means importing it.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-recover"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">two-agent path ownership</div>
+                  <div class="motivation-card-text">Dashboard work and
+                    model work live in two parallel sessions with a
+                    documented path-ownership boundary. Merges go
+                    through git with explicit rebases instead of a
+                    shared workspace.</div>
+                </div>
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <!-- 18. limitations -->
+        <div class="stage-view" data-view="limitations">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">limitations · the honest list</div>
+            <div class="motivation-cards">
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-armed"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">two-host fleet</div>
+                  <div class="motivation-card-text">Cross-host generalization
+                    is reported between exactly two machines
+                    (elliott-thinkpad → k-gamingcom). N-host claims need
+                    more hosts on the WireGuard mesh.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-armed"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">synthetic attack profiles</div>
+                  <div class="motivation-card-text">Six profiles cover the
+                    main shapes (cpu-saturate, ransomware-lite, bursty-c2,
+                    fork-bomb, crypto-miner, distccd-exec) but real-world
+                    malware can sit between or outside these envelopes.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-armed"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">10 Hz sampling floor</div>
+                  <div class="motivation-card-text">Sub-100 ms attack
+                    behaviours fall within a single sampling interval. Detection of
+                    extremely short-lived attacks (millisecond-scale
+                    privilege checks) requires polling
+                    <code>/proc</code> faster than the current 10 Hz.</div>
+                </div>
+              </div>
+              <div class="motivation-card">
+                <div class="motivation-card-marker mc-armed"></div>
+                <div class="motivation-card-body">
+                  <div class="motivation-card-title">KNN cross-host gap</div>
+                  <div class="motivation-card-text">KNN scores val
+                    macro-F1 ≈ 0.74 on elliott-thinkpad but only 0.13 on
+                    the held-out k-gamingcom. Instance-based memorization
+                    of the training host's feature space — informative
+                    as a baseline, but not a deployment candidate.</div>
+                </div>
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <!-- 19. conclusion-future — summary + unsupervised next steps -->
+        <div class="stage-view" data-view="conclusion-future">
+          <div class="metric-stack metric-stack-wide">
+            <div class="metric-eyebrow">conclusion + future work</div>
+            <div class="conclusion-grid">
+              <div class="conclusion-col">
+                <div class="conclusion-col-title">what we showed</div>
+                <ul class="conclusion-list">
+                  <li>A per-host detector trained on
+                    <strong>/proc-only telemetry</strong> can classify
+                    workload phases at multi-class macro-F1 well above
+                    chance.</li>
+                  <li>Held-out-<strong>by-host</strong> evaluation is the
+                    right generalization axis; held-out-by-sample
+                    overstates real fleet performance by 0.3+ F1.</li>
+                  <li>The recurrent family (LSTM/GRU) and Transformer
+                    sit on the upper-left of the
+                    <strong>accuracy-vs-cost frontier</strong>; KNN and
+                    GBT round out the comparison as honest baselines.</li>
+                  <li>The detector slots into a wider <strong>trust /
+                    containment / recovery</strong> loop — the per-host
+                    verdict isn't the final answer, it's one input.</li>
+                </ul>
+              </div>
+              <div class="conclusion-col">
+                <div class="conclusion-col-title">next steps · unsupervised</div>
+                <ul class="conclusion-list">
+                  <li><strong>Clustering</strong> the unlabeled tail of
+                    new fleet data (KMeans / HDBSCAN) to surface novel
+                    workload shapes the supervised model has no class
+                    for — a self-training feedback loop.</li>
+                  <li><strong>Anomaly detection</strong> on the
+                    last-layer embedding (one-class SVM, isolation forest)
+                    so a "none of the five known phases" verdict is
+                    available alongside the classifier output.</li>
+                  <li><strong>Self-supervised pretraining</strong> on
+                    the much larger pool of unlabeled telemetry from
+                    operational hosts; supervised fine-tune on the
+                    smaller orchestrated dataset.</li>
+                  <li><strong>Embedding visualisation</strong> via
+                    UMAP / t-SNE for human-in-the-loop labelling of
+                    the unlabeled tail (already prototyped in scene 12).</li>
+                </ul>
+              </div>
+            </div>
+          </div>
+        </div>
+
       </div>
       <button id="next-fab" class="fab" data-no-advance title="Next (→)">▼</button>
     </div>
@@ -453,6 +864,79 @@
         </div>
       </section>
 
+      <section class="scene" data-stage="problem-statement">
+        <div class="prose">
+          <h2>Problem statement</h2>
+          <p>Today's behaviour-based intrusion-detection systems rely on
+            syscall traces, kernel hooks, or rich endpoint agents that
+            can't ship to constrained or untrusted hosts. We want a
+            detector that runs on the only telemetry every modern Linux
+            host already exports — <code>/proc</code> — and labels each
+            ten-second window of activity with the phase the workload is in.</p>
+          <p><strong>Research question.</strong> Can a sequence model
+            trained on twelve channels of <code>/proc</code> telemetry
+            classify five workload phases (clean / armed / infecting /
+            infected_running / dormant) accurately enough to drive
+            automated containment, <em>and</em> generalize across hosts
+            and malware profiles it has never seen during training?</p>
+          <p>The task is <strong>multi-class classification</strong>:
+            the target is one of five mutually-exclusive phase labels.
+            Not regression (no continuous target), not ranking
+            (downstream policy is a categorical containment decision).
+            We deliberately chose 10-second windows so detection
+            latency stays bounded for a real fleet.</p>
+        </div>
+      </section>
+
+      <section class="scene" data-stage="research-questions">
+        <div class="prose">
+          <h2>Research gaps + questions</h2>
+          <p>Literature on behaviour-based malware detection is rich but
+            uneven. Most published results either (a) use richer
+            telemetry than what a constrained host actually exports, or
+            (b) frame evaluation in ways that hide the cross-host
+            generalization problem. The card on the left summarises the
+            gap.</p>
+          <p>This project asks three concrete questions:</p>
+          <p><strong>RQ1.</strong> How well can a per-window classifier
+            identify workload phases from <code>/proc</code> alone, with
+            no syscall traces and no kernel hooks?</p>
+          <p><strong>RQ2.</strong> Does the model still work when test
+            episodes come from a host the training set never saw?</p>
+          <p><strong>RQ3.</strong> Of the standard sequence-model
+            families (RNN, GRU, LSTM, CNN, Transformer) plus a
+            non-parametric baseline (KNN) and a tabular baseline
+            (gradient-boosted trees), which trade off accuracy and
+            inference cost best for a deployment that has to run on a
+            constrained host?</p>
+        </div>
+      </section>
+
+      <section class="scene" data-stage="solution-overview">
+        <div class="prose">
+          <h2>Proposed solution</h2>
+          <p>A single end-to-end pipeline turns raw <code>/proc</code>
+            telemetry on a fleet host into a per-window phase verdict
+            in under a second. Each stage of the diagram on the left
+            is a thin, independently-deployable component — the
+            receiver doesn't know what model is running; the model
+            doesn't know where the episode came from.</p>
+          <p>The <strong>model zoo</strong> is the key abstraction:
+            every model class registers itself by name, declares its
+            input kind (summary features or window tensors), and plugs
+            into one shared training loop. KNN, GBT, MLP, CNN, RNN,
+            GRU, LSTM, and Transformer all reuse the same standardization,
+            schema-hashed checkpoint format, class-weighted CE loss,
+            and held-out-by-host evaluation — so the comparison is
+            genuinely apples-to-apples.</p>
+          <p>The detector's per-window verdict feeds two downstream
+            loops: a fleet-wide <strong>trust score</strong> that
+            combines local classification with network-behaviour
+            signals (per IEEE 9881803), and a <strong>fast-recovery</strong>
+            snapshot rollback when an infection time is known.</p>
+        </div>
+      </section>
+
       <section class="scene" data-stage="stack">
         <div class="prose">
           <h2>Live, not staged</h2>
@@ -553,6 +1037,31 @@
         </div>
       </section>
 
+      <section class="scene" data-stage="evaluation-setup">
+        <div class="prose">
+          <h2>Evaluation setup</h2>
+          <p>Three choices anchor every result on the next slides — the
+            split recipe, the primary metric, and what we report
+            alongside it. The temptation is to report a single big
+            number; we report a number you can argue with.</p>
+          <p><strong>Held-out by host.</strong> Train and validate on
+            one machine; test on a different machine. A model that
+            wins by memorising the train host's idle profile loses
+            here, which is what you want — a fleet detector has to
+            generalize across hosts it never saw at training time.</p>
+          <p><strong>Macro-F1, not accuracy.</strong> The dataset is
+            heavily skewed: roughly half the labelled time is
+            <code>infected_running</code> and only ~5 % is
+            <code>armed</code>. A "predict the majority class"
+            baseline already hits 0.5 accuracy. Macro-F1 averages F1
+            across all five phases so rare classes count.</p>
+          <p><strong>Latency reported with accuracy.</strong> A model
+            that's one F1 point better but ten milliseconds slower
+            may still be the wrong choice for an on-host detector.
+            The perf scene plots both axes so the trade-off is visible.</p>
+        </div>
+      </section>
+
       <section class="scene" data-stage="models">
         <div class="prose">
           <h2>Sequence models</h2>
@@ -632,6 +1141,141 @@
         </div>
       </section>
 
+      <section class="scene" data-stage="theoretical">
+        <div class="prose">
+          <h2>Theoretical contributions</h2>
+          <p>Three methodological claims this project makes — small in
+            isolation, but together they change how the comparison is
+            run. Each shows up explicitly in the codebase.</p>
+          <p><strong>Window-centre labelling.</strong> Instead of
+            majority-voting phase labels across each 10-second window
+            (which creates noisy boundaries), we label each window by
+            the phase that occupies its centre. Cleaner training
+            signal at transitions, no spurious "ambiguous" class.</p>
+          <p><strong>Schema-hashed checkpoints.</strong> Every
+            checkpoint embeds a hash of the feature schema it was
+            trained on. Loading a model against a different schema
+            fails fast. Without this, retroactive comparison silently
+            scores models on misaligned columns and reports nonsense.</p>
+          <p><strong>Cross-host as the eval axis.</strong>
+            Held-out-by-host is reported as a first-class number
+            alongside held-out-by-sample — the two often disagree by
+            ~0.4 macro-F1, and only the cross-host number predicts
+            real fleet behaviour.</p>
+        </div>
+      </section>
+
+      <section class="scene" data-stage="practical">
+        <div class="prose">
+          <h2>Practical contributions</h2>
+          <p>What others can pick up and use from this project — beyond
+            the published numbers.</p>
+          <p><strong>/proc-only deployment.</strong> The detector needs
+            no syscall hooks, no eBPF, no kernel module. It runs on
+            hosts that don't permit deeper instrumentation — a small
+            VM, a container with limited capabilities, an embedded
+            device. One Python service plus a model file.</p>
+          <p><strong>Producer-agnostic dashboard.</strong> The deck
+            consumes typed events
+            (<code>training/dashboard/events.py</code>); the inference
+            loop runs anywhere — Pi, A100, cloud — and just POSTs back.
+            Same UI for a lab demo and an operational console.</p>
+          <p><strong>Labelled dataset on disk.</strong> 78 000+
+            episodes across two hosts and six attack profiles, archived
+            in zstd-compressed tarballs with a schema-versioned format.
+            Anyone reproducing or extending this work can start from
+            the dataset directly without re-running the orchestrator.</p>
+        </div>
+      </section>
+
+      <section class="scene" data-stage="design-principles">
+        <div class="prose">
+          <h2>Design principles</h2>
+          <p>Three patterns that emerged during the project and proved
+            useful enough that we'd repeat them.</p>
+          <p><strong>One loop, many models.</strong> Every NN
+            architecture plugs into the same training loop — class
+            weights, AMP autocast, cosine LR with warmup, gradient
+            clipping, early stop on val macro-F1. Architecture changes
+            don't ripple into orchestration, and adding a new model
+            class costs ~80 lines.</p>
+          <p><strong>Typed events as contract.</strong> Producers and
+            consumers agree on dataclasses, not free-form dicts.
+            Adding a new dashboard scene means adding a new dataclass;
+            adding a new producer means importing it. Static checking
+            and editor autocomplete do most of the work that a
+            schema-validation library would do at runtime.</p>
+          <p><strong>Two-agent path ownership.</strong> Dashboard work
+            and model work live in two parallel sessions with a
+            documented path-ownership boundary
+            (<code>training/dashboard/</code> vs everywhere else).
+            Merges go through git with explicit rebases instead of a
+            shared workspace — slow up front, fewer subtle stomps
+            over time.</p>
+        </div>
+      </section>
+
+      <section class="scene" data-stage="limitations">
+        <div class="prose">
+          <h2>Limitations</h2>
+          <p>What this project cannot honestly claim — and why each
+            line on the left matters for how the results should be read.</p>
+          <p><strong>Two-host fleet.</strong> Cross-host generalization
+            is reported between exactly two machines; it's the right
+            <em>shape</em> of evaluation but not a population claim.
+            More hosts on the WireGuard mesh would let us report
+            distributional bounds rather than single point comparisons.</p>
+          <p><strong>Synthetic attack profiles.</strong> Our six
+            profiles cover the main behavioural envelopes
+            (cpu-saturate, ransomware-lite, bursty-c2, fork-bomb,
+            crypto-miner, distccd-exec) but real-world malware can
+            sit between or outside these envelopes. Generalization to
+            unseen profiles is reported via held-out-by-sample, but
+            in-the-wild distribution shift is unknown.</p>
+          <p><strong>10 Hz sampling floor.</strong> Sub-100ms
+            behaviours fall inside a single sample. Detection of
+            millisecond-scale privilege checks would need faster
+            telemetry than <code>/proc</code> provides.</p>
+          <p><strong>KNN cross-host gap.</strong> KNN scores val
+            macro-F1 ≈ 0.74 on the train host but only ≈ 0.13 on the
+            held-out one. Instance-based memorization of the training
+            host's feature space — informative as a baseline, not a
+            deployment candidate.</p>
+        </div>
+      </section>
+
+      <section class="scene" data-stage="conclusion-future">
+        <div class="prose">
+          <h2>Conclusion + future work</h2>
+          <p>A per-host classifier trained on <code>/proc</code>-only
+            telemetry can identify workload phases at multi-class
+            macro-F1 well above chance and slot into a wider
+            trust / containment / recovery loop. The recurrent family
+            (LSTM/GRU) and Transformer sit on the upper-left of the
+            accuracy-vs-cost frontier; KNN and GBT are honest baselines.
+            Held-out-by-host evaluation is the right generalization
+            axis — held-out-by-sample overstates real fleet
+            performance by 0.3+ F1.</p>
+          <p><strong>Next steps.</strong> The natural extensions are
+            unsupervised:</p>
+          <p>• <strong>Clustering</strong> the unlabeled tail of new
+            fleet data (KMeans / HDBSCAN) to surface novel workload
+            shapes the supervised model has no class for — a
+            self-training feedback loop that enrolls new phases as
+            the fleet grows.</p>
+          <p>• <strong>Anomaly detection</strong> on the last-layer
+            embedding (one-class SVM, isolation forest) so a "none of
+            the five known phases" verdict is available alongside the
+            classifier output.</p>
+          <p>• <strong>Self-supervised pretraining</strong> on the much
+            larger pool of unlabeled telemetry from operational hosts;
+            supervised fine-tune on the smaller orchestrated dataset.</p>
+          <p>• <strong>Embedding visualisation</strong> via UMAP /
+            t-SNE for human-in-the-loop labelling — already prototyped
+            in the KNN scene's interactive 3-D scatter.</p>
+        </div>
+      </section>
+
       <section class="scene" data-stage="references">
         <div class="prose">
           <h2>References</h2>