diff --git a/references/links.md b/references/links.md index 14a2e5a..a7f393e 100644 --- a/references/links.md +++ b/references/links.md @@ -1,6 +1,10 @@ # Reference Links -- https://github.com/mitre/caldera?tab=security-ov-file -- https://github.com/PiyushxJangid/DLHIDS -- https://github.com/ArpanDFrank/Host-Intrusion-Detection-System-using-Hybrid-CNN-LSTM-Models-and-RL-Actor-Critic-Models +- https://github.com/mitre/caldera?tab=security-ov-file — adversary emulation framework (red-team/blue-team) +- https://github.com/PiyushxJangid/DLHIDS — deep-learning HIDS reference implementation +- https://github.com/ArpanDFrank/Host-Intrusion-Detection-System-using-Hybrid-CNN-LSTM-Models-and-RL-Actor-Critic-Models — Hybrid CNN-LSTM + RL HIDS prior art - https://ieeexplore.ieee.org/document/9881803 — per-device trust establishment from network behaviour (cited on motivation scene) +- https://pytorch.org/docs/stable/index.html — PyTorch reference (LSTM / GRU / CNN / Transformer module APIs used by the model zoo) +- https://xgboost.readthedocs.io/en/stable/ — XGBoost reference (gradient-boosted-trees baseline; Chen & Guestrin, KDD 2016) +- https://scikit-learn.org/stable/ — scikit-learn reference (KNN, KMeans, PCA, evaluation metrics) +- https://man7.org/linux/man-pages/man5/proc.5.html — proc(5) — the Linux kernel interface this project's telemetry comes from diff --git a/training/dashboard/static/dashboard.css b/training/dashboard/static/dashboard.css index 2b0a7f6..3bd22ab 100644 --- a/training/dashboard/static/dashboard.css +++ b/training/dashboard/static/dashboard.css @@ -1067,6 +1067,205 @@ html, body { overflow-anchor: none; } color: var(--fg-dim); } +/* ─── Problem statement (scene: problem-statement) ─────────────────── */ +.problem-claim { + padding: clamp(16px, 2vh, 28px) clamp(18px, 2vw, 28px); + background: var(--bg-elev, rgba(255, 255, 255, 0.03)); + border: 1px solid var(--line); + border-left: 4px solid var(--accent); + border-radius: 4px; +} 
+.problem-claim-text { + font-size: clamp(16px, 1.5vw, 22px); + line-height: 1.45; + color: var(--fg); + font-weight: 500; +} +.problem-stats { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: clamp(10px, 1.4vw, 18px); +} +.problem-stat { + padding: clamp(14px, 1.8vh, 22px); + background: var(--bg-elev, rgba(255, 255, 255, 0.03)); + border: 1px solid var(--line); + border-radius: 4px; + display: flex; flex-direction: column; gap: 4px; + align-items: flex-start; +} +.problem-stat-num { + font: 700 clamp(28px, 3.4vw, 44px) + ui-monospace, SFMono-Regular, Menlo, monospace; + color: var(--accent); + line-height: 1; +} +.problem-stat-lbl { + font-size: clamp(12px, 0.95vw, 14px); + color: var(--fg-dim); + line-height: 1.35; +} +.problem-task { + padding: 12px 16px; + background: var(--bg); + border: 1px solid var(--line); + border-radius: 4px; + font-size: clamp(13px, 1vw, 15px); + color: var(--fg-dim); + line-height: 1.5; +} +.problem-task-label { color: var(--fg-mute); margin-right: 6px; } +.problem-task-value { color: var(--fg); font-weight: 600; } +.problem-task-detail { color: var(--fg-dim); } + +/* ─── Research questions (scene: research-questions) ───────────────── */ +.research-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: clamp(12px, 1.6vw, 22px); +} +.research-col { + padding: clamp(14px, 1.8vh, 22px); + background: var(--bg-elev, rgba(255, 255, 255, 0.03)); + border: 1px solid var(--line); + border-radius: 4px; +} +.research-col-title { + font: 600 clamp(13px, 1.05vw, 15px) + ui-monospace, SFMono-Regular, Menlo, monospace; + color: var(--accent); + letter-spacing: 0.04em; + text-transform: uppercase; + margin-bottom: 12px; +} +.research-list { + list-style: none; padding: 0; margin: 0; + display: flex; flex-direction: column; gap: 10px; + font-size: clamp(13px, 1vw, 15px); + line-height: 1.45; + color: var(--fg-dim); +} +.research-list li::before { + content: '·'; color: var(--accent); margin-right: 8px; +} +.research-list 
strong { color: var(--fg); } + +/* ─── Solution overview (scene: solution-overview) ─────────────────── */ +.pipeline-svg { + width: 100%; + height: clamp(360px, 60vh, 640px); + background: var(--bg-elev, rgba(255, 255, 255, 0.03)); + border: 1px solid var(--line); + border-radius: 4px; + padding: 12px; + box-sizing: border-box; +} +.pipeline-stage rect { + fill: var(--bg); + stroke: var(--accent); + stroke-width: 1.5; +} +.pipeline-stage-models rect { + fill: var(--accent-soft, rgba(80, 140, 220, 0.08)); + stroke-width: 2; +} +.pipeline-stage-final rect { + stroke: var(--phase-clean); +} +.pipeline-stage text { + fill: var(--fg); + font: 600 14px ui-monospace, SFMono-Regular, Menlo, monospace; +} +.pipeline-stage-title { + font-size: 16px !important; +} +.pipeline-detail { + fill: var(--fg-dim) !important; + font-weight: 400 !important; + font-size: 11px !important; +} +.pipeline-detail-mini { + fill: var(--fg-mute) !important; + font-weight: 400 !important; + font-size: 10px !important; +} +.pipeline-arrow path { + stroke: var(--fg-mute); + stroke-width: 1.5; + stroke-linecap: round; + marker-end: url(#pipe-arrow); +} + +/* ─── Evaluation setup (scene: evaluation-setup) ───────────────────── */ +.eval-blocks { + display: grid; + grid-template-columns: 1fr 1fr; + gap: clamp(10px, 1.4vw, 18px); +} +.eval-block { + padding: clamp(12px, 1.6vh, 18px); + background: var(--bg-elev, rgba(255, 255, 255, 0.03)); + border: 1px solid var(--line); + border-radius: 4px; + display: flex; flex-direction: column; gap: 8px; +} +.eval-block-title { + font: 600 clamp(12px, 0.95vw, 14px) + ui-monospace, SFMono-Regular, Menlo, monospace; + color: var(--accent); + letter-spacing: 0.04em; + text-transform: uppercase; +} +.eval-block-body { + display: flex; flex-direction: column; gap: 6px; + font-size: clamp(13px, 1vw, 15px); + color: var(--fg-dim); + line-height: 1.45; +} +.eval-block-body strong { color: var(--fg); } +.eval-detail { + margin-top: 4px; + color: var(--fg-mute); + 
font-size: clamp(12px, 0.9vw, 13px); + font-style: italic; +} + +/* ─── Conclusion + future (scene: conclusion-future) ───────────────── */ +.conclusion-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: clamp(12px, 1.6vw, 22px); +} +.conclusion-col { + padding: clamp(14px, 1.8vh, 22px); + background: var(--bg-elev, rgba(255, 255, 255, 0.03)); + border: 1px solid var(--line); + border-radius: 4px; +} +.conclusion-col-title { + font: 600 clamp(13px, 1.05vw, 15px) + ui-monospace, SFMono-Regular, Menlo, monospace; + color: var(--accent); + letter-spacing: 0.04em; + text-transform: uppercase; + margin-bottom: 12px; +} +.conclusion-list { + list-style: none; padding: 0; margin: 0; + display: flex; flex-direction: column; gap: 10px; + font-size: clamp(13px, 1vw, 15px); + line-height: 1.45; + color: var(--fg-dim); +} +.conclusion-list li::before { + content: '·'; color: var(--accent); margin-right: 8px; +} +.conclusion-list strong { color: var(--fg); } + +/* ─── Limitations card uses the motivation-card pattern with an + armed-phase marker for the "warning" feel. ─── */ +.motivation-card-marker.mc-armed { background: var(--phase-armed); } + /* ─── Live detections (scene: live) ────────────────────────────────── */ .live-stack { gap: clamp(10px, 1.6vh, 20px); } diff --git a/training/dashboard/static/index.html b/training/dashboard/static/index.html index b5b356d..3e6abd0 100644 --- a/training/dashboard/static/index.html +++ b/training/dashboard/static/index.html @@ -4,7 +4,7 @@
/proc telemetry into one of five workload phases —
+ accurately enough to drive automated containment.clean → infected_running/proc channels/procinfected_running,
+ ~5 % armed. A constant majority predictor
+ hits 0.5 accuracy. macro-F1 averages per-class F1,
+ so rare phases actually count toward the score.events.py), not free-form dicts.
+ Adding a new scene means adding a new dataclass;
+ adding a new producer means importing it./proc currently provides.Today's behaviour-based IDS systems rely on syscall traces,
+ kernel hooks, or rich endpoint agents that can't ship to
+ constrained or untrusted hosts. We want a detector that
+ runs on the only telemetry every modern Linux already
+ exports — /proc — and labels each ten-second
+ window of activity with the phase the workload is in.
Research question. Can a sequence model
+ trained on twelve channels of /proc telemetry
+ classify five workload phases (clean / armed / infecting /
+ infected_running / dormant) accurately enough to drive
+ automated containment, and generalize across hosts
+ and malware profiles it has never seen during training?
The task is multi-class classification: + the target is one of five mutually-exclusive phase labels. + Not regression (no continuous target), not ranking + (downstream policy is a categorical containment decision). + We deliberately chose 10-second windows so detection + latency stays bounded for a real fleet.
+Literature on behaviour-based malware detection is rich but + uneven. Most published results either (a) use richer + telemetry than what a constrained host actually exports, or + (b) frame evaluation in ways that hide the cross-host + generalization problem. The card on the left summarises the + gap.
+This project asks three concrete questions:
+RQ1. How well can a per-window classifier
+ identify workload phases from /proc alone, with
+ no syscall traces and no kernel hooks?
RQ2. Does the model still work when test + episodes come from a host the training set never saw?
+RQ3. Of the standard sequence-model + families (RNN, GRU, LSTM, CNN, Transformer) plus a + non-parametric baseline (KNN) and a tabular baseline + (gradient-boosted trees), which trade off accuracy and + inference cost best for a deployment that has to run on a + constrained host?
+A single end-to-end pipeline turns raw /proc
+ telemetry on a fleet host into a per-window phase verdict
+ in under a second. Each stage of the diagram on the left
+ is a thin, independently-deployable component — the
+ receiver doesn't know what model is running; the model
+ doesn't know where the episode came from.
The model zoo is the key abstraction: + every model class registers itself by name, declares its + input kind (summary features or window tensors), and plugs + into one shared training loop. KNN, GBT, MLP, CNN, RNN, + GRU, LSTM, and Transformer all reuse the same standardization, + schema-hashed checkpoint format, class-weighted CE loss, + and held-out-by-host evaluation — so the comparison is + genuinely apples-to-apples.
+The detector's per-window verdict feeds two downstream + loops: a fleet-wide trust score that + combines local classification with network-behaviour + signals (per IEEE 9881803), and a fast-recovery + snapshot rollback when an infection time is known.
+Three choices anchor every result on the next slides — the + split recipe, the primary metric, and what we measure alongside + accuracy. The temptation is to report a single big + number; we report a number you can argue with.
+Held-out by host. Train and validate on + one machine; test on a different machine. A model that + wins by memorising the train host's idle profile loses + here, which is what you want — a fleet detector has to + generalize across hosts it never saw at training time.
+Macro-F1, not accuracy. The dataset is
+ heavily skewed: roughly half the labelled time is
+ infected_running and only ~5 % is
+ armed. A "predict the majority class"
+ baseline already hits 0.5 accuracy. Macro-F1 averages F1
+ across all five phases so rare classes count.
Latency reported with accuracy. A model + that's one F1 point better but ten milliseconds slower + may still be the wrong choice for an on-host detector. + The perf scene plots both axes so the trade-off is visible.
+Three methodological claims this project makes — small in + isolation, but together they change how the comparison is + run. Each shows up explicitly in the codebase.
+Window-centre labelling. Instead of + majority-voting phase labels across each 10-second window + (which creates noisy boundaries), we label each window by + the phase that occupies its centre. Cleaner training + signal at transitions, no spurious "ambiguous" class.
+Schema-hashed checkpoints. Every + checkpoint embeds a hash of the feature schema it was + trained on. Loading a model against a different schema + fails fast. Without this, retroactive comparison silently + scores models on misaligned columns and reports nonsense.
+Cross-host as the eval axis. + Held-out-by-host is reported as a first-class number + alongside held-out-by-sample — the two often disagree by + ~0.4 macro-F1, and only the cross-host number predicts + real fleet behaviour.
+What others can pick up and use from this project — beyond + the published numbers.
+/proc-only deployment. The detector needs + no syscall hooks, no eBPF, no kernel module. It runs on + hosts that don't permit deeper instrumentation — a small + VM, a container with limited capabilities, an embedded + device. One Python service plus a model file.
+Producer-agnostic dashboard. The deck
+ consumes typed events
+ (training/dashboard/events.py); the inference
+ loop runs anywhere — Pi, A100, cloud — and just POSTs back.
+ Same UI for a lab demo and an operational console.
Labelled dataset on disk. 78 000+ + episodes across two hosts and six attack profiles, archived + in zstd-compressed tarballs with a schema-versioned format. + Anyone reproducing or extending this work can start from + the dataset directly without re-running the orchestrator.
+Three patterns that emerged during the project and proved + useful enough that we'd repeat them.
+One loop, many models. Every NN + architecture plugs into the same training loop — class + weights, AMP autocast, cosine LR with warmup, gradient + clipping, early stop on val macro-F1. Architecture changes + don't ripple into orchestration, and adding a new model + class costs ~80 lines.
+Typed events as contract. Producers and + consumers agree on dataclasses, not free-form dicts. + Adding a new dashboard scene means adding a new dataclass; + adding a new producer means importing it. Static checking + and editor autocomplete do most of the work that a + schema-validation library would do at runtime.
+Two-agent path ownership. Dashboard work
+ and model work live in two parallel sessions with a
+ documented path-ownership boundary
+ (training/dashboard/ vs everywhere else).
+ Merges go through git with explicit rebases instead of a
+ shared workspace — slow up front, fewer subtle stomps
+ over time.
What this project cannot honestly claim — and why each + line on the left matters for how the results should be read.
+Two-host fleet. Cross-host generalization + is reported between exactly two machines; it's the right + shape of evaluation but not a population claim. + More hosts on the WireGuard mesh would let us report + distributional bounds rather than single-point comparisons.
+Synthetic attack profiles. Our six + profiles cover the main behavioural envelopes + (cpu-saturate, ransomware-lite, bursty-c2, fork-bomb, + crypto-miner, distccd-exec) but real-world malware can + sit between or outside these envelopes. Generalization to + unseen profiles is reported via held-out-by-sample, but + in-the-wild distribution shift is unknown.
+10 Hz sampling floor. Sub-100ms
+ behaviours fall inside a single sample. Detection of
+ millisecond-scale privilege checks would need faster
+ telemetry than /proc provides.
KNN cross-host gap. KNN scores val + macro-F1 ≈ 0.74 on the train host but only ≈ 0.13 on the + held-out one. This is instance-based memorization of the training + host's feature space — informative as a baseline, but not a + deployment candidate.
+A per-host classifier trained on /proc-only
+ telemetry can identify workload phases at multi-class
+ macro-F1 well above chance and slot into a wider
+ trust / containment / recovery loop. The recurrent family
+ (LSTM/GRU) and Transformer sit on the upper-left of the
+ accuracy-vs-cost frontier; KNN and GBT are honest baselines.
+ Held-out-by-host evaluation is the right generalization
+ axis — held-out-by-sample overstates real fleet
+ performance by 0.3+ F1.
Unsupervised next steps. The natural + extensions are unsupervised:
+• Clustering the unlabeled tail of new + fleet data (KMeans / HDBSCAN) to surface novel workload + shapes the supervised model has no class for — a + self-training feedback loop that enrolls new phases as + the fleet grows.
+• Anomaly detection on the last-layer + embedding (one-class SVM, isolation forest) so a "none of + the five known phases" verdict is available alongside the + classifier output.
+• Self-supervised pretraining on the much + larger pool of unlabeled telemetry from operational hosts; + supervised fine-tune on the smaller orchestrated dataset.
+• Embedding visualisation via UMAP / + t-SNE for human-in-the-loop labelling — already prototyped + in the KNN scene's interactive 3-D scatter.
+