baseline: phase mix from sampled dataset, not 5-min window

The widget was waiting on live `phase` events that don't flow when no
orchestrator is running, so it sat empty. Replace the rolling
5-minute window with a periodic feeder that samples 500 random
episode tarballs from /var/lib/cis490/episodes, extracts each
labels.jsonl, and aggregates phase durations using consecutive
t_mono_ns deltas. Result lands in broadcaster.state["phase_mix"]
(survives snapshot cycles via dict.update) and re-broadcasts every
~10 min.

Frontend reads phase_mix from snapshot on connect and from live
phase_mix events on refresh; the bar uses time-weighted proportions
when available (falls back to label counts), and only sums canonical
phases for the denominator so non-displayed `failed` records don't
shrink the visible bars. Eyebrow and sub-line update with live
sample/population/label counts.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Max Gorog 2026-05-08 13:04:30 -05:00
parent ac9b5b6f07
commit 51f2437b71
3 changed files with 260 additions and 23 deletions

View file

@ -23,6 +23,9 @@ import asyncio
import json
import logging
import os
import random
import subprocess
import tarfile
from pathlib import Path
from typing import Any, Awaitable, Callable
@ -252,7 +255,9 @@ async def snapshot_loop(
snap = await asyncio.to_thread(
_snapshot_state, data_root, index_path, alerts_path
)
broadcaster.state = snap
# `update` instead of `=` so out-of-band keys (phase_mix,
# anything future loops add) survive snapshot cycles.
broadcaster.state.update(snap)
if first:
log.info(
"snapshot: total_episodes=%d total_alerts=%d hosts=%d",
@ -267,6 +272,205 @@ async def snapshot_loop(
await asyncio.sleep(poll_interval)
# ─────────────────────────────────────────────────────────────────────
# Phase mix (dataset)
# ─────────────────────────────────────────────────────────────────────
#
# The baseline scene shows the proportion of time the workload spent
# in each labelled phase. Originally a rolling 5-min window of live
# `phase` events — but live events only flow when the orchestrator is
# running. To keep the slide reflecting *actual data* we sample N
# random episode tarballs on disk, extract the labels.jsonl from
# each (which is a list of phase-transition events stamped with
# t_mono_ns), and aggregate phase durations across the sample.
PHASE_MIX_SAMPLE = 500
PHASE_MIX_INTERVAL = 600.0 # seconds; ~10 min
def _read_episode_labels(path: Path, *, timeout: float = 10.0) -> list[dict] | None:
"""Stream-extract labels.jsonl from an episode tarball.
Uses ``zstd -dc | tarfile r|`` so we can break out of the stream as
soon as labels.jsonl appears (it's near the front of the tar) and
kill the zstd subprocess immediately after avoiding a full
decompress when we only need ~10 lines.
"""
if not path.is_file():
return None
try:
proc = subprocess.Popen(
["zstd", "-dc", str(path)],
stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
)
except OSError:
return None
out: list[dict] | None = None
try:
with tarfile.open(fileobj=proc.stdout, mode="r|") as tar:
for member in tar:
if not member.isfile():
continue
name = member.name.rsplit("/", 1)[-1]
if name == "labels.jsonl":
f = tar.extractfile(member)
if f is None:
continue
data = f.read()
out = []
for line in data.splitlines():
line = line.strip()
if not line:
continue
try:
out.append(json.loads(line))
except json.JSONDecodeError:
pass
break
except (tarfile.TarError, OSError):
out = None
finally:
if proc.stdout:
try: proc.stdout.close()
except Exception: pass
try: proc.kill() # short-circuit zstd if labels was the only file we needed
except Exception: pass
try: proc.wait(timeout=timeout)
except Exception: pass
return out
def _aggregate_episode_labels(
records: list[dict],
counts: dict[str, int],
weighted: dict[str, float],
) -> int:
"""Fold one episode's labels into running counters.
Each record is a phase-transition event with ``t_mono_ns``. The
duration of phase ``rec[i].phase`` is ``t_mono_ns[i+1] -
t_mono_ns[i]``; the trailing record gets a count bump but no
duration weight (we don't know its end without meta.json, and the
last phase is usually short ``infected_running`` cleanup so the
bias is acceptable for proportional display).
"""
if not records:
return 0
rec = sorted(records, key=lambda r: r.get("t_mono_ns") or 0)
n_labels = 0
if len(rec) == 1:
p = rec[0].get("phase")
if p:
counts[p] = counts.get(p, 0) + 1
n_labels = 1
return n_labels
for i in range(len(rec) - 1):
p = rec[i].get("phase")
if not p:
continue
t0 = rec[i].get("t_mono_ns")
t1 = rec[i + 1].get("t_mono_ns")
if not (isinstance(t0, (int, float)) and isinstance(t1, (int, float))):
continue
dur_s = max(0.0, (float(t1) - float(t0)) / 1e9)
weighted[p] = weighted.get(p, 0.0) + dur_s
counts[p] = counts.get(p, 0) + 1
n_labels += 1
tail = rec[-1].get("phase")
if tail:
counts[tail] = counts.get(tail, 0) + 1
n_labels += 1
return n_labels
def _compute_phase_mix(data_root: Path, sample: int = PHASE_MIX_SAMPLE) -> dict:
"""Sample N random episodes, aggregate their phase durations.
Returns ``{}`` if no episodes are on disk yet. Returns a dict with
``counts``, ``weighted_seconds``, ``sampled_episodes``,
``population_episodes``, and ``total_labels`` otherwise.
"""
episodes_root = data_root / "episodes"
if not episodes_root.is_dir():
return {}
files: list[Path] = []
try:
for host_dir in episodes_root.iterdir():
if not host_dir.is_dir():
continue
try:
for entry in host_dir.iterdir():
if entry.is_file() and entry.name.endswith(".tar.zst"):
files.append(entry)
except OSError:
continue
except OSError:
return {}
if not files:
return {}
population = len(files)
chosen = files if sample >= population else random.sample(files, sample)
counts: dict[str, int] = {}
weighted: dict[str, float] = {}
sampled_episodes = 0
total_labels = 0
for path in chosen:
labels = _read_episode_labels(path)
if not labels:
continue
added = _aggregate_episode_labels(labels, counts, weighted)
if added:
sampled_episodes += 1
total_labels += added
return {
"counts": counts,
"weighted_seconds": weighted,
"sampled_episodes": sampled_episodes,
"population_episodes": population,
"total_labels": total_labels,
}
async def phase_mix_loop(
broadcaster,
*,
data_root: Path,
poll_interval: float = PHASE_MIX_INTERVAL,
sample: int = PHASE_MIX_SAMPLE,
) -> None:
"""Recompute the dataset phase mix on a slow timer.
Lives off the main event loop via ``to_thread`` because the zstd
decompress + tar parse for a few hundred episodes takes long
enough to be noticeable on a Pi (still typically < 30 s).
"""
while True:
try:
mix = await asyncio.to_thread(_compute_phase_mix, data_root, sample)
if mix:
broadcaster.state["phase_mix"] = mix
await broadcaster.publish({"type": "phase_mix", **mix})
log.info(
"phase_mix: %d/%d episodes sampled, %d labels, "
"weighted=%s",
mix.get("sampled_episodes", 0),
mix.get("population_episodes", 0),
mix.get("total_labels", 0),
{k: round(v, 1) for k, v in mix.get("weighted_seconds", {}).items()},
)
else:
log.info("phase_mix: no episodes on disk yet, retrying")
except asyncio.CancelledError:
raise
except Exception:
log.exception("phase_mix_loop error")
await asyncio.sleep(poll_interval)
# ─────────────────────────────────────────────────────────────────────
# Lifecycle
# ─────────────────────────────────────────────────────────────────────
@ -288,5 +492,8 @@ def start_feeders(broadcaster, *, data_root: Path = DEFAULT_DATA_ROOT) -> list[a
asyncio.create_task(
watch_alerts_jsonl(publish, alerts_path),
name="cis490.feeder.alerts"),
asyncio.create_task(
phase_mix_loop(broadcaster, data_root=data_root),
name="cis490.feeder.phase_mix"),
]
return tasks

View file

@ -1132,14 +1132,19 @@ for epoch in range(20):
});
})();
// ── Phase mix (rolling 5 min) ─────────────────────────────────
// Real-data widget. Will be empty until phase events flow.
// ── Phase mix (dataset-derived) ───────────────────────────────
// Real-data widget. Driven by the dashboard's `phase_mix` feeder,
// which periodically samples random episode tarballs on disk and
// aggregates labels.jsonl phase durations across them. The feeder
// tucks the result into `broadcaster.state["phase_mix"]` so it
// arrives in the snapshot the WS sends on connect, and republishes
// a `phase_mix` event each time it recomputes.
(function () {
const stack = document.getElementById('phase-stack');
const legend = document.getElementById('phase-legend');
const eyebrow = document.getElementById('phase-mix-eyebrow');
const sub = document.getElementById('phase-mix-sub');
const PHASES = ['clean', 'armed', 'infecting', 'infected_running', 'dormant'];
const WINDOW_MS = 5 * 60 * 1000;
const samples = [];
const segs = new Map();
PHASES.forEach(p => {
@ -1151,21 +1156,45 @@ for epoch in range(20):
legend.appendChild(li);
});
function render() {
const now = Date.now();
while (samples.length && now - samples[0].t > WINDOW_MS) samples.shift();
const counts = Object.fromEntries(PHASES.map(p => [p, 0]));
samples.forEach(s => { if (counts[s.phase] !== undefined) counts[s.phase]++; });
const total = Math.max(1, samples.length);
PHASES.forEach(p => { segs.get(p).style.flexGrow = (counts[p] / total).toFixed(4); });
function fmtInt(n) { return (typeof n === 'number') ? n.toLocaleString() : '—'; }
function applyMix(mix) {
if (!mix) return;
const w = mix.weighted_seconds || {};
const c = mix.counts || {};
// Prefer time-weighted proportions; fall back to label counts.
const useWeighted = Object.values(w).some(v => v > 0);
const src = useWeighted ? w : c;
// Sum only the canonical phases so non-displayed phases (e.g.
// `failed` from the orchestrator) don't shrink the visible bars.
const total = PHASES.reduce((a, p) => a + (src[p] || 0), 0) || 1;
PHASES.forEach(p => {
segs.get(p).style.flexGrow = ((src[p] || 0) / total).toFixed(4);
});
if (eyebrow) {
const tag = useWeighted ? 'time-weighted' : 'label-count';
eyebrow.textContent =
`phase mix · ${fmtInt(mix.sampled_episodes)} of ${fmtInt(mix.population_episodes)} episodes · ${tag}`;
}
if (sub) {
const hours = useWeighted
? Math.round(PHASES.reduce((a, p) => a + (w[p] || 0), 0) / 3600)
: null;
sub.innerHTML =
`Aggregated across <strong>${fmtInt(mix.sampled_episodes)}</strong> ` +
`randomly-sampled episodes ` +
`(<strong>${fmtInt(mix.total_labels)}</strong> phase records` +
(hours != null ? `, ~<strong>${fmtInt(hours)}</strong> hours` : '') +
`). Refreshes every ~10 min from disk.`;
}
}
on('phase', m => {
if (!m.phase) return;
samples.push({ phase: m.phase, t: Date.now() }); render();
on('snapshot', m => { if (m.phase_mix) applyMix(m.phase_mix); });
on('phase_mix', applyMix);
on('demo_stop', () => {
// Demo toggle off doesn't wipe the dataset mix — the dataset is
// ground truth, the demo only fakes per-event widgets.
});
on('demo_stop', () => { samples.length = 0; render(); });
setInterval(render, 1000);
})();
// ── Database explorer ─────────────────────────────────────────

View file

@ -243,13 +243,14 @@
<!-- 6. baseline -->
<div class="stage-view" data-view="baseline">
<div class="metric-stack">
<div class="metric-eyebrow">phase mix · last 5 min</div>
<div class="metric-eyebrow" id="phase-mix-eyebrow">phase mix · sampling dataset…</div>
<div class="phase-stack" id="phase-stack"></div>
<div class="phase-legend" id="phase-legend"></div>
<div class="metric-sub">awaiting <code>phase</code> events from
the orchestrator. A clean fleet sits mostly in
<code>clean</code>; skew toward <code>infecting</code> means
the workload is firing.</div>
<div class="metric-sub" id="phase-mix-sub">computing the phase
distribution across a random sample of episodes on disk.
A clean fleet sits mostly in <code>clean</code>; skew toward
<code>infecting</code> / <code>infected_running</code>
reflects time spent under attack workloads.</div>
</div>
</div>
@ -528,6 +529,6 @@
</article>
</div>
<script src="/static/dashboard.js?v=246d8985"></script>
<script src="/static/dashboard.js?v=a087e0a4"></script>
</body>
</html>