baseline: phase mix from sampled dataset, not 5-min window
The widget was waiting on live `phase` events that don't flow when no orchestrator is running, so it sat empty. Replace the rolling 5-minute window with a periodic feeder that samples 500 random episode tarballs from /var/lib/cis490/episodes, extracts each labels.jsonl, and aggregates phase durations using consecutive t_mono_ns deltas. Result lands in broadcaster.state["phase_mix"] (survives snapshot cycles via dict.update) and re-broadcasts every ~10 min. Frontend reads phase_mix from snapshot on connect and from live phase_mix events on refresh; the bar uses time-weighted proportions when available (falls back to label counts), and only sums canonical phases for the denominator so non-displayed `failed` records don't shrink the visible bars. Eyebrow and sub-line update with live sample/population/label counts. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ac9b5b6f07
commit
51f2437b71
3 changed files with 260 additions and 23 deletions
|
|
@ -23,6 +23,9 @@ import asyncio
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import subprocess
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Awaitable, Callable
|
||||
|
||||
|
|
@ -252,7 +255,9 @@ async def snapshot_loop(
|
|||
snap = await asyncio.to_thread(
|
||||
_snapshot_state, data_root, index_path, alerts_path
|
||||
)
|
||||
broadcaster.state = snap
|
||||
# `update` instead of `=` so out-of-band keys (phase_mix,
|
||||
# anything future loops add) survive snapshot cycles.
|
||||
broadcaster.state.update(snap)
|
||||
if first:
|
||||
log.info(
|
||||
"snapshot: total_episodes=%d total_alerts=%d hosts=%d",
|
||||
|
|
@ -267,6 +272,205 @@ async def snapshot_loop(
|
|||
await asyncio.sleep(poll_interval)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
# Phase mix (dataset)
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
#
|
||||
# The baseline scene shows the proportion of time the workload spent
|
||||
# in each labelled phase. Originally a rolling 5-min window of live
|
||||
# `phase` events — but live events only flow when the orchestrator is
|
||||
# running. To keep the slide reflecting *actual data* we sample N
|
||||
# random episode tarballs on disk, extract the labels.jsonl from
|
||||
# each (which is a list of phase-transition events stamped with
|
||||
# t_mono_ns), and aggregate phase durations across the sample.
|
||||
|
||||
PHASE_MIX_SAMPLE = 500
|
||||
PHASE_MIX_INTERVAL = 600.0 # seconds; ~10 min
|
||||
|
||||
|
||||
def _read_episode_labels(path: Path, *, timeout: float = 10.0) -> list[dict] | None:
|
||||
"""Stream-extract labels.jsonl from an episode tarball.
|
||||
|
||||
Uses ``zstd -dc | tarfile r|`` so we can break out of the stream as
|
||||
soon as labels.jsonl appears (it's near the front of the tar) and
|
||||
kill the zstd subprocess immediately after — avoiding a full
|
||||
decompress when we only need ~10 lines.
|
||||
"""
|
||||
if not path.is_file():
|
||||
return None
|
||||
try:
|
||||
proc = subprocess.Popen(
|
||||
["zstd", "-dc", str(path)],
|
||||
stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
|
||||
)
|
||||
except OSError:
|
||||
return None
|
||||
out: list[dict] | None = None
|
||||
try:
|
||||
with tarfile.open(fileobj=proc.stdout, mode="r|") as tar:
|
||||
for member in tar:
|
||||
if not member.isfile():
|
||||
continue
|
||||
name = member.name.rsplit("/", 1)[-1]
|
||||
if name == "labels.jsonl":
|
||||
f = tar.extractfile(member)
|
||||
if f is None:
|
||||
continue
|
||||
data = f.read()
|
||||
out = []
|
||||
for line in data.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
out.append(json.loads(line))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
break
|
||||
except (tarfile.TarError, OSError):
|
||||
out = None
|
||||
finally:
|
||||
if proc.stdout:
|
||||
try: proc.stdout.close()
|
||||
except Exception: pass
|
||||
try: proc.kill() # short-circuit zstd if labels was the only file we needed
|
||||
except Exception: pass
|
||||
try: proc.wait(timeout=timeout)
|
||||
except Exception: pass
|
||||
return out
|
||||
|
||||
|
||||
def _aggregate_episode_labels(
|
||||
records: list[dict],
|
||||
counts: dict[str, int],
|
||||
weighted: dict[str, float],
|
||||
) -> int:
|
||||
"""Fold one episode's labels into running counters.
|
||||
|
||||
Each record is a phase-transition event with ``t_mono_ns``. The
|
||||
duration of phase ``rec[i].phase`` is ``t_mono_ns[i+1] -
|
||||
t_mono_ns[i]``; the trailing record gets a count bump but no
|
||||
duration weight (we don't know its end without meta.json, and the
|
||||
last phase is usually short ``infected_running`` cleanup so the
|
||||
bias is acceptable for proportional display).
|
||||
"""
|
||||
if not records:
|
||||
return 0
|
||||
rec = sorted(records, key=lambda r: r.get("t_mono_ns") or 0)
|
||||
n_labels = 0
|
||||
if len(rec) == 1:
|
||||
p = rec[0].get("phase")
|
||||
if p:
|
||||
counts[p] = counts.get(p, 0) + 1
|
||||
n_labels = 1
|
||||
return n_labels
|
||||
for i in range(len(rec) - 1):
|
||||
p = rec[i].get("phase")
|
||||
if not p:
|
||||
continue
|
||||
t0 = rec[i].get("t_mono_ns")
|
||||
t1 = rec[i + 1].get("t_mono_ns")
|
||||
if not (isinstance(t0, (int, float)) and isinstance(t1, (int, float))):
|
||||
continue
|
||||
dur_s = max(0.0, (float(t1) - float(t0)) / 1e9)
|
||||
weighted[p] = weighted.get(p, 0.0) + dur_s
|
||||
counts[p] = counts.get(p, 0) + 1
|
||||
n_labels += 1
|
||||
tail = rec[-1].get("phase")
|
||||
if tail:
|
||||
counts[tail] = counts.get(tail, 0) + 1
|
||||
n_labels += 1
|
||||
return n_labels
|
||||
|
||||
|
||||
def _compute_phase_mix(data_root: Path, sample: int = PHASE_MIX_SAMPLE) -> dict:
    """Aggregate phase statistics over a random sample of episode tarballs.

    Scans ``<data_root>/episodes/<host>/*.tar.zst``, picks up to ``sample``
    files at random (all of them when ``sample`` covers the population),
    and folds each episode's labels into shared counters.

    Args:
        data_root: Directory that contains the ``episodes`` tree.
        sample: Maximum number of tarballs to read.

    Returns:
        ``{}`` when the ``episodes`` directory is missing, cannot be
        listed, or holds no ``.tar.zst`` files. Otherwise a dict with
        ``counts``, ``weighted_seconds``, ``sampled_episodes`` (episodes
        that contributed at least one label), ``population_episodes``
        (tarballs found on disk) and ``total_labels``.
    """
    episodes_root = data_root / "episodes"
    if not episodes_root.is_dir():
        return {}

    tarballs: list[Path] = []
    try:
        for host_dir in episodes_root.iterdir():
            if not host_dir.is_dir():
                continue
            try:
                for entry in host_dir.iterdir():
                    if not entry.is_file():
                        continue
                    if entry.name.endswith(".tar.zst"):
                        tarballs.append(entry)
            except OSError:
                # An unreadable host directory is skipped; keep whatever
                # was collected from it before the failure.
                continue
    except OSError:
        return {}

    population = len(tarballs)
    if population == 0:
        return {}

    if sample >= population:
        chosen = tarballs
    else:
        chosen = random.sample(tarballs, sample)

    counts: dict[str, int] = {}
    weighted: dict[str, float] = {}
    sampled_episodes = 0
    total_labels = 0
    for tarball in chosen:
        labels = _read_episode_labels(tarball)
        if labels:
            added = _aggregate_episode_labels(labels, counts, weighted)
            if added:
                sampled_episodes += 1
                total_labels += added

    return {
        "counts": counts,
        "weighted_seconds": weighted,
        "sampled_episodes": sampled_episodes,
        "population_episodes": population,
        "total_labels": total_labels,
    }
|
||||
|
||||
|
||||
async def phase_mix_loop(
    broadcaster,
    *,
    data_root: Path,
    poll_interval: float = PHASE_MIX_INTERVAL,
    sample: int = PHASE_MIX_SAMPLE,
) -> None:
    """Periodically recompute the dataset phase mix and broadcast it.

    Each cycle runs ``_compute_phase_mix`` in a worker thread via
    ``asyncio.to_thread`` so the blocking decompress/parse work stays off
    the event loop. A non-empty result is stored under
    ``broadcaster.state["phase_mix"]`` and published as a ``phase_mix``
    event; an empty result is only logged. The loop then sleeps for
    ``poll_interval`` seconds and repeats until the task is cancelled.

    Args:
        broadcaster: Object exposing a ``state`` mapping and an awaitable
            ``publish(event)`` method.
        data_root: Directory passed through to ``_compute_phase_mix``.
        poll_interval: Seconds to sleep between recomputations.
        sample: Number of episodes to sample per recomputation.
    """
    while True:
        try:
            mix = await asyncio.to_thread(_compute_phase_mix, data_root, sample)
            if not mix:
                log.info("phase_mix: no episodes on disk yet, retrying")
            else:
                broadcaster.state["phase_mix"] = mix
                event = {"type": "phase_mix", **mix}
                await broadcaster.publish(event)
                rounded = {
                    phase: round(seconds, 1)
                    for phase, seconds in mix.get("weighted_seconds", {}).items()
                }
                log.info(
                    "phase_mix: %d/%d episodes sampled, %d labels, "
                    "weighted=%s",
                    mix.get("sampled_episodes", 0),
                    mix.get("population_episodes", 0),
                    mix.get("total_labels", 0),
                    rounded,
                )
        except asyncio.CancelledError:
            # Cancellation must propagate so the task can shut down.
            raise
        except Exception:
            # Any other failure is logged and retried on the next cycle.
            log.exception("phase_mix_loop error")
        await asyncio.sleep(poll_interval)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
# Lifecycle
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -288,5 +492,8 @@ def start_feeders(broadcaster, *, data_root: Path = DEFAULT_DATA_ROOT) -> list[a
|
|||
asyncio.create_task(
|
||||
watch_alerts_jsonl(publish, alerts_path),
|
||||
name="cis490.feeder.alerts"),
|
||||
asyncio.create_task(
|
||||
phase_mix_loop(broadcaster, data_root=data_root),
|
||||
name="cis490.feeder.phase_mix"),
|
||||
]
|
||||
return tasks
|
||||
|
|
|
|||
|
|
@ -1132,14 +1132,19 @@ for epoch in range(20):
|
|||
});
|
||||
})();
|
||||
|
||||
// ── Phase mix (rolling 5 min) ─────────────────────────────────
|
||||
// Real-data widget. Will be empty until phase events flow.
|
||||
// ── Phase mix (dataset-derived) ───────────────────────────────
|
||||
// Real-data widget. Driven by the dashboard's `phase_mix` feeder,
|
||||
// which periodically samples random episode tarballs on disk and
|
||||
// aggregates labels.jsonl phase durations across them. The feeder
|
||||
// tucks the result into `broadcaster.state["phase_mix"]` so it
|
||||
// arrives in the snapshot the WS sends on connect, and republishes
|
||||
// a `phase_mix` event each time it recomputes.
|
||||
(function () {
|
||||
const stack = document.getElementById('phase-stack');
|
||||
const legend = document.getElementById('phase-legend');
|
||||
const eyebrow = document.getElementById('phase-mix-eyebrow');
|
||||
const sub = document.getElementById('phase-mix-sub');
|
||||
const PHASES = ['clean', 'armed', 'infecting', 'infected_running', 'dormant'];
|
||||
const WINDOW_MS = 5 * 60 * 1000;
|
||||
const samples = [];
|
||||
const segs = new Map();
|
||||
|
||||
PHASES.forEach(p => {
|
||||
|
|
@ -1151,21 +1156,45 @@ for epoch in range(20):
|
|||
legend.appendChild(li);
|
||||
});
|
||||
|
||||
function render() {
|
||||
const now = Date.now();
|
||||
while (samples.length && now - samples[0].t > WINDOW_MS) samples.shift();
|
||||
const counts = Object.fromEntries(PHASES.map(p => [p, 0]));
|
||||
samples.forEach(s => { if (counts[s.phase] !== undefined) counts[s.phase]++; });
|
||||
const total = Math.max(1, samples.length);
|
||||
PHASES.forEach(p => { segs.get(p).style.flexGrow = (counts[p] / total).toFixed(4); });
|
||||
function fmtInt(n) { return (typeof n === 'number') ? n.toLocaleString() : '—'; }
|
||||
|
||||
function applyMix(mix) {
|
||||
if (!mix) return;
|
||||
const w = mix.weighted_seconds || {};
|
||||
const c = mix.counts || {};
|
||||
// Prefer time-weighted proportions; fall back to label counts.
|
||||
const useWeighted = Object.values(w).some(v => v > 0);
|
||||
const src = useWeighted ? w : c;
|
||||
// Sum only the canonical phases so non-displayed phases (e.g.
|
||||
// `failed` from the orchestrator) don't shrink the visible bars.
|
||||
const total = PHASES.reduce((a, p) => a + (src[p] || 0), 0) || 1;
|
||||
PHASES.forEach(p => {
|
||||
segs.get(p).style.flexGrow = ((src[p] || 0) / total).toFixed(4);
|
||||
});
|
||||
if (eyebrow) {
|
||||
const tag = useWeighted ? 'time-weighted' : 'label-count';
|
||||
eyebrow.textContent =
|
||||
`phase mix · ${fmtInt(mix.sampled_episodes)} of ${fmtInt(mix.population_episodes)} episodes · ${tag}`;
|
||||
}
|
||||
if (sub) {
|
||||
const hours = useWeighted
|
||||
? Math.round(PHASES.reduce((a, p) => a + (w[p] || 0), 0) / 3600)
|
||||
: null;
|
||||
sub.innerHTML =
|
||||
`Aggregated across <strong>${fmtInt(mix.sampled_episodes)}</strong> ` +
|
||||
`randomly-sampled episodes ` +
|
||||
`(<strong>${fmtInt(mix.total_labels)}</strong> phase records` +
|
||||
(hours != null ? `, ~<strong>${fmtInt(hours)}</strong> hours` : '') +
|
||||
`). Refreshes every ~10 min from disk.`;
|
||||
}
|
||||
}
|
||||
|
||||
on('phase', m => {
|
||||
if (!m.phase) return;
|
||||
samples.push({ phase: m.phase, t: Date.now() }); render();
|
||||
on('snapshot', m => { if (m.phase_mix) applyMix(m.phase_mix); });
|
||||
on('phase_mix', applyMix);
|
||||
on('demo_stop', () => {
|
||||
// Demo toggle off doesn't wipe the dataset mix — the dataset is
|
||||
// ground truth, the demo only fakes per-event widgets.
|
||||
});
|
||||
on('demo_stop', () => { samples.length = 0; render(); });
|
||||
setInterval(render, 1000);
|
||||
})();
|
||||
|
||||
// ── Database explorer ─────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -243,13 +243,14 @@
|
|||
<!-- 6. baseline -->
|
||||
<div class="stage-view" data-view="baseline">
|
||||
<div class="metric-stack">
|
||||
<div class="metric-eyebrow">phase mix · last 5 min</div>
|
||||
<div class="metric-eyebrow" id="phase-mix-eyebrow">phase mix · sampling dataset…</div>
|
||||
<div class="phase-stack" id="phase-stack"></div>
|
||||
<div class="phase-legend" id="phase-legend"></div>
|
||||
<div class="metric-sub">awaiting <code>phase</code> events from
|
||||
the orchestrator. A clean fleet sits mostly in
|
||||
<code>clean</code>; skew toward <code>infecting</code> means
|
||||
the workload is firing.</div>
|
||||
<div class="metric-sub" id="phase-mix-sub">computing the phase
|
||||
distribution across a random sample of episodes on disk.
|
||||
A clean fleet sits mostly in <code>clean</code>; skew toward
|
||||
<code>infecting</code> / <code>infected_running</code>
|
||||
reflects time spent under attack workloads.</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
@ -528,6 +529,6 @@
|
|||
</article>
|
||||
</div>
|
||||
|
||||
<script src="/static/dashboard.js?v=246d8985"></script>
|
||||
<script src="/static/dashboard.js?v=a087e0a4"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue