perf: emit per-episode lifecycle events; emit row even with empty agg

Validation on k-gamingcom (commit ac7b85f) showed perf enabled in production but rows_perf=0 on every episode. Without lifecycle events the failure mode is indistinguishable from "perf wasn't enabled" — §1 silent-downgrade. The events now surface the actual cause: - perf_unavailable — binary missing OR launch failed (with reason) - perf_started — perf is running (pid, events, interval) - perf_first_row — first row written; counters_populated tells whether any event was actually counted - perf_finished — final tally (intervals_seen, intervals_with_values) - perf_no_counters — perf was alive but every interval came back <not counted> (likely paranoid > 2 or PID ownership mismatch) `_flush()` now writes a row whenever an interval is observed, even when every event was <not counted>. The all-None row is honest data ("perf observed this interval and counted nothing"), and the rows become a count of observed intervals rather than a count of successful measurements — distinct from rows_proc / rows_qmp which do count successful measurements. Trainers filter on `cycles is not None` etc. when they need only populated rows. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 18:08:42 -05:00 · 2026-05-03 18:08:42 -05:00 · dac03d2eff
commit dac03d2eff
parent 0390eb20b6
2 changed files with 59 additions and 3 deletions
--- a/collectors/perf_qemu.py
+++ b/collectors/perf_qemu.py
@ -120,11 +120,21 @@ def run_loop(
    stop_event: threading.Event,
    *,
    events: tuple[str, ...] = DEFAULT_EVENTS,
+    emit_event: "callable | None" = None,
 ) -> int:
    """Spawn perf stat -j against ``pid`` and stream rows until stop.
-    Returns the number of rows written."""
+    Returns the number of rows written.
+
+    When ``emit_event`` is provided, perf lifecycle markers
+    (perf_unavailable / perf_started / perf_first_row / perf_no_counters)
+    are written into the orchestrator's events.jsonl. Without this, a
+    silent perf produces only a `rows_perf=0` count in meta.json which
+    is indistinguishable from "perf was never enabled" — the §1
+    silent-downgrade pattern."""
    if not perf_available():
        log.warning("perf binary not on PATH — perf collector disabled")
+        if emit_event is not None:
+            emit_event("perf_unavailable", reason="binary_not_on_path")
        return 0

    # perf stat writes its output (including -j JSON) to stderr by
@ -152,20 +162,43 @@ def run_loop(
        )
    except (FileNotFoundError, PermissionError) as e:
        log.warning("perf launch failed: %s", e)
+        if emit_event is not None:
+            emit_event("perf_unavailable", reason=f"launch_failed: {e}")
        return 0

+    if emit_event is not None:
+        emit_event("perf_started", pid=pid, events=list(events),
+                   interval_ms=interval_ms)
+
    rows = 0
    output_path.parent.mkdir(parents=True, exist_ok=True)
    cur_interval: float | None = None
    agg: dict[str, int] = {}
+    intervals_seen = 0
+    intervals_with_values = 0

    def _flush() -> None:
-        nonlocal rows
-        if cur_interval is None or not agg:
+        nonlocal rows, intervals_seen, intervals_with_values
+        if cur_interval is None:
            return
+        intervals_seen += 1
+        # Always emit a row when an interval is observed, even if every
+        # event came back <not counted>. agg-empty means perf saw the
+        # interval but couldn't measure anything (paranoid level too
+        # high, qemu PID owned by different user, hardware counters
+        # unavailable, etc). Emitting None-valued rows distinguishes
+        # "perf was running but counters were unavailable" from "perf
+        # never ran at all" — which is the §1 visibility requirement
+        # the silent return-without-write was violating.
+        if agg:
+            intervals_with_values += 1
        row = _build_row(t_mono_origin_ns, cur_interval, agg)
        out_f.write(json.dumps(row) + "\n")
        rows += 1
+        if (rows == 1) and emit_event is not None:
+            emit_event("perf_first_row",
+                       counters_populated=bool(agg),
+                       counters=list(agg.keys()))

    try:
        with output_path.open("a", buffering=1) as out_f:
@ -201,6 +234,24 @@ def run_loop(
                    agg[event_name] = value
            # End of stream — flush the last partial row.
            _flush()
+        if emit_event is not None:
+            emit_event(
+                "perf_finished",
+                rows=rows,
+                intervals_seen=intervals_seen,
+                intervals_with_values=intervals_with_values,
+            )
+            if intervals_seen > 0 and intervals_with_values == 0:
+                # Loud signal: perf was running, intervals ticked, but
+                # every event came back <not counted>. Likely cause:
+                # perf_event_paranoid > 2, or qemu PID owned by a
+                # different user than the perf collector.
+                emit_event(
+                    "perf_no_counters",
+                    intervals_seen=intervals_seen,
+                    likely_cause=("perf_event_paranoid > 2 or "
+                                  "qemu PID owned by different user"),
+                )
    finally:
        if proc.poll() is None:
            proc.terminate()
--- a/orchestrator/episode.py
+++ b/orchestrator/episode.py
@ -301,6 +301,11 @@ class EpisodeRunner:
                t_mono_origin_ns=self._t_mono_origin_ns,
                interval_ms=self.cfg.perf_interval_ms,
                stop_event=self._stop,
+                # Per-episode visibility into perf lifecycle so silent
+                # failures (binary missing, paranoid level too high,
+                # all counters <not counted>) surface in events.jsonl
+                # rather than only as rows_perf=0. Refs §1.
+                emit_event=self.emit_event,
            )

        threads: list[threading.Thread] = []