diff --git a/collectors/perf_qemu.py b/collectors/perf_qemu.py index 17397e2..6e82017 100644 --- a/collectors/perf_qemu.py +++ b/collectors/perf_qemu.py @@ -120,11 +120,21 @@ def run_loop( stop_event: threading.Event, *, events: tuple[str, ...] = DEFAULT_EVENTS, + emit_event: "callable | None" = None, ) -> int: """Spawn perf stat -j against ``pid`` and stream rows until stop. - Returns the number of rows written.""" + Returns the number of rows written. + + When ``emit_event`` is provided, perf lifecycle markers + (perf_unavailable / perf_started / perf_first_row / perf_no_counters) + are written into the orchestrator's events.jsonl. Without this, a + silent perf produces only a `rows_perf=0` count in meta.json which + is indistinguishable from "perf was never enabled" — the §1 + silent-downgrade pattern.""" if not perf_available(): log.warning("perf binary not on PATH — perf collector disabled") + if emit_event is not None: + emit_event("perf_unavailable", reason="binary_not_on_path") return 0 # perf stat writes its output (including -j JSON) to stderr by @@ -152,20 +162,43 @@ def run_loop( ) except (FileNotFoundError, PermissionError) as e: log.warning("perf launch failed: %s", e) + if emit_event is not None: + emit_event("perf_unavailable", reason=f"launch_failed: {e}") return 0 + if emit_event is not None: + emit_event("perf_started", pid=pid, events=list(events), + interval_ms=interval_ms) + rows = 0 output_path.parent.mkdir(parents=True, exist_ok=True) cur_interval: float | None = None agg: dict[str, int] = {} + intervals_seen = 0 + intervals_with_values = 0 def _flush() -> None: - nonlocal rows - if cur_interval is None or not agg: + nonlocal rows, intervals_seen, intervals_with_values + if cur_interval is None: return + intervals_seen += 1 + # Always emit a row when an interval is observed, even if every + # event came back . agg-empty means perf saw the + # interval but couldn't measure anything (paranoid level too + # high, qemu PID owned by different user, hardware counters + # unavailable, etc). Emitting None-valued rows distinguishes + # "perf was running but counters were unavailable" from "perf + # never ran at all" — which is the §1 visibility requirement + # the silent return-without-write was violating. + if agg: + intervals_with_values += 1 row = _build_row(t_mono_origin_ns, cur_interval, agg) out_f.write(json.dumps(row) + "\n") rows += 1 + if (rows == 1) and emit_event is not None: + emit_event("perf_first_row", + counters_populated=bool(agg), + counters=list(agg.keys())) try: with output_path.open("a", buffering=1) as out_f: @@ -201,6 +234,24 @@ def run_loop( agg[event_name] = value # End of stream — flush the last partial row. _flush() + if emit_event is not None: + emit_event( + "perf_finished", + rows=rows, + intervals_seen=intervals_seen, + intervals_with_values=intervals_with_values, + ) + if intervals_seen > 0 and intervals_with_values == 0: + # Loud signal: perf was running, intervals ticked, but + # every event came back . Likely cause: + # perf_event_paranoid > 2, or qemu PID owned by a + # different user than the perf collector. + emit_event( + "perf_no_counters", + intervals_seen=intervals_seen, + likely_cause=("perf_event_paranoid > 2 or " + "qemu PID owned by different user"), + ) finally: if proc.poll() is None: proc.terminate() diff --git a/orchestrator/episode.py b/orchestrator/episode.py index ac493bf..3ce1c8e 100644 --- a/orchestrator/episode.py +++ b/orchestrator/episode.py @@ -301,6 +301,11 @@ class EpisodeRunner: t_mono_origin_ns=self._t_mono_origin_ns, interval_ms=self.cfg.perf_interval_ms, stop_event=self._stop, + # Per-episode visibility into perf lifecycle so silent + # failures (binary missing, paranoid level too high, + # all counters ) surface in events.jsonl + # rather than only as rows_perf=0. Refs §1. + emit_event=self.emit_event, ) threads: list[threading.Thread] = []