perf: emit per-episode lifecycle events; emit row even with empty agg
Validation on k-gamingcom (commit ac7b85f) showed perf enabled in
production but rows_perf=0 on every episode. Without lifecycle events
the failure mode is indistinguishable from "perf wasn't enabled" — §1
silent-downgrade. The events now surface the actual cause:
- perf_unavailable — binary missing OR launch failed (with reason)
- perf_started — perf is running (pid, events, interval)
- perf_first_row — first row written; counters_populated tells
whether any event was actually counted
- perf_finished — final tally (intervals_seen,
intervals_with_values)
- perf_no_counters — perf was alive but every interval came back
<not counted> (likely paranoid > 2 or PID
ownership mismatch)
`_flush()` now writes a row whenever an interval is observed, even
when every event was <not counted>. The all-None row is honest data
("perf observed this interval and counted nothing"), and the rows
become a count of observed intervals rather than a count of
successful measurements — distinct from rows_proc / rows_qmp which
do count successful measurements. Trainers filter on
`cycles is not None` etc. when they need only populated rows.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0390eb20b6
commit
dac03d2eff
2 changed files with 59 additions and 3 deletions
|
|
@ -120,11 +120,21 @@ def run_loop(
|
||||||
stop_event: threading.Event,
|
stop_event: threading.Event,
|
||||||
*,
|
*,
|
||||||
events: tuple[str, ...] = DEFAULT_EVENTS,
|
events: tuple[str, ...] = DEFAULT_EVENTS,
|
||||||
|
emit_event: "callable | None" = None,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Spawn perf stat -j against ``pid`` and stream rows until stop.
|
"""Spawn perf stat -j against ``pid`` and stream rows until stop.
|
||||||
Returns the number of rows written."""
|
Returns the number of rows written.
|
||||||
|
|
||||||
|
When ``emit_event`` is provided, perf lifecycle markers
|
||||||
|
(perf_unavailable / perf_started / perf_first_row / perf_no_counters)
|
||||||
|
are written into the orchestrator's events.jsonl. Without this, a
|
||||||
|
silent perf produces only a `rows_perf=0` count in meta.json which
|
||||||
|
is indistinguishable from "perf was never enabled" — the §1
|
||||||
|
silent-downgrade pattern."""
|
||||||
if not perf_available():
|
if not perf_available():
|
||||||
log.warning("perf binary not on PATH — perf collector disabled")
|
log.warning("perf binary not on PATH — perf collector disabled")
|
||||||
|
if emit_event is not None:
|
||||||
|
emit_event("perf_unavailable", reason="binary_not_on_path")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# perf stat writes its output (including -j JSON) to stderr by
|
# perf stat writes its output (including -j JSON) to stderr by
|
||||||
|
|
@ -152,20 +162,43 @@ def run_loop(
|
||||||
)
|
)
|
||||||
except (FileNotFoundError, PermissionError) as e:
|
except (FileNotFoundError, PermissionError) as e:
|
||||||
log.warning("perf launch failed: %s", e)
|
log.warning("perf launch failed: %s", e)
|
||||||
|
if emit_event is not None:
|
||||||
|
emit_event("perf_unavailable", reason=f"launch_failed: {e}")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
if emit_event is not None:
|
||||||
|
emit_event("perf_started", pid=pid, events=list(events),
|
||||||
|
interval_ms=interval_ms)
|
||||||
|
|
||||||
rows = 0
|
rows = 0
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
cur_interval: float | None = None
|
cur_interval: float | None = None
|
||||||
agg: dict[str, int] = {}
|
agg: dict[str, int] = {}
|
||||||
|
intervals_seen = 0
|
||||||
|
intervals_with_values = 0
|
||||||
|
|
||||||
def _flush() -> None:
|
def _flush() -> None:
|
||||||
nonlocal rows
|
nonlocal rows, intervals_seen, intervals_with_values
|
||||||
if cur_interval is None or not agg:
|
if cur_interval is None:
|
||||||
return
|
return
|
||||||
|
intervals_seen += 1
|
||||||
|
# Always emit a row when an interval is observed, even if every
|
||||||
|
# event came back <not counted>. agg-empty means perf saw the
|
||||||
|
# interval but couldn't measure anything (paranoid level too
|
||||||
|
# high, qemu PID owned by different user, hardware counters
|
||||||
|
# unavailable, etc). Emitting None-valued rows distinguishes
|
||||||
|
# "perf was running but counters were unavailable" from "perf
|
||||||
|
# never ran at all" — which is the §1 visibility requirement
|
||||||
|
# the silent return-without-write was violating.
|
||||||
|
if agg:
|
||||||
|
intervals_with_values += 1
|
||||||
row = _build_row(t_mono_origin_ns, cur_interval, agg)
|
row = _build_row(t_mono_origin_ns, cur_interval, agg)
|
||||||
out_f.write(json.dumps(row) + "\n")
|
out_f.write(json.dumps(row) + "\n")
|
||||||
rows += 1
|
rows += 1
|
||||||
|
if (rows == 1) and emit_event is not None:
|
||||||
|
emit_event("perf_first_row",
|
||||||
|
counters_populated=bool(agg),
|
||||||
|
counters=list(agg.keys()))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with output_path.open("a", buffering=1) as out_f:
|
with output_path.open("a", buffering=1) as out_f:
|
||||||
|
|
@ -201,6 +234,24 @@ def run_loop(
|
||||||
agg[event_name] = value
|
agg[event_name] = value
|
||||||
# End of stream — flush the last partial row.
|
# End of stream — flush the last partial row.
|
||||||
_flush()
|
_flush()
|
||||||
|
if emit_event is not None:
|
||||||
|
emit_event(
|
||||||
|
"perf_finished",
|
||||||
|
rows=rows,
|
||||||
|
intervals_seen=intervals_seen,
|
||||||
|
intervals_with_values=intervals_with_values,
|
||||||
|
)
|
||||||
|
if intervals_seen > 0 and intervals_with_values == 0:
|
||||||
|
# Loud signal: perf was running, intervals ticked, but
|
||||||
|
# every event came back <not counted>. Likely cause:
|
||||||
|
# perf_event_paranoid > 2, or qemu PID owned by a
|
||||||
|
# different user than the perf collector.
|
||||||
|
emit_event(
|
||||||
|
"perf_no_counters",
|
||||||
|
intervals_seen=intervals_seen,
|
||||||
|
likely_cause=("perf_event_paranoid > 2 or "
|
||||||
|
"qemu PID owned by different user"),
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
if proc.poll() is None:
|
if proc.poll() is None:
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
|
|
|
||||||
|
|
@ -301,6 +301,11 @@ class EpisodeRunner:
|
||||||
t_mono_origin_ns=self._t_mono_origin_ns,
|
t_mono_origin_ns=self._t_mono_origin_ns,
|
||||||
interval_ms=self.cfg.perf_interval_ms,
|
interval_ms=self.cfg.perf_interval_ms,
|
||||||
stop_event=self._stop,
|
stop_event=self._stop,
|
||||||
|
# Per-episode visibility into perf lifecycle so silent
|
||||||
|
# failures (binary missing, paranoid level too high,
|
||||||
|
# all counters <not counted>) surface in events.jsonl
|
||||||
|
# rather than only as rows_perf=0. Refs §1.
|
||||||
|
emit_event=self.emit_event,
|
||||||
)
|
)
|
||||||
|
|
||||||
threads: list[threading.Thread] = []
|
threads: list[threading.Thread] = []
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue