PIPELINE §5 step 5: collector admission emit tests (§4.4)

Adds the missing emit-tests so every collector in KNOWN_COLLECTORS has end-to-end coverage: * test_proc_emits_rows_against_self_pid Samples /proc/<own pid> for ~0.6s. Asserts ≥3 rows + populated core fields (cpu_user_jiffies, rss_bytes, vsize_bytes). Works anywhere with /proc. * test_pcap_bucketize_emits_rows_from_synthetic_capture Builds a 2-packet Ethernet+IPv4+TCP pcap in-memory, feeds it to pcap.bucketize, asserts ≥1 row written + total packet count across buckets matches input. Covers BOTH the pcap and netflow collectors (netflow IS the bucketized pcap output). * test_every_known_collector_has_emit_coverage Cross-cutting tripwire: for every name in KNOWN_COLLECTORS, either there's a test_collectors_emit.py test or there's an explicit COLLECTOR_TEST_CARVE_OUTS entry. Adding a collector to KNOWN_COLLECTORS without an emit test fails this. Carve-outs today: qmp (covered by tests/test_qmp.py — needs running QEMU for real-binary emit) and guest_agent (covered by tests/test_guest_agent.py — needs a real VM with the agent baked in). The carve-outs are explicit, not implicit. A drift where someone adds a new collector without a real-binary emit test fails CI before the manifest can include it. 272 tests passing. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 01:37:40 -05:00 · 2026-05-04 01:37:40 -05:00 · 0d51b9b253
commit 0d51b9b253
parent 22269e175d
1 changed files with 169 additions and 0 deletions
--- a/tests/test_collectors_emit.py
+++ b/tests/test_collectors_emit.py
@ -143,6 +143,175 @@ def test_perf_emits_rows_against_busy_pid(tmp_path: Path) -> None:
 # ---------------------------------------------------------------------------


+# ---------------------------------------------------------------------------
+# proc — must emit ≥1 row when run against a live PID
+# ---------------------------------------------------------------------------
+
+
+def test_proc_emits_rows_against_self_pid(tmp_path: Path) -> None:
+    """The /proc collector samples a live PID. Anyone running this test
+    has a /proc/<pid>/stat for their own process, so this works in any
+    environment that has /proc — Linux, including CI containers."""
+    from collectors import proc_qemu
+    out = tmp_path / "telemetry-proc.jsonl"
+    rows = _run_collector_briefly(
+        proc_qemu.run_loop,
+        seconds=0.6,
+        pid=os.getpid(),
+        output_path=out,
+        t_mono_origin_ns=0,
+        interval_ms=100,
+    )
+    assert rows >= 3, (
+        f"proc collector wrote {rows} rows for ~5 expected ticks "
+        f"in 600ms. PIPELINE.md §4.4."
+    )
+    parsed = [json.loads(l) for l in out.read_text().splitlines()]
+    assert all(r["source"] == "host_proc" for r in parsed)
+    # Every row must have non-None values for the core fields — those
+    # are direct /proc reads that always succeed for a live PID.
+    for r in parsed:
+        assert r["cpu_user_jiffies"] is not None
+        assert r["rss_bytes"] is not None
+        assert r["vsize_bytes"] is not None
+
+
+# ---------------------------------------------------------------------------
+# pcap + netflow — bucketize a synthesized minimal pcap
+# ---------------------------------------------------------------------------
+
+
+def _build_minimal_pcap(path: Path) -> None:
+    """Write a tiny pcap with two TCP packets so bucketize() has
+    something to count. Avoids needing tcpdump in CI."""
+    import struct
+    # pcap global header — microsec resolution, link-type 1 (Ethernet).
+    hdr = struct.pack("<IHHiIII",
+                      0xa1b2c3d4,  # magic
+                      2, 4,        # version major.minor
+                      0, 0,        # thiszone, sigfigs
+                      65535,       # snaplen
+                      1)           # linktype: ethernet
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("wb") as f:
+        f.write(hdr)
+        # Two synthetic frames: Ethernet + IPv4 + TCP. Keep them short
+        # but well-formed enough that the collector's parser walks them.
+        for ts_us in (1_000_000, 1_050_000):
+            # 14B Ethernet (src + dst MAC + ethertype 0x0800)
+            eth = (b"\x00\x11\x22\x33\x44\x55"
+                   b"\x66\x77\x88\x99\xaa\xbb"
+                   b"\x08\x00")
+            # 20B IPv4 — TCP (proto 6) from 10.200.0.1 → 10.200.0.10.
+            ipv4 = struct.pack(
+                ">BBHHHBBHII",
+                0x45, 0, 40,        # ver/ihl, tos, total len
+                0x1234, 0,          # id, flags+frag
+                64, 6, 0,           # ttl, proto=tcp, checksum (skip)
+                int.from_bytes(b"\x0a\xc8\x00\x01", "big"),  # 10.200.0.1
+                int.from_bytes(b"\x0a\xc8\x00\x0a", "big"),  # 10.200.0.10
+            )
+            # 20B TCP — sport 1234, dport 80, ACK flag
+            tcp = struct.pack(
+                ">HHIIBBHHH",
+                1234, 80,
+                0, 0, 0x50, 0x10, 65535, 0, 0,
+            )
+            payload = eth + ipv4 + tcp
+            rec_hdr = struct.pack(
+                "<IIII",
+                ts_us // 1_000_000,
+                ts_us % 1_000_000,
+                len(payload), len(payload),
+            )
+            f.write(rec_hdr + payload)
+
+
+def test_pcap_bucketize_emits_rows_from_synthetic_capture(tmp_path: Path) -> None:
+    """bucketize is the netflow collector. With a synthesized
+    Ethernet+IPv4+TCP pcap, it MUST emit at least one bucket row.
+
+    Refs: PIPELINE.md §4.4 — netflow must produce ≥1 row against a
+    realistic packet capture. Synthesizing the pcap keeps the test
+    hermetic (no tcpdump or live interface needed in CI)."""
+    from collectors import pcap as pcap_collector
+    pcap_path = tmp_path / "network.pcap"
+    _build_minimal_pcap(pcap_path)
+    netflow_path = tmp_path / "netflow.jsonl"
+    rows = pcap_collector.bucketize(
+        pcap_path, netflow_path,
+        bucket_ms=100,
+        t_mono_origin_ns=0,
+        bridge_ip="10.200.0.1",
+    )
+    assert rows >= 1, (
+        f"netflow bucketize wrote {rows} rows for a 2-packet pcap. "
+        f"PIPELINE.md §4.4."
+    )
+    parsed = [json.loads(l) for l in netflow_path.read_text().splitlines()]
+    assert parsed
+    # The aggregator must have counted both packets — over the 50ms
+    # span of our synthetic capture, packet count should be > 0.
+    total_pkts = sum(r.get("pkts_in", 0) + r.get("pkts_out", 0)
+                     for r in parsed)
+    assert total_pkts == 2, f"expected 2 packets across all buckets, got {total_pkts}"
+
+
+# ---------------------------------------------------------------------------
+# Cross-cutting: every name in KNOWN_COLLECTORS must be exercised by
+# either an emit-test here or a documented carve-out below.
+# ---------------------------------------------------------------------------
+
+
+# Carve-outs: collectors that need a running guest VM to exercise
+# end-to-end. Each entry MUST cite the unit-level test that exercises
+# its parser/protocol code and the specific reason a real-binary emit
+# test isn't tractable in CI.
+COLLECTOR_TEST_CARVE_OUTS = {
+    "qmp": (
+        "tests/test_qmp.py — exercises the QMP wire parser against "
+        "captured query-status / query-blockstats fixtures. Real-binary "
+        "emit needs a running QEMU; covered in production by the "
+        "Tier-3 emit-test on lab hosts."
+    ),
+    "guest_agent": (
+        "tests/test_guest_agent.py — exercises run_loop with a fake "
+        "agent unix-socket server feeding JSONL. Real-VM emit needs "
+        "a guest with the agent baked in (cidata path, covered by "
+        "production data on lab hosts)."
+    ),
+}
+
+
+def test_every_known_collector_has_emit_coverage() -> None:
+    """For each collector name in KNOWN_COLLECTORS, either there's a
+    test_collectors_emit.py emit test against a real binary, or there's
+    an explicit carve-out in COLLECTOR_TEST_CARVE_OUTS naming the
+    parser-level test that covers it. Adding a new collector without
+    either fails this assertion."""
+    from orchestrator.manifest import KNOWN_COLLECTORS
+    test_src = Path(__file__).read_text()
+    expected_emit_test_names = {
+        "proc": "test_proc_emits_rows_against",
+        "perf": "test_perf_emits_rows_against",
+        "pcap": "test_pcap_bucketize_emits_rows",
+        "netflow": "test_pcap_bucketize_emits_rows",
+    }
+    for collector in KNOWN_COLLECTORS:
+        if collector in COLLECTOR_TEST_CARVE_OUTS:
+            continue
+        needle = expected_emit_test_names.get(collector)
+        assert needle is not None, (
+            f"collector {collector!r} in KNOWN_COLLECTORS has no entry in "
+            f"expected_emit_test_names and no COLLECTOR_TEST_CARVE_OUTS "
+            f"entry. Either add a test or document the carve-out."
+        )
+        assert needle in test_src, (
+            f"collector {collector!r} expected an emit-test name "
+            f"containing {needle!r} in this file. PIPELINE.md §4.4."
+        )
+
+
 def test_run_tier3_demo_wires_collector_sockets_into_episode_config() -> None:
    """`run_tier3_demo.py` must pass qmp_socket / guest_agent_socket /
    bridge_iface to EpisodeConfig the same way `run_real_vm_demo.py`