Add v0 orchestrator + first oracle collector (host /proc)
End-to-end: ``python -m orchestrator --target-pid <pid> --duration N`` now
writes a complete episode directory matching docs/data-model.md, with phase
labels, events, and a 10 Hz host /proc telemetry stream. No VM yet — pid is
arbitrary so we can validate the loop against e.g. ``sleep 5`` while the lab
side comes up.
collectors/proc_qemu.py — parses /proc/<pid>/{stat,io,status} (handles parens
in comm), single-shot collect_once(), and a stop-event-driven run_loop()
that ticks at a fixed cadence and exits when the pid disappears. Tagged
``available_in_deployment: false`` per the threat-model doc.
orchestrator/episode.py — EpisodeRunner: creates data/episodes/<ulid>/,
atomic meta.json, events.jsonl + labels.jsonl writers, drives the collector
in a thread for duration_s, writes done.marker last so the shipper never
sees a half-finished episode.
orchestrator/ulid.py — tiny 26-char Crockford-base32 ULID generator.
Time-sortable, no third-party dep.
orchestrator/__main__.py — CLI entry point.
Tests (15 new, 28 total green):
- proc_qemu: real-ish stat with parens-in-comm, missing /proc/<pid>/io,
missing pid, run_loop cadence, run_loop terminates when pid disappears.
- episode: full directory shape against os.getpid(), id override,
done.marker written after meta.json finalize.
- ulid: length+alphabet, 2000-burst uniqueness, time-sortability.
Smoke-tested against ``sleep 10``: 16 rows over 1.5s at 100ms cadence,
monotonic clock, RSS stable at ~3.5 MiB as expected for an idle sleep.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
83e111961d
commit
064387b7a0
9 changed files with 780 additions and 0 deletions
0
collectors/__init__.py
Normal file
0
collectors/__init__.py
Normal file
189
collectors/proc_qemu.py
Normal file
189
collectors/proc_qemu.py
Normal file
|
|
@ -0,0 +1,189 @@
|
||||||
|
"""Source 1 (oracle): host /proc/<pid> sampler.
|
||||||
|
|
||||||
|
Polls /proc/<pid>/{stat,io,status} at a fixed interval and emits one JSONL
|
||||||
|
row per tick into the episode telemetry file.
|
||||||
|
|
||||||
|
This source is **oracle-only** — it does not exist on a deployed device, so
|
||||||
|
every row is tagged ``available_in_deployment: false``. See
|
||||||
|
``docs/threat-model.md``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
log = logging.getLogger("cis490.collectors.proc_qemu")
|
||||||
|
|
||||||
|
SOURCE = "host_proc"
|
||||||
|
AVAILABLE_IN_DEPLOYMENT = False
|
||||||
|
PAGE_SIZE = os.sysconf("SC_PAGESIZE")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class ProcStat:
|
||||||
|
minflt: int
|
||||||
|
majflt: int
|
||||||
|
utime: int # clock ticks (jiffies) in user mode
|
||||||
|
stime: int # clock ticks (jiffies) in kernel mode
|
||||||
|
vsize: int # bytes
|
||||||
|
rss_pages: int # in pages — multiply by PAGE_SIZE
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class ProcIO:
|
||||||
|
read_bytes: int
|
||||||
|
write_bytes: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class ProcStatus:
|
||||||
|
voluntary_ctxsw: int
|
||||||
|
involuntary_ctxsw: int
|
||||||
|
|
||||||
|
|
||||||
|
def parse_proc_stat(data: str) -> ProcStat:
|
||||||
|
"""Parse the contents of /proc/<pid>/stat.
|
||||||
|
|
||||||
|
The ``comm`` field (field 2) can contain spaces and parens, so we anchor
|
||||||
|
the split on the rightmost ')'. After that, fields are positional per
|
||||||
|
``man 5 proc``.
|
||||||
|
"""
|
||||||
|
rparen = data.rindex(")")
|
||||||
|
# Skip ") " after the comm to land on the state field.
|
||||||
|
fields = data[rparen + 2 :].split()
|
||||||
|
# Index in `fields`: man 5 proc field number (1-indexed)
|
||||||
|
# 0 state 3
|
||||||
|
# 1 ppid 4
|
||||||
|
# 2 pgrp 5
|
||||||
|
# 3 session 6
|
||||||
|
# 4 tty_nr 7
|
||||||
|
# 5 tpgid 8
|
||||||
|
# 6 flags 9
|
||||||
|
# 7 minflt 10
|
||||||
|
# 8 cminflt 11
|
||||||
|
# 9 majflt 12
|
||||||
|
# 10 cmajflt 13
|
||||||
|
# 11 utime 14
|
||||||
|
# 12 stime 15
|
||||||
|
# ...
|
||||||
|
# 20 vsize 23
|
||||||
|
# 21 rss (pages) 24
|
||||||
|
return ProcStat(
|
||||||
|
minflt=int(fields[7]),
|
||||||
|
majflt=int(fields[9]),
|
||||||
|
utime=int(fields[11]),
|
||||||
|
stime=int(fields[12]),
|
||||||
|
vsize=int(fields[20]),
|
||||||
|
rss_pages=int(fields[21]),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_proc_io(data: str) -> ProcIO:
|
||||||
|
"""Parse /proc/<pid>/io. Requires same uid (or CAP_SYS_PTRACE) to read."""
|
||||||
|
out = {}
|
||||||
|
for line in data.splitlines():
|
||||||
|
k, _, v = line.partition(":")
|
||||||
|
out[k.strip()] = int(v.strip())
|
||||||
|
return ProcIO(read_bytes=out["read_bytes"], write_bytes=out["write_bytes"])
|
||||||
|
|
||||||
|
|
||||||
|
def parse_proc_status(data: str) -> ProcStatus:
|
||||||
|
"""Parse /proc/<pid>/status — only the two ctxsw fields we care about."""
|
||||||
|
vol = nvol = 0
|
||||||
|
for line in data.splitlines():
|
||||||
|
if line.startswith("voluntary_ctxt_switches:"):
|
||||||
|
vol = int(line.split(":", 1)[1].strip())
|
||||||
|
elif line.startswith("nonvoluntary_ctxt_switches:"):
|
||||||
|
nvol = int(line.split(":", 1)[1].strip())
|
||||||
|
return ProcStatus(voluntary_ctxsw=vol, involuntary_ctxsw=nvol)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text(path: str) -> str | None:
|
||||||
|
try:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
return f.read().decode("ascii", errors="replace")
|
||||||
|
except (FileNotFoundError, ProcessLookupError, PermissionError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def collect_once(
|
||||||
|
pid: int,
|
||||||
|
t_mono_origin_ns: int,
|
||||||
|
*,
|
||||||
|
proc_root: str = "/proc",
|
||||||
|
) -> dict | None:
|
||||||
|
"""One sample. Returns None if the target pid is gone.
|
||||||
|
|
||||||
|
``proc_root`` is overridable for tests against a synthetic /proc tree.
|
||||||
|
"""
|
||||||
|
stat_text = _read_text(f"{proc_root}/{pid}/stat")
|
||||||
|
if stat_text is None:
|
||||||
|
return None
|
||||||
|
stat = parse_proc_stat(stat_text)
|
||||||
|
|
||||||
|
io_text = _read_text(f"{proc_root}/{pid}/io")
|
||||||
|
io = parse_proc_io(io_text) if io_text is not None else None
|
||||||
|
|
||||||
|
status_text = _read_text(f"{proc_root}/{pid}/status")
|
||||||
|
status = parse_proc_status(status_text) if status_text is not None else None
|
||||||
|
|
||||||
|
return {
|
||||||
|
"t_mono_ns": time.monotonic_ns() - t_mono_origin_ns,
|
||||||
|
"t_wall_ns": time.time_ns(),
|
||||||
|
"source": SOURCE,
|
||||||
|
"available_in_deployment": AVAILABLE_IN_DEPLOYMENT,
|
||||||
|
"cpu_user_jiffies": stat.utime,
|
||||||
|
"cpu_sys_jiffies": stat.stime,
|
||||||
|
"rss_bytes": stat.rss_pages * PAGE_SIZE,
|
||||||
|
"vsize_bytes": stat.vsize,
|
||||||
|
"io_read_bytes": io.read_bytes if io is not None else None,
|
||||||
|
"io_write_bytes": io.write_bytes if io is not None else None,
|
||||||
|
"voluntary_ctxsw": status.voluntary_ctxsw if status is not None else None,
|
||||||
|
"involuntary_ctxsw": status.involuntary_ctxsw if status is not None else None,
|
||||||
|
"minor_faults": stat.minflt,
|
||||||
|
"major_faults": stat.majflt,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def run_loop(
|
||||||
|
pid: int,
|
||||||
|
output_path: Path,
|
||||||
|
t_mono_origin_ns: int,
|
||||||
|
interval_ms: int,
|
||||||
|
stop_event: threading.Event,
|
||||||
|
*,
|
||||||
|
proc_root: str = "/proc",
|
||||||
|
) -> int:
|
||||||
|
"""Sample at a fixed cadence until stop_event is set or pid disappears.
|
||||||
|
|
||||||
|
Returns the number of rows written.
|
||||||
|
"""
|
||||||
|
interval_ns = interval_ms * 1_000_000
|
||||||
|
next_tick = time.monotonic_ns()
|
||||||
|
rows = 0
|
||||||
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with output_path.open("a", buffering=1) as f: # line-buffered
|
||||||
|
while not stop_event.is_set():
|
||||||
|
row = collect_once(pid, t_mono_origin_ns, proc_root=proc_root)
|
||||||
|
if row is None:
|
||||||
|
log.info("target pid %d disappeared; stopping", pid)
|
||||||
|
break
|
||||||
|
f.write(json.dumps(row) + "\n")
|
||||||
|
rows += 1
|
||||||
|
|
||||||
|
next_tick += interval_ns
|
||||||
|
sleep_ns = next_tick - time.monotonic_ns()
|
||||||
|
if sleep_ns > 0:
|
||||||
|
# Use the event's wait so SIGTERM/stop is responsive.
|
||||||
|
stop_event.wait(sleep_ns / 1_000_000_000)
|
||||||
|
else:
|
||||||
|
# We are behind schedule; resync.
|
||||||
|
next_tick = time.monotonic_ns()
|
||||||
|
return rows
|
||||||
0
orchestrator/__init__.py
Normal file
0
orchestrator/__init__.py
Normal file
70
orchestrator/__main__.py
Normal file
70
orchestrator/__main__.py
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
"""CLI for the v0 orchestrator: observe a pid for a fixed window."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .episode import EpisodeConfig, EpisodeRunner
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
prog="cis490-orchestrator",
|
||||||
|
description="Run a single episode against a target pid.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--target-pid",
|
||||||
|
type=int,
|
||||||
|
required=True,
|
||||||
|
help="pid to sample (later this will be the qemu-system pid)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--duration",
|
||||||
|
type=float,
|
||||||
|
default=10.0,
|
||||||
|
help="seconds to observe (default 10)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--interval-ms",
|
||||||
|
type=int,
|
||||||
|
default=100,
|
||||||
|
help="sampling interval (default 100ms = 10 Hz)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--data-root",
|
||||||
|
default="data",
|
||||||
|
help="output directory root (default ./data)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--episode-id",
|
||||||
|
default=None,
|
||||||
|
help="override ULID generation (mostly for tests)",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s %(levelname)s %(name)s %(message)s",
|
||||||
|
)
|
||||||
|
|
||||||
|
cfg = EpisodeConfig(
|
||||||
|
target_pid=args.target_pid,
|
||||||
|
duration_s=args.duration,
|
||||||
|
interval_ms=args.interval_ms,
|
||||||
|
data_root=Path(args.data_root),
|
||||||
|
episode_id=args.episode_id,
|
||||||
|
)
|
||||||
|
result = EpisodeRunner(cfg).run()
|
||||||
|
|
||||||
|
print(f"episode_id={result.episode_id}")
|
||||||
|
print(f"path={result.episode_dir}")
|
||||||
|
print(f"rows_proc={result.rows_proc}")
|
||||||
|
print(f"duration_observed_s={result.duration_observed_s:.2f}")
|
||||||
|
return 0 if not result.pid_disappeared else 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
207
orchestrator/episode.py
Normal file
207
orchestrator/episode.py
Normal file
|
|
@ -0,0 +1,207 @@
|
||||||
|
"""EpisodeRunner — minimum-viable single-episode driver.
|
||||||
|
|
||||||
|
This v0 runner does NOT boot a VM yet. It samples a target pid that you
|
||||||
|
provide on the command line, labels the entire window as ``clean``, and
|
||||||
|
writes the full per-episode directory shape from ``docs/data-model.md``:
|
||||||
|
|
||||||
|
data/episodes/<ulid>/
|
||||||
|
meta.json
|
||||||
|
events.jsonl
|
||||||
|
labels.jsonl
|
||||||
|
telemetry-proc.jsonl
|
||||||
|
done.marker
|
||||||
|
|
||||||
|
The point of v0 is to validate the directory shape, the JSONL schemas, the
|
||||||
|
collector loop, and (next step) the shipper, *before* the VM lab is wired
|
||||||
|
up. Once the VM bring-up exists, the runner will be extended with phase
|
||||||
|
transitions (armed → infecting → infected_running → dormant).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from collectors import proc_qemu
|
||||||
|
|
||||||
|
from .ulid import new_ulid
|
||||||
|
|
||||||
|
|
||||||
|
log = logging.getLogger("cis490.orchestrator")
|
||||||
|
|
||||||
|
SCHEMA_VERSION = 1
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class EpisodeConfig:
|
||||||
|
target_pid: int
|
||||||
|
duration_s: float
|
||||||
|
data_root: Path
|
||||||
|
interval_ms: int = 100
|
||||||
|
image_name: str = "(stub)"
|
||||||
|
snapshot_name: str = "(none-stub)"
|
||||||
|
episode_id: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class EpisodeResult:
|
||||||
|
episode_id: str
|
||||||
|
episode_dir: Path
|
||||||
|
rows_proc: int
|
||||||
|
pid_disappeared: bool
|
||||||
|
duration_observed_s: float
|
||||||
|
|
||||||
|
|
||||||
|
class EpisodeRunner:
|
||||||
|
def __init__(self, cfg: EpisodeConfig) -> None:
|
||||||
|
self.cfg = cfg
|
||||||
|
self.episode_id = cfg.episode_id or new_ulid()
|
||||||
|
self.episode_dir: Path = cfg.data_root / "episodes" / self.episode_id
|
||||||
|
self._t_mono_origin_ns: int = 0
|
||||||
|
self._stop = threading.Event()
|
||||||
|
|
||||||
|
# ---- public ---------------------------------------------------------
|
||||||
|
|
||||||
|
def run(self) -> EpisodeResult:
|
||||||
|
self.episode_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._t_mono_origin_ns = time.monotonic_ns()
|
||||||
|
started_at_wall = datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
meta = self._initial_meta(started_at_wall)
|
||||||
|
self._write_meta(meta)
|
||||||
|
|
||||||
|
self._emit_event(0, "snapshot_load", snapshot=self.cfg.snapshot_name)
|
||||||
|
self._emit_label(0, "clean", prev=None, reason="snapshot_loaded")
|
||||||
|
|
||||||
|
rows_holder: dict[str, int] = {"rows": 0}
|
||||||
|
|
||||||
|
def _collector() -> None:
|
||||||
|
rows_holder["rows"] = proc_qemu.run_loop(
|
||||||
|
pid=self.cfg.target_pid,
|
||||||
|
output_path=self.episode_dir / "telemetry-proc.jsonl",
|
||||||
|
t_mono_origin_ns=self._t_mono_origin_ns,
|
||||||
|
interval_ms=self.cfg.interval_ms,
|
||||||
|
stop_event=self._stop,
|
||||||
|
)
|
||||||
|
|
||||||
|
t = threading.Thread(target=_collector, daemon=True, name="proc_qemu")
|
||||||
|
t.start()
|
||||||
|
try:
|
||||||
|
# Wait either for duration to elapse or for stop to be set.
|
||||||
|
self._stop.wait(timeout=self.cfg.duration_s)
|
||||||
|
finally:
|
||||||
|
self._stop.set()
|
||||||
|
t.join(timeout=2.0)
|
||||||
|
|
||||||
|
end_mono_ns = time.monotonic_ns() - self._t_mono_origin_ns
|
||||||
|
pid_alive = _pid_alive(self.cfg.target_pid)
|
||||||
|
self._emit_event(
|
||||||
|
end_mono_ns,
|
||||||
|
"episode_end",
|
||||||
|
target_pid_alive=pid_alive,
|
||||||
|
)
|
||||||
|
|
||||||
|
meta["ended_at_wall"] = datetime.now(timezone.utc).isoformat()
|
||||||
|
meta["result"] = {
|
||||||
|
"phases_observed": ["clean"],
|
||||||
|
"rows_proc": rows_holder["rows"],
|
||||||
|
"pid_alive_at_end": pid_alive,
|
||||||
|
"duration_observed_s": end_mono_ns / 1_000_000_000,
|
||||||
|
}
|
||||||
|
self._write_meta(meta)
|
||||||
|
(self.episode_dir / "done.marker").touch()
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"episode %s complete: rows=%d duration=%.2fs",
|
||||||
|
self.episode_id, rows_holder["rows"], end_mono_ns / 1e9,
|
||||||
|
)
|
||||||
|
return EpisodeResult(
|
||||||
|
episode_id=self.episode_id,
|
||||||
|
episode_dir=self.episode_dir,
|
||||||
|
rows_proc=rows_holder["rows"],
|
||||||
|
pid_disappeared=not pid_alive,
|
||||||
|
duration_observed_s=end_mono_ns / 1_000_000_000,
|
||||||
|
)
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
self._stop.set()
|
||||||
|
|
||||||
|
# ---- internals ------------------------------------------------------
|
||||||
|
|
||||||
|
def _initial_meta(self, started_at_wall: str) -> dict:
|
||||||
|
return {
|
||||||
|
"episode_id": self.episode_id,
|
||||||
|
"schema_version": SCHEMA_VERSION,
|
||||||
|
"started_at_wall": started_at_wall,
|
||||||
|
"ended_at_wall": None,
|
||||||
|
"host_fingerprint": {
|
||||||
|
"kernel": os.uname().release,
|
||||||
|
"qemu_version": None,
|
||||||
|
},
|
||||||
|
"vm": {
|
||||||
|
"image_name": self.cfg.image_name,
|
||||||
|
"image_sha256": None,
|
||||||
|
"snapshot_name": self.cfg.snapshot_name,
|
||||||
|
"vcpus": None,
|
||||||
|
"ram_mib": None,
|
||||||
|
"target_pid": self.cfg.target_pid,
|
||||||
|
},
|
||||||
|
"exploit": None,
|
||||||
|
"sample": None,
|
||||||
|
"schedule": {
|
||||||
|
"baseline_seconds": self.cfg.duration_s,
|
||||||
|
"infected_seconds": 0,
|
||||||
|
"dormant_seconds": 0,
|
||||||
|
"interval_ms": self.cfg.interval_ms,
|
||||||
|
},
|
||||||
|
"result": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
def _write_meta(self, meta: dict) -> None:
|
||||||
|
path = self.episode_dir / "meta.json"
|
||||||
|
tmp = path.with_suffix(".json.partial")
|
||||||
|
with tmp.open("w") as f:
|
||||||
|
json.dump(meta, f, indent=2, sort_keys=True)
|
||||||
|
f.write("\n")
|
||||||
|
os.replace(tmp, path)
|
||||||
|
|
||||||
|
def _emit_event(self, t_mono_ns: int, event: str, **extra) -> None:
|
||||||
|
row = {
|
||||||
|
"t_mono_ns": t_mono_ns,
|
||||||
|
"t_wall_ns": time.time_ns(),
|
||||||
|
"event": event,
|
||||||
|
**extra,
|
||||||
|
}
|
||||||
|
with (self.episode_dir / "events.jsonl").open("a") as f:
|
||||||
|
f.write(json.dumps(row, sort_keys=True) + "\n")
|
||||||
|
|
||||||
|
def _emit_label(self, t_mono_ns: int, phase: str, prev: str | None, reason: str) -> None:
|
||||||
|
row = {
|
||||||
|
"t_mono_ns": t_mono_ns,
|
||||||
|
"t_wall_ns": time.time_ns(),
|
||||||
|
"phase": phase,
|
||||||
|
"prev": prev,
|
||||||
|
"reason": reason,
|
||||||
|
}
|
||||||
|
with (self.episode_dir / "labels.jsonl").open("a") as f:
|
||||||
|
f.write(json.dumps(row, sort_keys=True) + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def _pid_alive(pid: int) -> bool:
|
||||||
|
try:
|
||||||
|
os.kill(pid, 0)
|
||||||
|
return True
|
||||||
|
except (ProcessLookupError, PermissionError):
|
||||||
|
# PermissionError means the pid exists but we can't signal it.
|
||||||
|
return isinstance(_last_exception(), PermissionError)
|
||||||
|
|
||||||
|
|
||||||
|
def _last_exception() -> BaseException | None:
|
||||||
|
import sys
|
||||||
|
return sys.exc_info()[1]
|
||||||
21
orchestrator/ulid.py
Normal file
21
orchestrator/ulid.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
"""Tiny ULID generator. 26-char Crockford base32, time-sortable."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
_CROCKFORD = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
|
||||||
|
|
||||||
|
|
||||||
|
def new_ulid(now_ms: int | None = None) -> str:
|
||||||
|
"""Return a fresh ULID. ``now_ms`` is overridable for tests."""
|
||||||
|
ms = (now_ms if now_ms is not None else int(time.time() * 1000)) & ((1 << 48) - 1)
|
||||||
|
raw = ms.to_bytes(6, "big") + os.urandom(10)
|
||||||
|
n = int.from_bytes(raw, "big")
|
||||||
|
out = []
|
||||||
|
for _ in range(26):
|
||||||
|
out.append(_CROCKFORD[n & 31])
|
||||||
|
n >>= 5
|
||||||
|
return "".join(reversed(out))
|
||||||
88
tests/test_episode.py
Normal file
88
tests/test_episode.py
Normal file
|
|
@ -0,0 +1,88 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from orchestrator.episode import EpisodeConfig, EpisodeRunner
|
||||||
|
|
||||||
|
|
||||||
|
def _read_jsonl(p: Path) -> list[dict]:
|
||||||
|
return [json.loads(l) for l in p.read_text().splitlines()]
|
||||||
|
|
||||||
|
|
||||||
|
def test_episode_against_self_pid_produces_full_directory(tmp_path: Path) -> None:
|
||||||
|
cfg = EpisodeConfig(
|
||||||
|
target_pid=os.getpid(),
|
||||||
|
duration_s=0.5,
|
||||||
|
interval_ms=50,
|
||||||
|
data_root=tmp_path,
|
||||||
|
)
|
||||||
|
result = EpisodeRunner(cfg).run()
|
||||||
|
|
||||||
|
d = result.episode_dir
|
||||||
|
assert d.exists()
|
||||||
|
assert (d / "meta.json").exists()
|
||||||
|
assert (d / "events.jsonl").exists()
|
||||||
|
assert (d / "labels.jsonl").exists()
|
||||||
|
assert (d / "telemetry-proc.jsonl").exists()
|
||||||
|
assert (d / "done.marker").exists()
|
||||||
|
|
||||||
|
# meta.json structure
|
||||||
|
meta = json.loads((d / "meta.json").read_text())
|
||||||
|
assert meta["episode_id"] == result.episode_id
|
||||||
|
assert meta["schema_version"] == 1
|
||||||
|
assert meta["started_at_wall"] is not None
|
||||||
|
assert meta["ended_at_wall"] is not None
|
||||||
|
assert meta["vm"]["target_pid"] == os.getpid()
|
||||||
|
assert meta["schedule"]["baseline_seconds"] == 0.5
|
||||||
|
assert meta["schedule"]["interval_ms"] == 50
|
||||||
|
assert meta["result"]["rows_proc"] == result.rows_proc
|
||||||
|
assert "clean" in meta["result"]["phases_observed"]
|
||||||
|
|
||||||
|
# labels.jsonl: at least one clean label at t=0.
|
||||||
|
labels = _read_jsonl(d / "labels.jsonl")
|
||||||
|
assert any(r["phase"] == "clean" and r["t_mono_ns"] == 0 for r in labels)
|
||||||
|
|
||||||
|
# events.jsonl: snapshot_load + episode_end.
|
||||||
|
events = _read_jsonl(d / "events.jsonl")
|
||||||
|
event_names = [e["event"] for e in events]
|
||||||
|
assert "snapshot_load" in event_names
|
||||||
|
assert "episode_end" in event_names
|
||||||
|
|
||||||
|
# telemetry-proc.jsonl: roughly 10 ticks @ 50ms over 500ms.
|
||||||
|
proc_rows = _read_jsonl(d / "telemetry-proc.jsonl")
|
||||||
|
assert len(proc_rows) >= 5
|
||||||
|
for row in proc_rows:
|
||||||
|
assert row["source"] == "host_proc"
|
||||||
|
assert row["available_in_deployment"] is False
|
||||||
|
assert row["rss_bytes"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_episode_id_can_be_overridden(tmp_path: Path) -> None:
|
||||||
|
cfg = EpisodeConfig(
|
||||||
|
target_pid=os.getpid(),
|
||||||
|
duration_s=0.1,
|
||||||
|
interval_ms=50,
|
||||||
|
data_root=tmp_path,
|
||||||
|
episode_id="01TEST",
|
||||||
|
)
|
||||||
|
result = EpisodeRunner(cfg).run()
|
||||||
|
assert result.episode_id == "01TEST"
|
||||||
|
assert result.episode_dir == tmp_path / "episodes" / "01TEST"
|
||||||
|
|
||||||
|
|
||||||
|
def test_episode_writes_done_marker_last(tmp_path: Path) -> None:
|
||||||
|
"""done.marker should not appear until meta.json has ended_at_wall set."""
|
||||||
|
cfg = EpisodeConfig(
|
||||||
|
target_pid=os.getpid(),
|
||||||
|
duration_s=0.1,
|
||||||
|
interval_ms=50,
|
||||||
|
data_root=tmp_path,
|
||||||
|
)
|
||||||
|
result = EpisodeRunner(cfg).run()
|
||||||
|
assert (result.episode_dir / "done.marker").exists()
|
||||||
|
meta = json.loads((result.episode_dir / "meta.json").read_text())
|
||||||
|
assert meta["ended_at_wall"] is not None
|
||||||
185
tests/test_proc_qemu.py
Normal file
185
tests/test_proc_qemu.py
Normal file
|
|
@ -0,0 +1,185 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from collectors.proc_qemu import (
|
||||||
|
PAGE_SIZE,
|
||||||
|
collect_once,
|
||||||
|
parse_proc_io,
|
||||||
|
parse_proc_stat,
|
||||||
|
parse_proc_status,
|
||||||
|
run_loop,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# A real-ish /proc/<pid>/stat line where comm contains both spaces and parens.
|
||||||
|
SAMPLE_STAT = (
|
||||||
|
"1234 (weird (comm) name) S 1 1234 1234 0 -1 4194304 "
|
||||||
|
"412 0 3 0 " # minflt cminflt majflt cmajflt
|
||||||
|
"142 38 0 0 " # utime stime cutime cstime
|
||||||
|
"20 0 1 0 " # priority nice num_threads itrealvalue
|
||||||
|
"100000000 " # starttime
|
||||||
|
"1842933760 " # vsize
|
||||||
|
"132453 " # rss (pages)
|
||||||
|
"18446744073709551615 1 1 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
SAMPLE_IO = (
|
||||||
|
"rchar: 12345\n"
|
||||||
|
"wchar: 4096\n"
|
||||||
|
"syscr: 100\n"
|
||||||
|
"syscw: 50\n"
|
||||||
|
"read_bytes: 8192\n"
|
||||||
|
"write_bytes: 4096\n"
|
||||||
|
"cancelled_write_bytes: 0\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
SAMPLE_STATUS = """\
|
||||||
|
Name: bash
|
||||||
|
Umask: 0022
|
||||||
|
State: S (sleeping)
|
||||||
|
Tgid: 1234
|
||||||
|
VmPeak: 12345 kB
|
||||||
|
VmSize: 10000 kB
|
||||||
|
VmRSS: 1024 kB
|
||||||
|
voluntary_ctxt_switches: 12
|
||||||
|
nonvoluntary_ctxt_switches: 3
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_proc_stat_handles_parens_in_comm() -> None:
|
||||||
|
s = parse_proc_stat(SAMPLE_STAT)
|
||||||
|
assert s.minflt == 412
|
||||||
|
assert s.majflt == 3
|
||||||
|
assert s.utime == 142
|
||||||
|
assert s.stime == 38
|
||||||
|
assert s.vsize == 1842933760
|
||||||
|
assert s.rss_pages == 132453
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_proc_io() -> None:
|
||||||
|
io = parse_proc_io(SAMPLE_IO)
|
||||||
|
assert io.read_bytes == 8192
|
||||||
|
assert io.write_bytes == 4096
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_proc_status() -> None:
|
||||||
|
s = parse_proc_status(SAMPLE_STATUS)
|
||||||
|
assert s.voluntary_ctxsw == 12
|
||||||
|
assert s.involuntary_ctxsw == 3
|
||||||
|
|
||||||
|
|
||||||
|
def _write_synthetic_proc(root: Path, pid: int, *, with_io: bool = True) -> None:
|
||||||
|
pdir = root / str(pid)
|
||||||
|
pdir.mkdir(parents=True, exist_ok=True)
|
||||||
|
(pdir / "stat").write_text(SAMPLE_STAT)
|
||||||
|
(pdir / "status").write_text(SAMPLE_STATUS)
|
||||||
|
if with_io:
|
||||||
|
(pdir / "io").write_text(SAMPLE_IO)
|
||||||
|
|
||||||
|
|
||||||
|
def test_collect_once_against_synthetic_proc(tmp_path: Path) -> None:
|
||||||
|
_write_synthetic_proc(tmp_path, 1234)
|
||||||
|
row = collect_once(1234, t_mono_origin_ns=0, proc_root=str(tmp_path))
|
||||||
|
assert row is not None
|
||||||
|
assert row["source"] == "host_proc"
|
||||||
|
assert row["available_in_deployment"] is False
|
||||||
|
assert row["cpu_user_jiffies"] == 142
|
||||||
|
assert row["cpu_sys_jiffies"] == 38
|
||||||
|
assert row["rss_bytes"] == 132453 * PAGE_SIZE
|
||||||
|
assert row["vsize_bytes"] == 1842933760
|
||||||
|
assert row["io_read_bytes"] == 8192
|
||||||
|
assert row["io_write_bytes"] == 4096
|
||||||
|
assert row["voluntary_ctxsw"] == 12
|
||||||
|
assert row["involuntary_ctxsw"] == 3
|
||||||
|
assert row["minor_faults"] == 412
|
||||||
|
assert row["major_faults"] == 3
|
||||||
|
assert row["t_mono_ns"] >= 0
|
||||||
|
assert row["t_wall_ns"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_collect_once_handles_missing_io(tmp_path: Path) -> None:
|
||||||
|
_write_synthetic_proc(tmp_path, 1234, with_io=False)
|
||||||
|
row = collect_once(1234, t_mono_origin_ns=0, proc_root=str(tmp_path))
|
||||||
|
assert row is not None
|
||||||
|
assert row["io_read_bytes"] is None
|
||||||
|
assert row["io_write_bytes"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_collect_once_returns_none_for_missing_pid(tmp_path: Path) -> None:
|
||||||
|
row = collect_once(99999, t_mono_origin_ns=0, proc_root=str(tmp_path))
|
||||||
|
assert row is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_collect_once_against_real_self() -> None:
|
||||||
|
"""Smoke: read our own /proc entry. Most fields will be present."""
|
||||||
|
row = collect_once(os.getpid(), t_mono_origin_ns=time.monotonic_ns())
|
||||||
|
assert row is not None
|
||||||
|
assert row["source"] == "host_proc"
|
||||||
|
assert row["rss_bytes"] > 0
|
||||||
|
assert row["vsize_bytes"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_loop_writes_rows_until_stopped(tmp_path: Path) -> None:
|
||||||
|
_write_synthetic_proc(tmp_path, 1234)
|
||||||
|
out = tmp_path / "telemetry-proc.jsonl"
|
||||||
|
stop = threading.Event()
|
||||||
|
|
||||||
|
def stopper():
|
||||||
|
time.sleep(0.25)
|
||||||
|
stop.set()
|
||||||
|
|
||||||
|
t = threading.Thread(target=stopper)
|
||||||
|
t.start()
|
||||||
|
rows = run_loop(
|
||||||
|
pid=1234,
|
||||||
|
output_path=out,
|
||||||
|
t_mono_origin_ns=time.monotonic_ns(),
|
||||||
|
interval_ms=50,
|
||||||
|
stop_event=stop,
|
||||||
|
proc_root=str(tmp_path),
|
||||||
|
)
|
||||||
|
t.join()
|
||||||
|
assert rows >= 3 # roughly 5 ticks @ 50ms over 250ms
|
||||||
|
lines = out.read_text().splitlines()
|
||||||
|
assert len(lines) == rows
|
||||||
|
# Each line is valid JSON with the expected schema.
|
||||||
|
for line in lines:
|
||||||
|
row = json.loads(line)
|
||||||
|
assert row["source"] == "host_proc"
|
||||||
|
assert row["available_in_deployment"] is False
|
||||||
|
assert row["cpu_user_jiffies"] == 142
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_loop_stops_when_pid_disappears(tmp_path: Path) -> None:
|
||||||
|
_write_synthetic_proc(tmp_path, 1234)
|
||||||
|
out = tmp_path / "telemetry-proc.jsonl"
|
||||||
|
stop = threading.Event()
|
||||||
|
|
||||||
|
def vanish():
|
||||||
|
time.sleep(0.1)
|
||||||
|
# Delete the synthetic /proc/<pid>/ to simulate exit.
|
||||||
|
for f in (tmp_path / "1234").iterdir():
|
||||||
|
f.unlink()
|
||||||
|
(tmp_path / "1234").rmdir()
|
||||||
|
|
||||||
|
t = threading.Thread(target=vanish)
|
||||||
|
t.start()
|
||||||
|
rows = run_loop(
|
||||||
|
pid=1234,
|
||||||
|
output_path=out,
|
||||||
|
t_mono_origin_ns=time.monotonic_ns(),
|
||||||
|
interval_ms=20,
|
||||||
|
stop_event=stop,
|
||||||
|
proc_root=str(tmp_path),
|
||||||
|
)
|
||||||
|
t.join()
|
||||||
|
assert rows >= 1
|
||||||
|
# Loop terminated even though stop was never set.
|
||||||
|
assert not stop.is_set()
|
||||||
20
tests/test_ulid.py
Normal file
20
tests/test_ulid.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from orchestrator.ulid import new_ulid
|
||||||
|
|
||||||
|
|
||||||
|
def test_ulid_length_and_alphabet() -> None:
|
||||||
|
u = new_ulid()
|
||||||
|
assert len(u) == 26
|
||||||
|
assert all(c in "0123456789ABCDEFGHJKMNPQRSTVWXYZ" for c in u)
|
||||||
|
|
||||||
|
|
||||||
|
def test_ulid_uniqueness_burst() -> None:
|
||||||
|
seen = {new_ulid() for _ in range(2000)}
|
||||||
|
assert len(seen) == 2000
|
||||||
|
|
||||||
|
|
||||||
|
def test_ulid_time_sortable() -> None:
|
||||||
|
earlier = new_ulid(now_ms=1_700_000_000_000)
|
||||||
|
later = new_ulid(now_ms=1_700_000_001_000)
|
||||||
|
assert earlier < later
|
||||||
Loading…
Add table
Reference in a new issue