"""Capability self-detection for a training-fleet worker. Each worker reports a Capability blob to the receiver at startup + periodically thereafter. The receiver intersects this with the host's declared capability in the training manifest (more restrictive wins) and uses the result to filter claimable jobs. What we report: hostname — same as the worker's host_id by default os, arch — for diagnostics cpu_cores — physical, not hyperthreaded (best-effort) ram_total_gib ram_available_gib cuda_available — bool; torch.cuda.is_available() result cuda_devices — list of {name, vram_total_gib, vram_free_gib} torch_version python_version training_commit — git commit of /opt/cis490 (or the worker's repo) Detection is best-effort: if torch isn't importable we report cuda_available=false rather than failing. If a CUDA device is present but CUDA fails to initialize, we still report it as cuda_available=false. """ from __future__ import annotations import os import platform import socket import subprocess import sys from dataclasses import asdict, dataclass, field from pathlib import Path @dataclass(frozen=True) class CudaDevice: name: str vram_total_gib: float vram_free_gib: float @dataclass(frozen=True) class Capability: hostname: str os: str arch: str cpu_cores: int ram_total_gib: float ram_available_gib: float cuda_available: bool cuda_devices: tuple[CudaDevice, ...] torch_version: str | None python_version: str training_commit: str | None def to_dict(self) -> dict: d = asdict(self) d["cuda_devices"] = [asdict(c) for c in self.cuda_devices] return d def best_vram_gib(self) -> float: """VRAM of the largest visible CUDA device (free memory).""" if not self.cuda_devices: return 0.0 return max(c.vram_free_gib for c in self.cuda_devices) def can_run(self, *, require_cuda: bool, min_vram_gib: float, min_ram_gib: float, min_cores: int) -> tuple[bool, str]: """Return (eligible, reason). False eligible → reason explains why.""" if require_cuda and not self.cuda_available: return False, "require_cuda but no CUDA device available" if require_cuda and self.best_vram_gib() < min_vram_gib: return False, (f"require_cuda but largest free VRAM " f"{self.best_vram_gib():.1f} GiB < " f"{min_vram_gib:.1f} GiB needed") if self.ram_available_gib < min_ram_gib: return False, (f"available RAM {self.ram_available_gib:.1f} GiB < " f"{min_ram_gib:.1f} GiB needed") if self.cpu_cores < min_cores: return False, (f"cpu_cores {self.cpu_cores} < " f"{min_cores} needed") return True, "ok" def _detect_ram_gib() -> tuple[float, float]: """(total, available) in GiB. Linux /proc/meminfo first, fall back to platform-specific tools.""" try: meminfo = Path("/proc/meminfo").read_text() parts = {} for line in meminfo.splitlines(): k, _, rest = line.partition(":") v = rest.strip().split() if v and v[-1].lower() == "kb": try: parts[k.strip()] = int(v[0]) except ValueError: pass total_kib = parts.get("MemTotal", 0) avail_kib = parts.get("MemAvailable") or parts.get("MemFree", 0) return (total_kib / (1024 * 1024), avail_kib / (1024 * 1024)) except (FileNotFoundError, PermissionError): pass # Windows/macOS fallback via psutil if installed try: import psutil # type: ignore v = psutil.virtual_memory() return (v.total / (1024 ** 3), v.available / (1024 ** 3)) except ImportError: return (0.0, 0.0) def _detect_cpu_cores() -> int: """Physical core count, best-effort.""" try: # Linux /proc/cpuinfo "physical id"+"core id" pairs info = Path("/proc/cpuinfo").read_text() pairs: set[tuple[str, str]] = set() cur = {} for line in info.splitlines(): line = line.strip() if not line: if "physical id" in cur and "core id" in cur: pairs.add((cur["physical id"], cur["core id"])) cur = {} continue if ":" in line: k, _, v = line.partition(":") cur[k.strip()] = v.strip() if pairs: return len(pairs) except (FileNotFoundError, PermissionError): pass # Fallback: logical count return os.cpu_count() or 1 def _detect_cuda() -> tuple[bool, tuple[CudaDevice, ...], str | None]: """Probe torch for CUDA. Returns (available, devices, torch_version).""" try: import torch torch_ver = torch.__version__ except Exception: return False, (), None try: if not torch.cuda.is_available(): return False, (), torch_ver devs: list[CudaDevice] = [] for i in range(torch.cuda.device_count()): name = torch.cuda.get_device_name(i) free, total = torch.cuda.mem_get_info(i) devs.append(CudaDevice( name=name, vram_total_gib=total / (1024 ** 3), vram_free_gib=free / (1024 ** 3), )) return True, tuple(devs), torch_ver except Exception: return False, (), torch_ver def _detect_commit(repo_root: Path) -> str | None: try: r = subprocess.run( ["git", "rev-parse", "HEAD"], cwd=str(repo_root), capture_output=True, text=True, timeout=2, ) if r.returncode == 0: return r.stdout.strip() except (FileNotFoundError, subprocess.TimeoutExpired): pass return None def detect(*, hostname_override: str | None = None, repo_root: Path | None = None) -> Capability: hostname = (hostname_override or os.environ.get("FLEET_HOST_ID") or socket.gethostname()) ram_total, ram_avail = _detect_ram_gib() cuda_available, cuda_devs, torch_ver = _detect_cuda() commit = _detect_commit(repo_root or Path(__file__).resolve().parents[2]) return Capability( hostname=hostname, os=platform.system(), arch=platform.machine(), cpu_cores=_detect_cpu_cores(), ram_total_gib=ram_total, ram_available_gib=ram_avail, cuda_available=cuda_available, cuda_devices=cuda_devs, torch_version=torch_ver, python_version=platform.python_version(), training_commit=commit, ) def main() -> int: """`python -m training.fleet.capability` — debug print.""" import json cap = detect() print(json.dumps(cap.to_dict(), indent=2)) return 0 if __name__ == "__main__": raise SystemExit(main())