# CIS490/training/fleet/capability.py

"""Capability self-detection for a training-fleet worker.

Each worker reports a Capability blob to the receiver at startup +
periodically thereafter. The receiver intersects this with the
host's declared capability in the training manifest (more
restrictive wins) and uses the result to filter claimable jobs.

What we report:

  hostname        — same as the worker's host_id by default
  os, arch        — for diagnostics
  cpu_cores       — physical, not hyperthreaded (best-effort)
  ram_total_gib
  ram_available_gib
  cuda_available  — bool; torch.cuda.is_available() result
  cuda_devices    — list of {name, vram_total_gib, vram_free_gib}
  torch_version
  python_version
  training_commit — git commit of /opt/cis490 (or the worker's repo)

Detection is best-effort: if torch isn't importable we report
cuda_available=false rather than failing. If a CUDA device is
present but CUDA fails to initialize, we still report it as
cuda_available=false.
"""
from __future__ import annotations

import os
import platform
import socket
import subprocess
import sys
from dataclasses import asdict, dataclass, field
from pathlib import Path


@dataclass(frozen=True)
class CudaDevice:
    """Immutable record describing one CUDA device visible to this worker.

    Instances are created by ``_detect_cuda`` from ``torch.cuda`` queries,
    with byte counts divided by 1024**3 to give GiB.
    """

    # Device name as returned by torch.cuda.get_device_name().
    name: str
    # Total device memory, in GiB.
    vram_total_gib: float
    # Free device memory at the time of the probe, in GiB.
    vram_free_gib: float


@dataclass(frozen=True)
class Capability:
    """Immutable capability report for a single fleet worker.

    ``detect()`` builds one of these; ``to_dict()`` turns it into plain
    containers for serialisation and ``can_run()`` checks it against a
    job's resource requirements.
    """

    # Identity and platform (diagnostics).
    hostname: str
    os: str
    arch: str
    # Host resources.
    cpu_cores: int
    ram_total_gib: float
    ram_available_gib: float
    # Accelerator state as probed through torch.
    cuda_available: bool
    cuda_devices: tuple[CudaDevice, ...]
    # Software versions; the optional ones are None when undetermined.
    torch_version: str | None
    python_version: str
    training_commit: str | None

    def to_dict(self) -> dict:
        """Return the report as a plain dict, devices as a list of dicts."""
        payload = asdict(self)
        # asdict() preserves the tuple container; expose a list instead.
        payload["cuda_devices"] = list(map(asdict, self.cuda_devices))
        return payload

    def best_vram_gib(self) -> float:
        """Largest free VRAM (GiB) over all CUDA devices; 0.0 with none."""
        return max(
            (device.vram_free_gib for device in self.cuda_devices),
            default=0.0,
        )

    def can_run(self, *, require_cuda: bool, min_vram_gib: float,
                min_ram_gib: float, min_cores: int) -> tuple[bool, str]:
        """Check this worker against a job's requirements.

        Returns ``(eligible, reason)``. ``reason`` is ``"ok"`` when
        eligible, otherwise it names the first requirement that failed.
        ``min_vram_gib`` is only considered when ``require_cuda`` is set.
        """
        if require_cuda:
            if not self.cuda_available:
                return False, "require_cuda but no CUDA device available"
            free_vram = self.best_vram_gib()
            if free_vram < min_vram_gib:
                reason = (f"require_cuda but largest free VRAM "
                          f"{free_vram:.1f} GiB < "
                          f"{min_vram_gib:.1f} GiB needed")
                return False, reason

        if self.ram_available_gib < min_ram_gib:
            reason = (f"available RAM {self.ram_available_gib:.1f} GiB < "
                      f"{min_ram_gib:.1f} GiB needed")
            return False, reason

        if self.cpu_cores < min_cores:
            reason = (f"cpu_cores {self.cpu_cores} < "
                      f"{min_cores} needed")
            return False, reason

        return True, "ok"


def _detect_ram_gib() -> tuple[float, float]:
    """(total, available) in GiB. Linux /proc/meminfo first, fall
    back to platform-specific tools."""
    try:
        meminfo = Path("/proc/meminfo").read_text()
        parts = {}
        for line in meminfo.splitlines():
            k, _, rest = line.partition(":")
            v = rest.strip().split()
            if v and v[-1].lower() == "kb":
                try:
                    parts[k.strip()] = int(v[0])
                except ValueError:
                    pass
        total_kib = parts.get("MemTotal", 0)
        avail_kib = parts.get("MemAvailable") or parts.get("MemFree", 0)
        return (total_kib / (1024 * 1024), avail_kib / (1024 * 1024))
    except (FileNotFoundError, PermissionError):
        pass
    # Windows/macOS fallback via psutil if installed
    try:
        import psutil  # type: ignore
        v = psutil.virtual_memory()
        return (v.total / (1024 ** 3), v.available / (1024 ** 3))
    except ImportError:
        return (0.0, 0.0)


def _detect_cpu_cores() -> int:
    """Physical core count, best-effort."""
    try:
        # Linux /proc/cpuinfo "physical id"+"core id" pairs
        info = Path("/proc/cpuinfo").read_text()
        pairs: set[tuple[str, str]] = set()
        cur = {}
        for line in info.splitlines():
            line = line.strip()
            if not line:
                if "physical id" in cur and "core id" in cur:
                    pairs.add((cur["physical id"], cur["core id"]))
                cur = {}
                continue
            if ":" in line:
                k, _, v = line.partition(":")
                cur[k.strip()] = v.strip()
        if pairs:
            return len(pairs)
    except (FileNotFoundError, PermissionError):
        pass
    # Fallback: logical count
    return os.cpu_count() or 1


def _detect_cuda() -> tuple[bool, tuple[CudaDevice, ...], str | None]:
    """Probe torch for CUDA. Returns (available, devices, torch_version)."""
    try:
        import torch
        torch_ver = torch.__version__
    except Exception:
        return False, (), None
    try:
        if not torch.cuda.is_available():
            return False, (), torch_ver
        devs: list[CudaDevice] = []
        for i in range(torch.cuda.device_count()):
            name = torch.cuda.get_device_name(i)
            free, total = torch.cuda.mem_get_info(i)
            devs.append(CudaDevice(
                name=name,
                vram_total_gib=total / (1024 ** 3),
                vram_free_gib=free / (1024 ** 3),
            ))
        return True, tuple(devs), torch_ver
    except Exception:
        return False, (), torch_ver


def _detect_commit(repo_root: Path) -> str | None:
    try:
        r = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            cwd=str(repo_root), capture_output=True, text=True, timeout=2,
        )
        if r.returncode == 0:
            return r.stdout.strip()
    except (FileNotFoundError, subprocess.TimeoutExpired):
        pass
    return None


def detect(*, hostname_override: str | None = None,
           repo_root: Path | None = None) -> Capability:
    """Probe this machine and return its Capability report.

    The hostname is taken from *hostname_override*, else the
    ``FLEET_HOST_ID`` environment variable, else ``socket.gethostname()``.
    The git commit is read from *repo_root*, which defaults to
    ``parents[2]`` of this file's resolved path.
    """
    if hostname_override:
        host = hostname_override
    else:
        host = os.environ.get("FLEET_HOST_ID") or socket.gethostname()

    total_gib, avail_gib = _detect_ram_gib()
    has_cuda, devices, torch_version = _detect_cuda()

    checkout = repo_root or Path(__file__).resolve().parents[2]
    head = _detect_commit(checkout)

    return Capability(
        hostname=host,
        os=platform.system(),
        arch=platform.machine(),
        cpu_cores=_detect_cpu_cores(),
        ram_total_gib=total_gib,
        ram_available_gib=avail_gib,
        cuda_available=has_cuda,
        cuda_devices=devices,
        torch_version=torch_version,
        python_version=platform.python_version(),
        training_commit=head,
    )


def main() -> int:
    """Debug entry point for `python -m training.fleet.capability`.

    Prints the detected capability report as indented JSON and returns
    exit status 0.
    """
    import json

    report = detect().to_dict()
    print(json.dumps(report, indent=2))
    return 0


# Script entry point: exit with main()'s return value as the status code.
if __name__ == "__main__":
    sys.exit(main())