"""HTTP client for the trainer-receiver. Stdlib-only so the worker doesn't pull a new dep into pyproject.toml. Used by the worker daemon (training/fleet/worker.py) and by the operator CLI (tools/cis490_jobs.py).""" from __future__ import annotations import hashlib import json import logging import urllib.error import urllib.request from pathlib import Path from typing import Any log = logging.getLogger("cis490.fleet.client") class FleetClient: """HTTP client for the trainer-receiver.""" def __init__(self, base_url: str = "https://10.100.0.1:8445", *, host_id: str, operator_token: str | None = None, timeout: float = 30.0) -> None: self.base_url = base_url.rstrip("/") self.host_id = host_id self.operator_token = operator_token self.timeout = timeout def _request(self, method: str, path: str, *, body: bytes | None = None, json_body: Any = None, extra_headers: dict | None = None, expect_status: tuple[int, ...] = (200, 201, 204) ) -> tuple[int, dict | bytes]: url = f"{self.base_url}{path}" headers = {"x-lab-host": self.host_id} if extra_headers: headers.update(extra_headers) if json_body is not None: body = json.dumps(json_body).encode() headers["content-type"] = "application/json" if self.operator_token: headers["x-operator-token"] = self.operator_token req = urllib.request.Request(url, data=body, method=method, headers=headers) try: with urllib.request.urlopen(req, timeout=self.timeout) as resp: code = resp.status raw = resp.read() except urllib.error.HTTPError as e: return e.code, e.read() if code == 204 or not raw: return code, {} ctype = resp.headers.get("content-type", "") if "json" in ctype: return code, json.loads(raw) return code, raw # ------------------------------------------------------------------ # Worker API # ------------------------------------------------------------------ def claim(self, capability: dict) -> dict | None: code, body = self._request("POST", "/v1/job/claim", json_body={"capability": capability}) # 200 with {"job": None} is the "no eligible job" sentinel. if code != 200 or not isinstance(body, dict): return None if body.get("job", "") is None: return None if not body.get("job_id"): return None return body def heartbeat(self, job_id: str) -> bool: code, _ = self._request("POST", f"/v1/job/{job_id}/heartbeat") return code == 200 def complete(self, job_id: str, *, artifact_id: str) -> bool: code, _ = self._request("POST", f"/v1/job/{job_id}/complete", json_body={"artifact_id": artifact_id}) return code == 200 def fail(self, job_id: str, *, error: str) -> bool: code, _ = self._request("POST", f"/v1/job/{job_id}/fail", json_body={"error": error}) return code == 200 def upload_artifact(self, job_id: str, bundle_path: Path) -> dict: h = hashlib.sha256() with bundle_path.open("rb") as f: for ch in iter(lambda: f.read(1 << 20), b""): h.update(ch) sha = h.hexdigest() size = bundle_path.stat().st_size with bundle_path.open("rb") as f: data = f.read() code, body = self._request( "PUT", f"/v1/model/{job_id}", body=data, extra_headers={ "x-content-sha256": sha, "content-length": str(size), "content-type": "application/octet-stream", }, expect_status=(200, 201), ) if code not in (200, 201): raise RuntimeError(f"artifact upload failed: code={code} body={body!r}") return body if isinstance(body, dict) else {} # ------------------------------------------------------------------ # Operator API # ------------------------------------------------------------------ def list_jobs(self, *, status: str | None = None) -> list[dict]: path = "/v1/jobs" if status: path += f"?status={status}" code, body = self._request("GET", path) return body.get("jobs", []) if isinstance(body, dict) else [] def cancel(self, job_id: str) -> bool: code, body = self._request("POST", f"/v1/job/{job_id}/cancel") return code == 200 and bool((body or {}).get("ok")) def requeue(self, job_id: str) -> bool: code, body = self._request("POST", f"/v1/job/{job_id}/requeue") return code == 200 and bool((body or {}).get("ok")) def reload_manifest(self) -> dict: code, body = self._request("POST", "/v1/manifest/reload") if code != 200: raise RuntimeError(f"reload failed: code={code} body={body!r}") return body if isinstance(body, dict) else {} def workers(self) -> list[dict]: code, body = self._request("GET", "/v1/workers") return body.get("workers", []) if isinstance(body, dict) else []