"""cis490-jobs — operator control CLI for the training fleet. Talks to the trainer-receiver over HTTP. Subcommands: cis490-jobs status pretty-print queue + worker status cis490-jobs list [--status pending] cis490-jobs show cis490-jobs cancel cis490-jobs requeue force-requeue from any state cis490-jobs reload re-read manifest, sync queue cis490-jobs workers last-seen capability per worker Auth: control endpoints require X-Operator-Token. Set it via $CIS490_OPERATOR_TOKEN. Status endpoints (status, list, show, workers) work without a token. Usage from outside the Pi: set --receiver-url to the Pi's WG address (e.g., http://10.100.0.1:8445). """ from __future__ import annotations import argparse import json import os import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from training.fleet.client import FleetClient def _client_from_args(args) -> FleetClient: token = (args.token if args.token else os.environ.get("CIS490_OPERATOR_TOKEN")) return FleetClient(args.receiver_url, host_id=args.as_host or os.uname().nodename, operator_token=token) def cmd_status(args) -> int: c = _client_from_args(args) jobs = c.list_jobs() workers = c.workers() from collections import Counter counts = Counter(j["status"] for j in jobs) print("=== queue ===") for s in ("pending", "claimed", "running", "completed", "failed", "cancelled"): n = counts.get(s, 0) print(f" {s:>10} {n}") print() print(f"=== workers ({len(workers)}) ===") now = time.time() for w in workers: cap = w.get("capability", {}) seen = (now - float(w.get("last_seen", 0))) cuda = "CUDA" if cap.get("cuda_available") else "CPU" vram = cap.get("cuda_devices", [{}])[0].get("vram_total_gib", 0.0) \ if cap.get("cuda_devices") else 0.0 print(f" {w['hostname']:>20} {cuda} cores={cap.get('cpu_cores')}" f" ram={cap.get('ram_available_gib', 0):.1f}/" f"{cap.get('ram_total_gib', 0):.1f}GiB" f" vram={vram:.1f}GiB last_seen={seen:.0f}s ago") print() print("=== running ===") for j in jobs: if j["status"] in ("claimed", "running"): print(f" {j['name']:>26} by={j['claimed_by']} status={j['status']}") print() print("=== failed ===") for j in jobs: if j["status"] == "failed": err = (j.get("last_error") or "")[:100] print(f" {j['name']:>26} attempts={j['attempts']} err={err}") return 0 def cmd_list(args) -> int: c = _client_from_args(args) jobs = c.list_jobs(status=args.status) if args.json: print(json.dumps(jobs, indent=2)) return 0 print(f" {'name':<26} {'model':<18} {'mode':<10} {'prio':>5} " f"{'status':<10} {'host':<16}") for j in jobs: print(f" {j['name']:<26} {j.get('model','?'):<18} " f"{j.get('mode','?'):<10} {j.get('priority','?'):>5} " f"{j['status']:<10} {(j.get('claimed_by') or '-'):<16}") return 0 def cmd_show(args) -> int: c = _client_from_args(args) jobs = c.list_jobs() job = next((j for j in jobs if j["job_id"] == args.job_id or j["name"] == args.job_id), None) if job is None: print(f"no job matching {args.job_id!r}", file=sys.stderr) return 1 print(json.dumps(job, indent=2)) return 0 def cmd_cancel(args) -> int: c = _client_from_args(args) ok = c.cancel(args.job_id) print("cancelled" if ok else "cancel failed (wrong state? unknown id?)", file=sys.stderr) return 0 if ok else 1 def cmd_requeue(args) -> int: c = _client_from_args(args) ok = c.requeue(args.job_id) print("requeued" if ok else "requeue failed", file=sys.stderr) return 0 if ok else 1 def cmd_reload(args) -> int: c = _client_from_args(args) res = c.reload_manifest() print(json.dumps(res, indent=2)) return 0 def cmd_workers(args) -> int: c = _client_from_args(args) workers = c.workers() if args.json: print(json.dumps(workers, indent=2)) else: for w in workers: print(f"\n=== {w['hostname']} ===") cap = w.get("capability", {}) print(f" os/arch: {cap.get('os')}/{cap.get('arch')}") print(f" python: {cap.get('python_version')} torch={cap.get('torch_version')}") print(f" cores: {cap.get('cpu_cores')}") print(f" ram: {cap.get('ram_available_gib', 0):.1f} / " f"{cap.get('ram_total_gib', 0):.1f} GiB") print(f" cuda: {cap.get('cuda_available')}") for d in cap.get("cuda_devices") or []: print(f" {d.get('name')} " f"vram={d.get('vram_free_gib',0):.1f}/{d.get('vram_total_gib',0):.1f} GiB") print(f" commit: {(cap.get('training_commit') or '-')[:12]}") return 0 def main() -> int: p = argparse.ArgumentParser(prog="cis490-jobs") p.add_argument("--receiver-url", default=os.environ.get( "CIS490_TRAINER_RECEIVER_URL", "http://10.100.0.1:8445" )) p.add_argument("--token", help="operator token (or $CIS490_OPERATOR_TOKEN)") p.add_argument("--as-host", default=None, help="X-Lab-Host header (default: this machine)") sub = p.add_subparsers(dest="cmd", required=True) s_status = sub.add_parser("status", help="pretty-print queue + worker status") s_status.set_defaults(func=cmd_status) s_list = sub.add_parser("list", help="list jobs") s_list.add_argument("--status", choices=["pending","claimed","running","completed", "failed","cancelled"]) s_list.add_argument("--json", action="store_true") s_list.set_defaults(func=cmd_list) s_show = sub.add_parser("show", help="full detail for one job (id or name)") s_show.add_argument("job_id") s_show.set_defaults(func=cmd_show) s_cancel = sub.add_parser("cancel", help="mark pending/failed → cancelled") s_cancel.add_argument("job_id") s_cancel.set_defaults(func=cmd_cancel) s_requeue = sub.add_parser("requeue", help="force any non-pending job back to pending") s_requeue.add_argument("job_id") s_requeue.set_defaults(func=cmd_requeue) s_reload = sub.add_parser("reload", help="re-read manifest, sync queue") s_reload.set_defaults(func=cmd_reload) s_workers = sub.add_parser("workers", help="list workers + capabilities") s_workers.add_argument("--json", action="store_true") s_workers.set_defaults(func=cmd_workers) args = p.parse_args() return args.func(args) if __name__ == "__main__": raise SystemExit(main())