# CIS490/training/fleet/store.py

"""Trained-artifact store on the Pi.

Mirrors ``receiver/store.py`` for episodes — same atomic-write,
sha256-verified, stream-ingest design — but stores trained models
under ``/var/lib/cis490/models/<model>_<mode>/<artifact_id>/``.

An ``artifact_id`` is the sha256 of the uploaded tarball. The same
job_id can produce multiple artifact_ids if the operator re-runs the
job (different code commit, different epoch, different seed); the
queue records the latest artifact_id for each completed job, but the
store keeps every uploaded artifact so re-runs can be compared.

Layout::

    /var/lib/cis490/models/
        index.jsonl                                — append-only ingest log
        <model>_<mode>/
            <artifact_id>/
                bundle.tar.zst                     — what was uploaded
                meta.json                          — header from the bundle
"""
from __future__ import annotations

import hashlib
import json
import re
import secrets
import time
from dataclasses import dataclass
from pathlib import Path
from typing import AsyncIterator


# Shape accepted for identifiers: 1-128 characters drawn from ASCII letters,
# digits, underscore, dot and hyphen. ``\Z`` (not ``$``) anchors the end so
# a trailing newline is never tolerated, whichever match method is used.
_ID_RE = re.compile(r"^[A-Za-z0-9_.-]{1,128}\Z")


def is_valid_id(s: str) -> bool:
    """Return True when *s* consists solely of 1-128 allowed id characters.

    Allowed characters are ``A-Z``, ``a-z``, ``0-9``, ``_``, ``.`` and ``-``.
    The whole string must match: ``"abc\\n"`` is rejected, which the former
    ``$``-anchored ``match`` silently accepted.
    """
    return _ID_RE.fullmatch(s) is not None


@dataclass(frozen=True)
class StoreResult:
    """Outcome of a single :meth:`ModelStore.ingest_stream` call."""

    # One of "stored" | "already-present" | "sha-mismatch" | "too-large".
    status: str
    # sha256 hex digest of the received bytes. For "sha-mismatch" this is the
    # digest actually computed; for "too-large" it is None.
    artifact_id: str | None
    # Number of body bytes counted. For "too-large" this includes the chunk
    # that crossed the limit (that chunk is not written to disk).
    size_bytes: int | None


class ModelStore:
    """Filesystem store for uploaded model bundles.

    Bundles are streamed into ``incoming_root`` and, once their sha256 is
    known, moved to ``store_root/<model>_<mode>/<sha256>/bundle.tar.zst``
    next to a ``meta.json``. Each stored bundle also appends one JSON line
    to ``index_path``.
    """

    def __init__(self, store_root: Path, incoming_root: Path,
                 index_path: Path) -> None:
        """Remember the three locations and create them if missing.

        Both root directories and the index file's parent directory are
        created; the index file itself is touched so it always exists.
        """
        self.store_root = store_root
        self.incoming_root = incoming_root
        self.index_path = index_path
        self.store_root.mkdir(parents=True, exist_ok=True)
        self.incoming_root.mkdir(parents=True, exist_ok=True)
        self.index_path.parent.mkdir(parents=True, exist_ok=True)
        self.index_path.touch(exist_ok=True)

    def final_dir(self, model: str, mode: str, artifact_id: str) -> Path:
        """Return ``store_root/<model>_<mode>/<artifact_id>`` (not created)."""
        return self.store_root / f"{model}_{mode}" / artifact_id

    async def ingest_stream(
        self,
        *,
        job_id: str,
        model: str,
        mode: str,
        worker: str,
        expected_sha256: str,
        body: AsyncIterator[bytes],
        max_bytes: int,
    ) -> StoreResult:
        """Stream *body* to disk, verify it, and file it under its sha256.

        Args:
            job_id: Recorded in ``meta.json`` and the index; also used as the
                prefix of the temporary file name.
            model: First half of the ``<model>_<mode>`` directory name.
            mode: Second half of the ``<model>_<mode>`` directory name.
            worker: Recorded in ``meta.json`` and the index.
            expected_sha256: Hex digest the upload must match, compared
                case-insensitively. An empty string disables the check.
            body: Async iterator yielding the bundle's bytes.
            max_bytes: Upload is rejected as soon as more than this many
                bytes have been received.

        Returns:
            A :class:`StoreResult` whose ``status`` is ``"stored"``,
            ``"already-present"``, ``"sha-mismatch"`` or ``"too-large"``.

        Any exception raised while reading *body* or writing files is
        propagated. The temporary file is removed on every exit path.

        NOTE(review): ``job_id``, ``model`` and ``mode`` are interpolated
        into filesystem paths without validation here; presumably the caller
        vets them (e.g. with ``is_valid_id``). Confirm.
        NOTE(review): file writes are synchronous and run inside this
        coroutine.
        """
        # Final artifact id == the uploaded tarball's sha256, so
        # uploading the same bytes twice deduplicates.
        digest = hashlib.sha256()
        received = 0
        incoming_dir = self.incoming_root / f"{model}_{mode}"
        incoming_dir.mkdir(parents=True, exist_ok=True)
        # The random token keeps two uploads of the same job that start
        # within the same second from writing into one shared partial file.
        partial = incoming_dir / (
            f"{job_id}-{int(time.time())}-{secrets.token_hex(4)}"
            ".tar.zst.partial"
        )
        try:
            with partial.open("wb") as out:
                async for chunk in body:
                    received += len(chunk)
                    if received > max_bytes:
                        return StoreResult("too-large", None, received)
                    digest.update(chunk)
                    out.write(chunk)

            artifact_id = digest.hexdigest()
            if expected_sha256 and artifact_id != expected_sha256.lower():
                return StoreResult("sha-mismatch", artifact_id, received)

            final_dir = self.final_dir(model, mode, artifact_id)
            final = final_dir / "bundle.tar.zst"
            if final.exists():
                return StoreResult("already-present", artifact_id, received)

            final_dir.mkdir(parents=True, exist_ok=True)
            # meta.json goes first and the bundle rename is the commit point:
            # the "already-present" check keys on the bundle, so a failure
            # while writing meta must not leave a bundle behind that would
            # block every retry from ever repairing it.
            self._write_meta(final_dir, model=model, mode=mode,
                             job_id=job_id, worker=worker,
                             artifact_id=artifact_id, size_bytes=received)
            partial.replace(final)
            self._append_index({
                "received_at_wall": time.strftime("%Y-%m-%dT%H:%M:%SZ",
                                                   time.gmtime()),
                "job_id": job_id, "model": model, "mode": mode,
                "worker": worker, "artifact_id": artifact_id,
                "size_bytes": received,
            })
            return StoreResult("stored", artifact_id, received)
        finally:
            # Runs after the ``with`` has closed the file. After a successful
            # replace() the partial is already gone, hence missing_ok. This
            # also covers cancellation (BaseException) of the upload.
            partial.unlink(missing_ok=True)

    def _write_meta(self, final_dir: Path, **kwargs) -> None:
        """Write *kwargs* as indented JSON to ``final_dir/meta.json``."""
        (final_dir / "meta.json").write_text(
            json.dumps(kwargs, indent=2) + "\n", encoding="utf-8"
        )

    def _append_index(self, row: dict) -> None:
        """Append *row* as one key-sorted JSON line to the index file."""
        line = json.dumps(row, sort_keys=True) + "\n"
        with self.index_path.open("a", encoding="utf-8") as f:
            f.write(line)