# CIS490/etc/cis490-trainer-worker.service

[Unit]
Description=CIS490 trainer worker (claims jobs, runs trainings, ships artifacts)
Documentation=https://maxgit.wg/spectral/CIS490

# Pull in network-online.target and order this unit after it.
Wants=network-online.target
After=network-online.target

[Service]
# Runs the fleet training worker in the foreground as the unprivileged
# cis490 account.
Type=simple
User=cis490
Group=cis490

# Optional environment file: the leading "-" means a missing file is not an
# error. NOTE(review): CIS490_TRAINER_RECEIVER_URL is expected to come from
# this file; if it is unset, ${...} below expands to an empty argument.
EnvironmentFile=-/etc/cis490/trainer-worker.env

# CIS490_TRAINER_RECEIVER_URL — set in trainer-worker.env
# FLEET_HOST_ID — override the hostname-derived host_id (optional)

# The worker starts in this directory, so the relative --artifacts-dir and
# --reports-dir arguments below resolve under it (assuming the worker does
# not change directory itself).
WorkingDirectory=/opt/cis490

ExecStart=/opt/cis490/.venv/bin/python -m training.fleet.worker \
    --receiver-url ${CIS490_TRAINER_RECEIVER_URL} \
    --validation /opt/cis490/data/processed/validation_v1.parquet \
    --summary /opt/cis490/data/processed/features_window_v1.parquet \
    --tensors /opt/cis490/data/processed/tensor_window_v1 \
    --artifacts-dir artifacts \
    --reports-dir reports/eval

# Restart only when the worker process itself exits uncleanly, waiting 15s
# between attempts. Workers do compute-heavy training; a single failed job
# is left to the daemon's own loop to handle.
Restart=on-failure
RestartSec=15s

# On stop, allow up to 120s for the worker to exit before systemd escalates
# to a forced kill.
TimeoutStopSec=120s

# Sandboxing. With ProtectSystem=strict the file system is read-only for this
# service except for the paths listed in ReadWritePaths. Every listed path
# must exist when the unit starts (none is marked optional with "-").
ProtectSystem=strict
ProtectHome=true
# Need the shared /tmp for trainer scratch, so no private /tmp namespace.
# (systemd has no trailing-comment syntax; keep this note on its own line.)
PrivateTmp=false
NoNewPrivileges=true
ReadWritePaths=/opt/cis490 /var/lib/cis490 /tmp

[Install]
# Enabling this unit attaches it to multi-user.target.
WantedBy=multi-user.target