# systemd unit for the CIS490 trainer worker.
# Runs `python -m training.fleet.worker` from the project virtualenv as the
# unprivileged cis490 user, with /opt/cis490 as the working directory.
#
# NOTE: systemd only recognises comments that start a line ('#' or ';').
# Never append a comment after a value; it becomes part of the value.

[Unit]
Description=CIS490 trainer worker (claims jobs, runs trainings, ships artifacts)
Documentation=https://maxgit.wg/spectral/CIS490
# Start only once the network is reported as online.
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=cis490
Group=cis490

# The leading '-' makes the environment file optional: a missing file is
# not treated as an error.
EnvironmentFile=-/etc/cis490/trainer-worker.env
# CIS490_TRAINER_RECEIVER_URL — set in trainer-worker.env
# FLEET_HOST_ID — override the hostname-derived host_id (optional)

# --artifacts-dir and --reports-dir are relative paths; they resolve against
# WorkingDirectory below.
ExecStart=/opt/cis490/.venv/bin/python -m training.fleet.worker \
    --receiver-url ${CIS490_TRAINER_RECEIVER_URL} \
    --validation /opt/cis490/data/processed/validation_v1.parquet \
    --summary /opt/cis490/data/processed/features_window_v1.parquet \
    --tensors /opt/cis490/data/processed/tensor_window_v1 \
    --artifacts-dir artifacts \
    --reports-dir reports/eval
WorkingDirectory=/opt/cis490

Restart=on-failure
RestartSec=15s
# Workers do compute-heavy training. Don't kill them just because a single
# job failed; let the daemon's own loop handle that.
# Give the worker up to 120s to stop before systemd forcibly terminates it.
TimeoutStopSec=120s

# Sandboxing. With ProtectSystem=strict the file system is read-only for this
# service except for the paths listed in ReadWritePaths.
ProtectSystem=strict
ProtectHome=true
# need /tmp for trainer scratch
PrivateTmp=false
NoNewPrivileges=true
ReadWritePaths=/opt/cis490 /var/lib/cis490 /tmp

[Install]
WantedBy=multi-user.target