From 3d4f282e9c75063d4b5a4777ddabfda56f0b94d3 Mon Sep 17 00:00:00 2001
From: Max Gorog <mgorog@gmail.com>
Date: Mon, 4 May 2026 01:55:37 -0500
Subject: [PATCH] Tier-2 episodes use clean-only schedule; .gitignore VERSION
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two correctness fixes that the §4.5 event-driven labeller surfaced:

1. tools/run_real_vm_demo.py was hardcoding a Tier-3-shaped schedule
   (clean → armed → infecting → infected_running → ...) for episodes
   with no exploit firing. Pre-§4.5 those episodes wrote dishonest
   `infected_running` labels from the schedule clock — exactly the §3
   evidence pattern. Post-§4.5 they write `failed` at the infecting
   transition (the justifying exploit_fire never arrives), which is
   honest about what happened but useless for training.

   The honest fix: Tier-2 episodes have a clean-only schedule. All
   telemetry is tagged `clean` because nothing infected anything. The
   total duration matches the canonical Tier-3 schedule so episode
   lengths are comparable across tiers — no length-bias in the
   dataset (§10).

   Helper `tier2_schedule_from(schedule)` in orchestrator/manifest.py
   derives `[("clean", total_seconds)]` from the canonical schedule.
   `tier3_schedule_from(schedule)` renders the legacy
   `[(name, seconds)]` shape EpisodeConfig still expects.

   Tier-2 demo (run_real_vm_demo.py) now calls tier2_schedule_from.
   Tier-3 demo (run_tier3_demo.py) now calls tier3_schedule_from.
   Drops the hardcoded DEFAULT_SCHEDULE constants from both — the
   canonical manifest is the single source of truth (§4.1).

2. .gitignore now excludes /VERSION. The install-lab-host.sh stamp
   writes /opt/cis490/VERSION so episodes can record code provenance
   without /opt/cis490 carrying a .git directory. But /opt/cis490 IS
   typically a git checkout on lab hosts (auto-update.sh pulls into
   it), so writing VERSION leaves the working tree dirty and every
   episode records meta.code_version.dirty=true. Rule 4 of the
   PIPELINE.md §4.6 acceptance gate would then reject every episode
   unless CIS490_ALLOW_DIRTY=1 is set — which would break the data
   flow.

   Now VERSION is .gitignored: install-lab-host.sh stamps it, git
   status doesn't see it, dirty=false, gate rule 4 passes naturally.

These two changes together keep the data flowing AND honest. Tier-2
episodes pass with `phases=[clean]` + every collector emitting real
rows. Tier-3 episodes (none today, empty catalog) walk the full
event-driven schedule when a verified module gets re-admitted.

286 tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitignore                |  6 ++++++
 orchestrator/manifest.py  | 27 +++++++++++++++++++++++++++
 tools/run_real_vm_demo.py | 27 +++++++++++++--------------
 tools/run_tier3_demo.py   | 28 ++++++++++++----------------
 4 files changed, 58 insertions(+), 30 deletions(-)

diff --git a/.gitignore b/.gitignore
index 11c9b47..8375b55 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,12 @@
 vm/images/
 vm/snapshots/
 
+# VERSION file is install-script-stamped (provenance for episodes
+# generated from /opt/cis490 install copies). Left un-ignored, it
+# would show up as a dirty working tree on lab hosts and get every
+# episode rejected at the §4.6 acceptance gate.
+/VERSION
+
 # Telemetry output
 data/episodes/
 data/campaign.json
diff --git a/orchestrator/manifest.py b/orchestrator/manifest.py
index 43748f5..1fe2d5c 100644
--- a/orchestrator/manifest.py
+++ b/orchestrator/manifest.py
@@ -27,6 +27,33 @@ from pathlib import Path
 
 CANONICAL_FILENAME = "manifest.toml"
 
+
+def tier3_schedule_from(schedule: "tuple[Phase, ...]") -> list[tuple[str, float]]:
+    """Render the canonical schedule as the legacy
+    `[(name, seconds)]` format EpisodeConfig.phase_schedule expects.
+
+    This IS the Tier-3 schedule: clean → armed → infecting →
+    infected_running → ... per the canonical manifest. Phase labels
+    are event-driven (PIPELINE.md §4.5) so durations are budgets, not
+    label sources."""
+    return [(p.name, p.seconds) for p in schedule]
+
+
+def tier2_schedule_from(schedule: "tuple[Phase, ...]") -> list[tuple[str, float]]:
+    """Tier-2 episodes have no exploit and no driver firing modules.
+    Walking the Tier-3 phase set on a Tier-2 episode produces dishonest
+    `infected_running` labels (PIPELINE.md §3 evidence — the original
+    sin) under clock-driven labelling, OR `failed` labels under
+    event-driven labelling (still useless for training).
+
+    Honest fix: Tier-2 episodes ride a single `clean` phase for the
+    same total wall-clock as the Tier-3 walk so episode lengths are
+    comparable across tiers (no length-bias in the dataset). Every
+    telemetry row on a Tier-2 episode is tagged `clean` because
+    nothing infected anything."""
+    total = sum(p.seconds for p in schedule)
+    return [("clean", float(total))]
+
 # Closed enums — keep in sync with the corresponding code that
 # implements each name. A name not in these sets means the manifest
 # is asking for something the orchestrator doesn't know how to do.
diff --git a/tools/run_real_vm_demo.py b/tools/run_real_vm_demo.py
index e1795ff..bea543a 100644
--- a/tools/run_real_vm_demo.py
+++ b/tools/run_real_vm_demo.py
@@ -29,23 +29,21 @@ sys.path.insert(0, str(Path(__file__).resolve().parent))
 
 from collectors import qmp  # noqa: E402
 from orchestrator.episode import EpisodeConfig, EpisodeRunner  # noqa: E402
-from orchestrator.manifest import ManifestError, load_canonical  # noqa: E402
+from orchestrator.manifest import (  # noqa: E402
+    ManifestError, load_canonical, tier2_schedule_from,
+)
 from samples.manifest import SampleManifest  # noqa: E402
 from vm_load_controller import VMLoadController  # noqa: E402
 from vm_serial import SerialClient  # noqa: E402
 
 
-# Same shape as run_envelope_demo so plots are comparable.
-DEFAULT_SCHEDULE = [
-    ("clean",            10.0),
-    ("armed",             2.0),
-    ("infecting",         3.0),
-    ("infected_running", 25.0),
-    ("dormant",          15.0),
-    ("infected_running", 20.0),
-    ("dormant",           5.0),
-    ("clean",             5.0),
-]
+# Tier-2 episodes have no exploit firing — their schedule is derived
+# from the canonical Tier-3 schedule's total duration (PIPELINE.md §4.1
+# canonical manifest, §4.5 event-driven labeller, §10 honest labels).
+# `tier2_schedule_from(experiment.schedule)` produces a single `clean`
+# phase for the same wall-clock as a Tier-3 walk; that keeps episode
+# lengths comparable across tiers without minting `infected_running`
+# labels for episodes where nothing infected anything.
 
 
 def _wait_for_socket(path: Path, timeout_s: float) -> None:
@@ -214,12 +212,13 @@ def main() -> int:
         controller.setup()
 
         agent_sock = run_dir / "agent.sock"
+        schedule = tier2_schedule_from(experiment.schedule)
         cfg = EpisodeConfig(
             target_pid=qemu_pid,
-            duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
+            duration_s=sum(d for _, d in schedule),
             interval_ms=args.interval_ms,
             data_root=Path(args.data_root),
-            phase_schedule=DEFAULT_SCHEDULE,
+            phase_schedule=schedule,
             image_name="alpine-3.21-cloudinit",
             snapshot_name="baseline-v1",
             qmp_socket=qmp_sock if qmp_sock.exists() else None,
diff --git a/tools/run_tier3_demo.py b/tools/run_tier3_demo.py
index 4a90af9..2e76078 100644
--- a/tools/run_tier3_demo.py
+++ b/tools/run_tier3_demo.py
@@ -38,23 +38,18 @@ from exploits.driver import DriverConfig, MSFExploitDriver  # noqa: E402
 from exploits.modules import load_module_config  # noqa: E402
 from exploits.msfrpc import MSFRpcClient, MSFRpcConfig  # noqa: E402
 from orchestrator.episode import EpisodeConfig, EpisodeRunner  # noqa: E402
-from orchestrator.manifest import ManifestError, load_canonical  # noqa: E402
+from orchestrator.manifest import (  # noqa: E402
+    ManifestError, load_canonical, tier3_schedule_from,
+)
 from samples.manifest import SampleManifest  # noqa: E402
 
 
-# Same envelope shape as Tier 2 so plots are comparable. Slightly more
-# armed/infecting time because real exploit fire + session establishment
-# takes hundreds of ms to a few seconds.
-DEFAULT_SCHEDULE = [
-    ("clean",            10.0),
-    ("armed",             3.0),
-    ("infecting",         5.0),
-    ("infected_running", 25.0),
-    ("dormant",          15.0),
-    ("infected_running", 20.0),
-    ("dormant",           5.0),
-    ("clean",             5.0),
-]
+# Tier-3 schedule comes from the canonical manifest at episode-launch
+# time. Phase durations are budgets for the §4.5 event-driven labeller
+# (clean/armed orchestrator-emitted; infecting/infected_running gated
+# on exploit_fire / session_open events). Per-call lookup so a manifest
+# amendment takes effect on the next episode without a service
+# restart.
 
 
 def _wait_for_path(path: Path, timeout_s: float) -> None:
@@ -304,12 +299,13 @@ def main() -> int:
         # configured but emits zero rows is exactly the silent-downgrade
         # pattern §1 forbids.
         agent_sock = run_dir / "agent.sock"
+        schedule = tier3_schedule_from(experiment.schedule)
         cfg = EpisodeConfig(
             target_pid=qemu_pid,
-            duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
+            duration_s=sum(d for _, d in schedule),
             interval_ms=args.interval_ms,
             data_root=Path(args.data_root),
-            phase_schedule=DEFAULT_SCHEDULE,
+            phase_schedule=schedule,
             image_name=module.name + "-target",
             snapshot_name="baseline-v1",
             qmp_socket=qmp_sock if qmp_sock.exists() else None,