Tier-2 episodes use clean-only schedule; .gitignore VERSION
Two correctness fixes that the §4.5 event-driven labeller surfaced:
1. tools/run_real_vm_demo.py was hardcoding a Tier-3-shaped schedule
(clean → armed → infecting → infected_running → ...) for episodes
with no exploit firing. Pre-§4.5 those episodes wrote dishonest
`infected_running` labels from the schedule clock — exactly the §3
evidence pattern. Post-§4.5 they write `failed` at the infecting
transition (the justifying exploit_fire never arrives), which is
honest about what happened but useless for training.
The honest fix: Tier-2 episodes have a clean-only schedule. All
telemetry tagged `clean` because nothing infected anything. The
total duration matches the canonical Tier-3 schedule so episode
lengths are comparable across tiers — no length-bias in the
dataset (§10).
Helper `tier2_schedule_from(schedule)` in orchestrator/manifest.py
derives `[("clean", total_seconds)]` from the canonical schedule.
`tier3_schedule_from(schedule)` renders the legacy
`[(name, seconds)]` shape EpisodeConfig still expects.
Tier-2 demo (run_real_vm_demo.py) now calls tier2_schedule_from.
Tier-3 demo (run_tier3_demo.py) now calls tier3_schedule_from.
Drops the hardcoded DEFAULT_SCHEDULE constants from both — the
canonical manifest is the single source of truth (§4.1).
2. .gitignore now excludes /VERSION. The install-lab-host.sh stamp
writes /opt/cis490/VERSION so episodes can record code provenance
without /opt/cis490 carrying a .git directory. But /opt/cis490 IS
typically a git checkout on lab hosts (auto-update.sh pulls into
it), so writing VERSION leaves the working tree dirty, and every
episode then records meta.code_version.dirty=true. PIPELINE.md §4.6 acceptance
gate's rule 4 would then reject every episode without
CIS490_ALLOW_DIRTY=1 set — which would break the data flow.
Now VERSION is .gitignored: install-lab-host.sh stamps it, git
status doesn't see it, dirty=false, gate rule 4 passes naturally.
These two changes together keep the data flowing AND honest. Tier-2
episodes pass with `phases=[clean]` + every collector emitting real
rows. Tier-3 episodes (none today — the catalog is empty) will walk the
full event-driven schedule once a verified module is re-admitted.
286 tests passing.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d9f913fc97
commit
3d4f282e9c
4 changed files with 58 additions and 30 deletions
6
.gitignore
vendored
6
.gitignore
vendored
|
|
@ -9,6 +9,12 @@
|
||||||
vm/images/
|
vm/images/
|
||||||
vm/snapshots/
|
vm/snapshots/
|
||||||
|
|
||||||
|
# VERSION file is install-script-stamped (provenance for episodes
|
||||||
|
# generated from /opt/cis490 install copies). Leaving it un-ignored would
|
||||||
|
# trigger spurious dirty-tree state on lab hosts and reject every
|
||||||
|
# episode at the §4.6 acceptance gate.
|
||||||
|
/VERSION
|
||||||
|
|
||||||
# Telemetry output
|
# Telemetry output
|
||||||
data/episodes/
|
data/episodes/
|
||||||
data/campaign.json
|
data/campaign.json
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,33 @@ from pathlib import Path
|
||||||
|
|
||||||
CANONICAL_FILENAME = "manifest.toml"
|
CANONICAL_FILENAME = "manifest.toml"
|
||||||
|
|
||||||
|
|
||||||
|
def tier3_schedule_from(schedule: "tuple[Phase, ...]") -> list[tuple[str, float]]:
    """Render the canonical schedule as the legacy `[(name, seconds)]`
    format EpisodeConfig.phase_schedule expects.

    This IS the Tier-3 schedule: clean → armed → infecting →
    infected_running → ... per the canonical manifest. Phase labels
    are event-driven (PIPELINE.md §4.5) so durations are budgets, not
    label sources.

    Args:
        schedule: Canonical phases in walk order; each item must expose
            `.name` and a numeric `.seconds`.

    Returns:
        One `(name, seconds)` pair per phase, in the same order. An
        empty schedule yields an empty list.
    """
    # Coerce to float so the result honours the annotated return type
    # even when a phase carries an integer duration; tier2_schedule_from
    # applies the same coercion to its total.
    return [(p.name, float(p.seconds)) for p in schedule]
|
||||||
|
|
||||||
|
|
||||||
|
def tier2_schedule_from(schedule: "tuple[Phase, ...]") -> list[tuple[str, float]]:
    """Collapse the canonical schedule into a single `clean` phase.

    Tier-2 episodes have no exploit and no driver firing modules, so
    walking the Tier-3 phase set on them yields either dishonest
    `infected_running` labels (clock-driven labelling, the PIPELINE.md
    §3 evidence pattern) or `failed` labels (event-driven labelling),
    neither of which is usable for training.

    Instead the episode rides one `clean` phase whose duration equals
    the summed duration of every phase in `schedule`, which keeps
    episode lengths comparable across tiers (no length-bias in the
    dataset).

    Args:
        schedule: Canonical phases; each item must expose a numeric
            `.seconds`.

    Returns:
        A one-element list `[("clean", total_seconds)]` with the total
        as a float. An empty schedule yields `[("clean", 0.0)]`.
    """
    wall_clock = 0
    for phase in schedule:
        wall_clock += phase.seconds
    return [("clean", float(wall_clock))]
|
||||||
|
|
||||||
# Closed enums — keep in sync with the corresponding code that
|
# Closed enums — keep in sync with the corresponding code that
|
||||||
# implements each name. A name not in these sets means the manifest
|
# implements each name. A name not in these sets means the manifest
|
||||||
# is asking for something the orchestrator doesn't know how to do.
|
# is asking for something the orchestrator doesn't know how to do.
|
||||||
|
|
|
||||||
|
|
@ -29,23 +29,21 @@ sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||||
|
|
||||||
from collectors import qmp # noqa: E402
|
from collectors import qmp # noqa: E402
|
||||||
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
|
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
|
||||||
from orchestrator.manifest import ManifestError, load_canonical # noqa: E402
|
from orchestrator.manifest import ( # noqa: E402
|
||||||
|
ManifestError, load_canonical, tier2_schedule_from,
|
||||||
|
)
|
||||||
from samples.manifest import SampleManifest # noqa: E402
|
from samples.manifest import SampleManifest # noqa: E402
|
||||||
from vm_load_controller import VMLoadController # noqa: E402
|
from vm_load_controller import VMLoadController # noqa: E402
|
||||||
from vm_serial import SerialClient # noqa: E402
|
from vm_serial import SerialClient # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
# Same shape as run_envelope_demo so plots are comparable.
|
# Tier-2 episodes have no exploit firing — their schedule is derived
|
||||||
DEFAULT_SCHEDULE = [
|
# from the canonical Tier-3 schedule total duration (PIPELINE.md §4.1
|
||||||
("clean", 10.0),
|
# canonical manifest, §4.5 event-driven labeller, §10 honest labels).
|
||||||
("armed", 2.0),
|
# `tier2_schedule_from(experiment.schedule)` produces a single `clean`
|
||||||
("infecting", 3.0),
|
# phase for the same wall-clock as a Tier-3 walk; that keeps episode
|
||||||
("infected_running", 25.0),
|
# lengths comparable across tiers without minting `infected_running`
|
||||||
("dormant", 15.0),
|
# labels for episodes where nothing infected anything.
|
||||||
("infected_running", 20.0),
|
|
||||||
("dormant", 5.0),
|
|
||||||
("clean", 5.0),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _wait_for_socket(path: Path, timeout_s: float) -> None:
|
def _wait_for_socket(path: Path, timeout_s: float) -> None:
|
||||||
|
|
@ -214,12 +212,13 @@ def main() -> int:
|
||||||
controller.setup()
|
controller.setup()
|
||||||
|
|
||||||
agent_sock = run_dir / "agent.sock"
|
agent_sock = run_dir / "agent.sock"
|
||||||
|
schedule = tier2_schedule_from(experiment.schedule)
|
||||||
cfg = EpisodeConfig(
|
cfg = EpisodeConfig(
|
||||||
target_pid=qemu_pid,
|
target_pid=qemu_pid,
|
||||||
duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
|
duration_s=sum(d for _, d in schedule),
|
||||||
interval_ms=args.interval_ms,
|
interval_ms=args.interval_ms,
|
||||||
data_root=Path(args.data_root),
|
data_root=Path(args.data_root),
|
||||||
phase_schedule=DEFAULT_SCHEDULE,
|
phase_schedule=schedule,
|
||||||
image_name="alpine-3.21-cloudinit",
|
image_name="alpine-3.21-cloudinit",
|
||||||
snapshot_name="baseline-v1",
|
snapshot_name="baseline-v1",
|
||||||
qmp_socket=qmp_sock if qmp_sock.exists() else None,
|
qmp_socket=qmp_sock if qmp_sock.exists() else None,
|
||||||
|
|
|
||||||
|
|
@ -38,23 +38,18 @@ from exploits.driver import DriverConfig, MSFExploitDriver # noqa: E402
|
||||||
from exploits.modules import load_module_config # noqa: E402
|
from exploits.modules import load_module_config # noqa: E402
|
||||||
from exploits.msfrpc import MSFRpcClient, MSFRpcConfig # noqa: E402
|
from exploits.msfrpc import MSFRpcClient, MSFRpcConfig # noqa: E402
|
||||||
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
|
from orchestrator.episode import EpisodeConfig, EpisodeRunner # noqa: E402
|
||||||
from orchestrator.manifest import ManifestError, load_canonical # noqa: E402
|
from orchestrator.manifest import ( # noqa: E402
|
||||||
|
ManifestError, load_canonical, tier3_schedule_from,
|
||||||
|
)
|
||||||
from samples.manifest import SampleManifest # noqa: E402
|
from samples.manifest import SampleManifest # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
# Same envelope shape as Tier 2 so plots are comparable. Slightly more
|
# Tier-3 schedule comes from the canonical manifest at episode-launch
|
||||||
# armed/infecting time because real exploit fire + session establishment
|
# time. Phase durations are budgets for the §4.5 event-driven labeller
|
||||||
# takes hundreds of ms to a few seconds.
|
# (clean/armed orchestrator-emitted; infecting/infected_running gated
|
||||||
DEFAULT_SCHEDULE = [
|
# on exploit_fire / session_open events). Per-call lookup so a manifest
|
||||||
("clean", 10.0),
|
# amendment takes effect on the next episode without a service
|
||||||
("armed", 3.0),
|
# restart.
|
||||||
("infecting", 5.0),
|
|
||||||
("infected_running", 25.0),
|
|
||||||
("dormant", 15.0),
|
|
||||||
("infected_running", 20.0),
|
|
||||||
("dormant", 5.0),
|
|
||||||
("clean", 5.0),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _wait_for_path(path: Path, timeout_s: float) -> None:
|
def _wait_for_path(path: Path, timeout_s: float) -> None:
|
||||||
|
|
@ -304,12 +299,13 @@ def main() -> int:
|
||||||
# configured but emits zero rows is exactly the silent-downgrade
|
# configured but emits zero rows is exactly the silent-downgrade
|
||||||
# pattern §1 forbids.
|
# pattern §1 forbids.
|
||||||
agent_sock = run_dir / "agent.sock"
|
agent_sock = run_dir / "agent.sock"
|
||||||
|
schedule = tier3_schedule_from(experiment.schedule)
|
||||||
cfg = EpisodeConfig(
|
cfg = EpisodeConfig(
|
||||||
target_pid=qemu_pid,
|
target_pid=qemu_pid,
|
||||||
duration_s=sum(d for _, d in DEFAULT_SCHEDULE),
|
duration_s=sum(d for _, d in schedule),
|
||||||
interval_ms=args.interval_ms,
|
interval_ms=args.interval_ms,
|
||||||
data_root=Path(args.data_root),
|
data_root=Path(args.data_root),
|
||||||
phase_schedule=DEFAULT_SCHEDULE,
|
phase_schedule=schedule,
|
||||||
image_name=module.name + "-target",
|
image_name=module.name + "-target",
|
||||||
snapshot_name="baseline-v1",
|
snapshot_name="baseline-v1",
|
||||||
qmp_socket=qmp_sock if qmp_sock.exists() else None,
|
qmp_socket=qmp_sock if qmp_sock.exists() else None,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue