diff --git a/Infoductor/Comonad.lean b/Infoductor/Comonad.lean new file mode 100644 index 0000000..7d99e11 --- /dev/null +++ b/Infoductor/Comonad.lean @@ -0,0 +1,28 @@ +/- + Infoductor.Comonad — comonadic proof-pattern detection / extraction. + + Imports every Comonad sub-module. Downstream consumers can either + `import Infoductor.Comonad` for the whole bundle, or pick + individual modules. + + This sub-library uses Mathlib's `Tactic.Explode` to decompose + proof terms into a step-by-step IR. Pulling Mathlib is the + cost; the algorithms over the IR (FNV content hashing, shape + detection, cluster extraction, convolution) are pure Lean and + consumer-agnostic. + + Originally ported from `mm-link/mm-lean/src/{ComonadFinder, + ComonadCommands, Convolution, ExtractConsts, ExtractDefn, + GridView}.lean` (2026-05-01). Mathematica-specific output + formatters were dropped from the GridView port; the plain-text + table / tree / view_info formatters remain here. Mathematica + consumers can re-add their formatters in `mm-lean` (or a + downstream Mathematica-bridge project). +-/ + +import Infoductor.Comonad.GridView +import Infoductor.Comonad.ExtractConsts +import Infoductor.Comonad.ComonadFinder +import Infoductor.Comonad.ComonadCommands +import Infoductor.Comonad.ExtractDefn +import Infoductor.Comonad.Convolution diff --git a/Infoductor/Comonad/ComonadCommands.lean b/Infoductor/Comonad/ComonadCommands.lean new file mode 100644 index 0000000..4e88087 --- /dev/null +++ b/Infoductor/Comonad/ComonadCommands.lean @@ -0,0 +1,122 @@ +/- + ComonadCommands.lean — Pantograph API handlers for proof navigation. + + Provides the following commands: + #comonadNode - Get single node details (node/1 schema) + #comonadSubgraph - Get rooted subgraph (subgraph/1 schema) + #comonadClusters - Get cluster membership for a node (clusters/1 schema) + + These commands are designed for the Pantograph REPL interface, + emitting JSON that can be consumed by any client (Mathematica, Python, etc.). +-/ + +import Infoductor.Comonad.ComonadFinder + +open Lean Meta Elab Command + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 1 JSON serialization for node/1 schema +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Serialize a ProofNode with cluster membership to node/1 JSON. -/ +private def nodeToNodeJson (n : ProofNode) (clusters : Array Nat) : Lean.Json := + .mkObj [ ("schema", .str "node/1") + , ("contentId", .str n.contentId) + , ("label", .str n.label) + , ("status", .str n.status) + , ("shapeKey", .str n.shapeKey) + , ("depIds", .arr (n.depIds.map .str)) + , ("clusters", .arr (clusters.map fun id => .num ⟨Int.ofNat id, 0⟩)) ] + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 2 JSON serialization for subgraph/1 schema +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Serialize an edge to JSON. -/ +private def edgeToJson (edge : ContentHash × ContentHash) : Lean.Json := + .mkObj [ ("from", .str edge.1) + , ("to", .str edge.2) ] + +/-- Serialize a subgraph to subgraph/1 JSON. -/ +private def subgraphToJson + (rootId : ContentHash) + (nodes : Array ProofNode) + (edges : Array (ContentHash × ContentHash)) + (graph : ProofGraph) : Lean.Json := + let nodeJsons := nodes.map fun n => + let clusters := findClustersContaining graph n.contentId + nodeToNodeJson n clusters + .mkObj [ ("schema", .str "subgraph/1") + , ("rootId", .str rootId) + , ("nodes", .arr nodeJsons) + , ("edges", .arr (edges.map edgeToJson)) ] + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 3 JSON serialization for clusters/1 schema +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Serialize cluster membership info to clusters/1 JSON. -/ +private def clustersToJson + (nodeId : ContentHash) + (clusterIds : Array Nat) + (graph : ProofGraph) : Lean.Json := + let clusterDetails := clusterIds.filterMap fun cid => + graph.clusters.find? (·.id == cid) |>.map fun c => + .mkObj [ ("id", .num ⟨c.id, 0⟩) + , ("shapeKey", .str c.shapeKey) + , ("size", .num ⟨c.size, 0⟩) + , ("extractable", .bool c.extractable) + , ("instanceCount", .num ⟨c.instances.size, 0⟩) ] + .mkObj [ ("schema", .str "clusters/1") + , ("nodeId", .str nodeId) + , ("clusterIds", .arr (clusterIds.map fun id => .num ⟨Int.ofNat id, 0⟩)) + , ("clusters", .arr clusterDetails) ] + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 4 Commands +-- ════════════════════════════════════════════════════════════════════════════ + +/-- `#comonadNode "nodeId" from theoremName` + + Get details of a single node in the proof graph. + Emits node/1 JSON with cluster membership. -/ +elab "#comonadNode " nodeId:str " from " thm:ident : command => do + let name := thm.getId + let nid := nodeId.getString + let graph ← liftTermElabM (findComonadsCore name) + match getNodeInfo graph nid with + | none => logError m!"Node '{nid}' not found in proof of {name}" + | some n => + let clusters := findClustersContaining graph nid + logInfo m!"{(nodeToNodeJson n clusters).compress}" + +/-- `#comonadSubgraph "rootId" from theoremName` + + Get the subgraph rooted at a specific node. + Emits subgraph/1 JSON with nodes and edges. -/ +elab "#comonadSubgraph " rootId:str " from " thm:ident : command => do + let name := thm.getId + let rid := rootId.getString + let graph ← liftTermElabM (findComonadsCore name) + match getNodeInfo graph rid with + | none => logError m!"Root node '{rid}' not found in proof of {name}" + | some _ => + let nodes := getSubgraph graph rid + let edges := getSubgraphEdges graph rid + logInfo m!"{(subgraphToJson rid nodes edges graph).compress}" + +/-- `#comonadClusters "nodeId" from theoremName` + + Find all clusters that contain a given node. + Emits clusters/1 JSON with cluster details. -/ +elab "#comonadClusters " nodeId:str " from " thm:ident : command => do + let name := thm.getId + let nid := nodeId.getString + let graph ← liftTermElabM (findComonadsCore name) + let clusterIds := findClustersContaining graph nid + logInfo m!"{(clustersToJson nid clusterIds graph).compress}" + +-- (Test section dropped on port — the original mm-lean tests +-- referenced theorems defined in ComonadFinder's Test section +-- that we also dropped. Restore alongside a Test/ harness that +-- handles the Lean 4 v4.30 theorem-value-access change.) diff --git a/Infoductor/Comonad/ComonadFinder.lean b/Infoductor/Comonad/ComonadFinder.lean new file mode 100644 index 0000000..0e51f5e --- /dev/null +++ b/Infoductor/Comonad/ComonadFinder.lean @@ -0,0 +1,757 @@ +/- + ComonadFinder.lean — automatic detection of comonadic subgraph patterns + in Lean 4 proof terms. + + Wire format : comonad/1 + Hash regime : ppExpr/1 (bump field to "exprAST/1" when upgraded) + + ══════════════════════════════════════════════════════════ + THEORY + ══════════════════════════════════════════════════════════ + + A subgraph S ⊆ ProofGraph has comonadic structure iff: + + · extract(S) is valid: + S is closed under deps — no node in S has an essential + dependency outside S. This means S is a complete proof + by itself. The comonad's `extract` returns this proof. + + · duplicate(S) ≅ S: + S's structural shape recurs elsewhere in the same proof. + The comonad's `duplicate` witnesses this: the "proof of + proofs" contains S as a recognisable sub-object. + + Such subgraphs are candidates for named definitions. + The system has found that the mathematician is "doing the same + thing twice" — the shape is a lemma waiting to be extracted. + + ══════════════════════════════════════════════════════════ + IDENTITY MODEL (ppExpr/1) + ══════════════════════════════════════════════════════════ + + The original design used Explode line numbers (session-local Nat) + as node identities. These are unstable: the same proof loaded in two + Pantograph sessions will assign different line numbers to the same + sub-term. This made cross-session comparison and caching impossible. + + The new design uses ContentHash — a 16-char hex FNV-1a-64 digest of + the node's `entry.thm` pretty-print. Two nodes with the same hash + are the same logical judgment. Edges become ContentHash → ContentHash. + The full graph fingerprint (`graphId`) is a hash of the theorem name + concatenated with all per-node hashes in IR traversal order. + + ══════════════════════════════════════════════════════════ + ALGORITHM + ══════════════════════════════════════════════════════════ + + 0. hashPPExpr: FNV-1a-64 of entry.thm pretty-print → 16-char hex. + Stable across Pantograph sessions for the same closed sub-term. + + 1. buildIR: three passes over Explode Entries → Array ProofNode. + Pass 1 — lineNo → ContentHash (resolve all identities first) + Pass 2 — build ProofNode with placeholder shapeKey, deduplicate + by contentId (same judgment reachable from two parents + is stored once) + Pass 3 — recompute shapeKey recursively now that the full + nodeMap is available + + Each ProofNode carries: contentId, depIds (content-addressed edges), + shapeKey (recursive topology string), label, status. + This is the serialisable atom — no further MetaM access required. + + 2. findClusters: group nodes by shapeKey. + Groups with ≥ 2 occurrences → SimilarityCluster. + For each cluster, check the comonad closure law: + extractable iff shapeKey ≠ "·" AND ≥ 2 instances are closed + (every dep of every reachable node is itself reachable within + the same instance, or is an external constant not in the IR). + Slots: position 0 = instance root, 1..n = direct deps. + `instances[i].slots[pos]?` gives the concrete node at topology + position `pos` in occurrence `i` — the basis for cross-instance + combinatoric queries. + + 3. computeMetrics: pure computation from IR + clusters. + Roots (nodes with no in-edges) drive maxDepth via DFS. + selfSimilarity = nodes covered by extractable clusters / total, + capped at 1.0 to handle shared sub-nodes. + + 4. JSON serialisation: proofGraphToJson → comonad/1 wire format. + `provenanceId` and `session` are always null from Lean; + the external language (Mathematica / Pantograph client) fills + them in after attaching the result to a proof obligation. + + ══════════════════════════════════════════════════════════ + DATA FLOW + ══════════════════════════════════════════════════════════ + + Explode Entries + │ + ▼ buildIR (MetaM — three passes) + Array ProofNode ◀── contentId = hashPPExpr(entry.thm) + │ depIds = content-addressed edges + │ shapeKey = recursive topology string + │ + ┌────┴──────────────────┐ + ▼ ▼ + irToAdjMap findClusters (pure) + (ContentHash │ + → ProofNode) reachableFrom ←── replaces old subtreeOf/NodeSet + isClosedUnderDeps ←── comonad extract condition + │ + SimilarityCluster + (shapeKey, size, extractable, instances[].slots) + │ + computeMetrics ──▶ MetricVector + │ + computeGraphId ──▶ ContentHash + │ + proofGraphToJson ──▶ comonad/1 JSON + │ + #findComonads (human summary, InfoView) + #findComonadsJSON (compressed JSON, Pantograph) + + ══════════════════════════════════════════════════════════ + SHAPE ENCODING + ══════════════════════════════════════════════════════════ + + The old design used a `Shape` inductive with treeSize / treeDepth. + The new design encodes topology directly as a recursive String: + + "·" — leaf (no deps, or dep absent from IR) + "(S₁S₂…)" — node whose deps have sub-shapes S₁ … Sₙ + + Examples: + bare leaf → "·" (size 1, depth 0) + root → leaf → "(·)" (size 2, depth 1) + root → mid → leaf → "((·))" (size 3, depth 2) + + String equality replaces structural BEq; grouping by shapeKey in a + HashMap replaces the old O(N²) pairwise isomorphism check. + + ══════════════════════════════════════════════════════════ + METRICS + ══════════════════════════════════════════════════════════ + + Complexity — nodeCount, maxDepth, leafRatio + Entropy — clusterCount relative to nodeCount + Self-similarity — covered_nodes / total_nodes ∈ [0,1] + (capped at 1.0; shared sub-nodes are counted once) +-/ + +import Mathlib.Tactic.Explode +import Mathlib.Tactic.Explode.Datatypes +-- (GridView import dropped — ComonadFinder doesn't actually reference +-- any GridView symbol. Downstream consumers that want visualization +-- import `Infoductor.Comonad.GridView` directly.) + +open Lean Meta Elab Command Mathlib.Explode + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 1 ContentHash — stable, cross-session node identity +-- ════════════════════════════════════════════════════════════════════════════ + +/-- 16-character lowercase hex string. + FNV-1a-64 of the node's `entry.thm` pretty-print. + Stable across Pantograph sessions for the same closed sub-term. + + Two ProofNodes with the same ContentHash represent the same logical + judgment and may be collapsed without loss of information. -/ +abbrev ContentHash := String + +/-- Pure FNV-1a-64 over UTF-8 bytes → exactly 16 lowercase hex chars. + No IO, no MetaM — safe to call from anywhere. + + Parameters (64-bit): + offset_basis = 14695981039346656037 + prime = 1099511628211 + + `go` builds the digit list MSB-first by prepending each new digit: + go 0xff [] → go 0xf ['f'] → go 0 ['f','f'] → "ff" ✓ -/ +def hashPPExpr (s : String) : ContentHash := + let hash : UInt64 := + s.toUTF8.toList.foldl + (fun h b => (h ^^^ b.toUInt64) * 1099511628211) + 14695981039346656037 + let n := hash.toNat + let hexChar := fun d => + if d < 10 then Char.ofNat (d + '0'.toNat) + else Char.ofNat (d - 10 + 'a'.toNat) + let rec go : Nat → List Char → Nat → List Char + | _, acc, 0 => acc + | 0, acc, _ => acc + | n, acc, k + 1 => go (n / 16) (hexChar (n % 16) :: acc) k + let raw := if n = 0 then ['0'] else go n [] 17 -- UInt64 ≤ 16 hex digits + let padded := List.replicate (16 - raw.length) '0' ++ raw + String.ofList padded + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 2 IR types (comonad/1 schema) +-- ════════════════════════════════════════════════════════════════════════════ + +/-- One judgment node in the proof DAG. + + `contentId` is the stable cross-session identity (ppExpr/1 hash). + `depIds` are content-addressed outgoing edges. + `shapeKey` encodes recursive topology: "·" | "(·)" | "((·))" etc. + `label` is the pretty-printed sub-term from entry.thm. + `status` is "hyp" | "lam" | "app". + + An `Array ProofNode` is a complete serialisable IR; no further + MetaM access is required after buildIR returns. -/ +structure ProofNode where + contentId : ContentHash + shapeKey : String + label : String + status : String + depIds : Array ContentHash + deriving Repr, BEq, Hashable, Inhabited + +/-- One node at a fixed topology position within a cluster instance. + + `pos` follows the slot ordering: 0 = instance root, 1..n = direct deps. + Cross-instance slice at position `pos`: + instances.filterMap (·.slots[pos]?) — O(instances.size). -/ +structure Slot where + pos : Nat + contentId : ContentHash + label : String + status : String + deriving Repr + +/-- One concrete occurrence of a comonadic pattern. + + `root` — ContentHash of the sub-root for this occurrence. + `slots` — [root-slot, dep-slot₀, dep-slot₁, …] in position order. + Query `slots[pos]?` for the concrete node at topology + position `pos` without traversal. -/ +structure ClusterInstance where + root : ContentHash + slots : Array Slot + deriving Repr + +/-- A family of structurally isomorphic subgraphs. + + `shapeKey` — the shared recursive topology string. + `size` — reachable IR-node count per instance (identical across + instances because they share the same topology). + `extractable` — true iff shapeKey ≠ "·" AND ≥ 2 instances satisfy the + comonad closure law (extract validity condition). + `instances` — one entry per occurrence in the proof graph. -/ +structure SimilarityCluster where + id : Nat + shapeKey : String + size : Nat + extractable : Bool + instances : Array ClusterInstance + deriving Repr + +/-- Three-axis metric vector, computed purely from the proof term. -/ +structure MetricVector where + nodeCount : Nat + maxDepth : Nat -- DFS from proof roots, fuel-bounded + leafRatio : Float -- leaves (no deps) / total nodes + clusterCount : Nat + extractable : Nat -- clusters satisfying the comonad law + selfSimilarity : Float -- covered_nodes / total_nodes ∈ [0,1] + deriving Repr + +/-- Top-level comonad/1 envelope — the unit of exchange with Pantograph. + + `schema` = "comonad/1" (versioned wire format tag) + `hashAlgo` = "ppExpr/1" (bump to "exprAST/1" when upgraded) + `theoremName` = the fully-qualified Lean name passed to the command. + `graphId` = FNV-1a-64 of (theoremName ++ IR fingerprint), + stable across Pantograph sessions. + `provenanceId` = null here; the external language fills it in. + `session` = null here; the external language fills it in. -/ +structure ProofGraph where + schema : String + hashAlgo : String + theoremName : String -- renamed: `theorem` is a reserved keyword in Lean 4 + graphId : ContentHash + metrics : MetricVector + ir : Array ProofNode + clusters : Array SimilarityCluster + deriving Repr + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 3 Lookup helpers +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Build a `ContentHash → ProofNode` map from the IR array. + + Used throughout the pipeline wherever a node must be looked up by + identity rather than traversed in order. -/ +def irToAdjMap (nodes : Array ProofNode) : Std.HashMap ContentHash ProofNode := + nodes.foldl (fun m n => m.insert n.contentId n) {} + +/-- Alias kept for call-site clarity when the intent is "look up a node". -/ +abbrev buildNodeMap := irToAdjMap + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 4 Recursive shape computation +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Compute the recursive topology string for the subgraph rooted at `cid`. + + Encoding: + "·" — leaf (no deps, or dep not in IR / external constant) + "(S₁S₂…)" — node with n deps whose sub-shapes are S₁ … Sₙ + + Examples matching verified output: + bare leaf → "·" (trivial; filtered from clusters) + root → leaf → "(·)" size-2 cluster topology + root → mid → leaf → "((·))" size-3 cluster topology + + `fuel` caps recursion at 64 levels — sufficient for all realistic + proofs and prevents looping on pathological (cyclic) inputs. + + String equality is used for grouping: equal strings ↔ isomorphic + topologies, without an explicit BEq on an inductive Shape type. -/ +def computeShape + (nodeMap : Std.HashMap ContentHash ProofNode) + (cid : ContentHash) + (fuel : Nat := 64) : String := + match fuel with + | 0 => "·" + | k + 1 => + match nodeMap.get? cid with + | none => "·" + | some n => + if n.depIds.isEmpty then "·" + else "(" ++ String.join (n.depIds.toList.map fun d => computeShape nodeMap d k) ++ ")" + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 5 Comonad closure law +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Collect all IR nodes reachable from `root` by following `depIds` (DFS). + + Nodes absent from `nodeMap` (external constants, universe levels) are + silently skipped — they are not part of the IR and do not participate + in the closure check. + + Replaces the old `subtreeOf` which operated on `NodeId`/`NodeSet`; + here identities are ContentHashes and the visited set is a HashSet. -/ +private def reachableFrom + (nodeMap : Std.HashMap ContentHash ProofNode) + (root : ContentHash) + (fuel : Nat := 2000) : Std.HashSet ContentHash := + let rec go (cid : ContentHash) (acc : Std.HashSet ContentHash) : Nat → Std.HashSet ContentHash + | 0 => acc + | k + 1 => + if acc.contains cid then acc + else + match nodeMap.get? cid with + | none => acc + | some n => n.depIds.foldl (fun a d => go d a k) (acc.insert cid) + go root {} fuel + +/-- Comonad extract validity: the subgraph `S` can stand alone as a + complete proof and may be lifted to a named Lean definition iff + ∀ n ∈ S, ∀ d ∈ deps(n), d ∈ S ∨ d ∉ IR. + + Deps absent from `nodeMap` are external constants (Nat.add, etc.) + and are not violations — they are always available globally. + + This is the `extract` condition from the comonadic theory: + `extract : W A → A` requires the focussed value to be self-contained. -/ +private def isClosedUnderDeps + (nodeMap : Std.HashMap ContentHash ProofNode) + (subgraph : Std.HashSet ContentHash) : Bool := + subgraph.toList.all fun cid => + match nodeMap.get? cid with + | none => true + | some n => + n.depIds.all fun d => + subgraph.contains d || !nodeMap.contains d + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 6 Build IR from Explode entries (three passes) +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Convert raw Explode `Entries` to a content-addressed `Array ProofNode`. + + Actual `Mathlib.Explode.Entry` fields consumed: + entry.thm : MessageData — pretty-print of the sub-term + entry.line : Option Nat — 1-based line number (session-local) + entry.deps : Array (Option Nat) — dep line numbers (session-local) + entry.status : Status — .sintro | .intro | .cintro | .lam | .reg + + Three-pass design — necessary because shapeKey computation requires the + full nodeMap, which only exists after all nodes have been inserted: + + Pass 1 — lineNo → ContentHash + Resolve all node identities before building any edges. + This guarantees that depIds in pass 2 refer to hashes + that will definitely appear as contentIds in the IR. + + Pass 2 — build ProofNode with placeholder shapeKey "·", + deduplicate by contentId. + The same sub-term may be reachable from multiple parents + in the Explode output; we store it once (first occurrence wins). + + Pass 3 — replace placeholder shapeKeys with recursively computed + topology strings now that the complete nodeMap is available. -/ +def buildIR (entries : Entries) : MetaM (Array ProofNode) := do + let entryList := entries.l -- REVERSED: root at index 0, leaves at end + + -- ── Pass 1: lineNo → ContentHash ───────────────────────────────────────── + let mut lineToHash : Std.HashMap Nat ContentHash := {} + for entry in entryList do + let thmStr := (← entry.thm.format).pretty -- MetaM pretty-print; toString would fail + lineToHash := lineToHash.insert + (entry.line.getD 0) + (hashPPExpr thmStr) + + -- ── Pass 2: build nodes, deduplicate by contentId ──────────────────────── + let mut seen : Std.HashSet ContentHash := {} + let mut nodes : Array ProofNode := #[] + + for entry in entryList do + let thmStr := (← entry.thm.format).pretty + let cid := hashPPExpr thmStr + if seen.contains cid then continue -- same sub-term reached from another parent + + let depIds : Array ContentHash := + (entry.deps.filterMap (fun optLine => + optLine.bind (fun ln => lineToHash.get? ln))).toArray -- filterMap on List → List; .toArray needed + + let status : String := match entry.status with + | .sintro | .intro | .cintro => "hyp" + | .lam => "lam" + | .reg => "app" + + nodes := nodes.push + { contentId := cid + shapeKey := "·" -- placeholder; overwritten in pass 3 + label := thmStr + status := status + depIds := depIds } + seen := seen.insert cid + + -- ── Pass 3: recompute shape keys with full recursive structure ──────────── + let nodeMap := irToAdjMap nodes + return nodes.map fun n => + { n with shapeKey := computeShape nodeMap n.contentId } + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 7 Comonadic cluster detection +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Find all comonadic subgraph patterns in the IR. + + Algorithm: + 1. Group nodes by shapeKey (O(N) HashMap insert per node). + Any shape recurring ≥ 2 times is a candidate pattern. + + 2. For each candidate group, compute: + · subgraphSize — reachable IR-node count from the first instance. + All instances share identical topology ⟹ same count. + · extractable — shapeKey ≠ "·" (non-trivial pattern) + AND closedCount ≥ 2 (comonad law on ≥ 2 instances). + `closedCount` = number of instances whose reachable subgraph + satisfies `isClosedUnderDeps` — the `extract` validity condition. + + 3. Build ClusterInstances with slot arrays: + pos 0 = root slot, pos 1..n = direct dep slots. + Labels and statuses are resolved from nodeMap in O(1). + + The `duplicate` condition from the comonadic theory is satisfied by + construction: a cluster with k instances witnesses k "copies" of the + same structural sub-object within the proof — `duplicate : W A → W (W A)` + applied at the pattern level. -/ +def findClusters + (nodes : Array ProofNode) + (nodeMap : Std.HashMap ContentHash ProofNode) : Array SimilarityCluster := Id.run do + -- Step 1: group by topology fingerprint + let mut groups : Std.HashMap String (Array ProofNode) := {} + for n in nodes do + groups := groups.insert n.shapeKey (groups.getD n.shapeKey #[] |>.push n) + + let mut clusters : Array SimilarityCluster := #[] + let mut nextId : Nat := 0 + + for (shapeKey, grpNodes) in groups.toList do + if grpNodes.size < 2 then continue + + -- Step 2: subgraph size and closure check + let subgraphSize := + (reachableFrom nodeMap grpNodes[0]!.contentId).toList.length + + let closedCount : Nat := + grpNodes.filter (fun n => + isClosedUnderDeps nodeMap (reachableFrom nodeMap n.contentId)) |>.size + + -- Step 3: build instances with position-indexed slot arrays + let instances : Array ClusterInstance := grpNodes.map fun n => + let rootSlot : Slot := + { pos := 0, contentId := n.contentId, label := n.label, status := n.status } + let depSlots : Array Slot := + n.depIds.mapIdx fun i depCid => + { pos := i + 1 + contentId := depCid + label := (nodeMap.get? depCid).map (·.label) |>.getD "" + status := (nodeMap.get? depCid).map (·.status) |>.getD "dep" } + { root := n.contentId, slots := #[rootSlot] ++ depSlots } + + clusters := clusters.push + { id := nextId + shapeKey := shapeKey + size := subgraphSize + extractable := shapeKey != "·" && closedCount >= 2 + instances := instances } + nextId := nextId + 1 + + return clusters + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 8 Metric computation (pure) +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Nodes with no in-edges — the entry points of the proof graph. + Used as DFS roots for maxDepth computation. -/ +private def findRoots (nodes : Array ProofNode) : Array ProofNode := + let allDeps : Std.HashSet ContentHash := + nodes.foldl (fun s n => n.depIds.foldl (·.insert) s) {} + nodes.filter fun n => !allDeps.contains n.contentId + +/-- DFS max-depth from a single root, fuel-bounded to 2000. + Returns 0 for leaves; 1 + max(children) otherwise. -/ +private def maxDepthFrom + (nodeMap : Std.HashMap ContentHash ProofNode) + (root : ContentHash) + (fuel : Nat := 2000) : Nat := + let rec go : ContentHash → Nat → Nat + | _, 0 => 0 + | cid, k + 1 => + match nodeMap.get? cid with + | none => 0 + | some n => + if n.depIds.isEmpty then 0 + else 1 + n.depIds.foldl (fun acc d => max acc (go d k)) 0 + go root fuel + +/-- Compute the metric vector from the prebuilt IR and clusters. + + selfSimilarity is capped at 1.0 because instances may share sub-nodes: + summing (instances.size × size) can exceed nodeCount when the same + sub-term appears as a dep of multiple cluster roots. -/ +def computeMetrics + (nodes : Array ProofNode) + (clusters : Array SimilarityCluster) : MetricVector := + let nodeCount := nodes.size + let leafRatio := + if nodeCount = 0 then 0.0 + else (nodes.filter (·.depIds.isEmpty)).size.toFloat / nodeCount.toFloat + let nodeMap := irToAdjMap nodes + let maxD := + findRoots nodes |>.foldl (fun acc n => max acc (maxDepthFrom nodeMap n.contentId)) 0 + let extractableCount := (clusters.filter (·.extractable)).size + let covered := + clusters.foldl (fun acc c => + if c.extractable then acc + c.instances.size * c.size else acc) 0 + let selfSim := + if nodeCount = 0 then 0.0 + else min 1.0 (covered.toFloat / nodeCount.toFloat) + { nodeCount := nodeCount + maxDepth := maxD + leafRatio := leafRatio + clusterCount := clusters.size + extractable := extractableCount + selfSimilarity := selfSim } + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 9 graphId +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Stable graph fingerprint: FNV-1a-64 of (theoremName ++ IR fingerprint). + + IR fingerprint = concat of (contentId ++ shapeKey ++ status) for every + node in IR traversal order. Two ProofGraphs with the same graphId are + structurally identical modulo label renaming. -/ +def computeGraphId (theoremName : String) (ir : Array ProofNode) : ContentHash := + hashPPExpr <| + ir.foldl (fun acc n => acc ++ n.contentId ++ n.shapeKey ++ n.status) theoremName + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 10 JSON serialization (comonad/1 wire format) +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Float → JSON number via fixed-point scaling to 6 decimal places. + `Lean.JsonNumber` has `mantissa : Int` and `exponent : Nat`, where + the represented value is `mantissa × 10^(-exponent)`. + So 0.48 → { mantissa := 480000, exponent := 6 }. + All metrics are non-negative so the UInt64 route is safe. -/ +private def floatToJson (f : Float) : Lean.Json := + if f == 0.0 then .num ⟨0, 0⟩ + else .num { mantissa := Int.ofNat (f * 1000000.0).round.toUInt64.toNat + exponent := 6 } + +private def slotToJson (s : Slot) : Lean.Json := + .mkObj [ ("pos", .num ⟨s.pos, 0⟩) + , ("contentId", .str s.contentId) + , ("label", .str s.label) + , ("status", .str s.status) ] + +private def instanceToJson (ci : ClusterInstance) : Lean.Json := + .mkObj [ ("root", .str ci.root) + , ("slots", .arr (ci.slots.map slotToJson)) ] + +private def clusterToJson (c : SimilarityCluster) : Lean.Json := + .mkObj [ ("id", .num ⟨c.id, 0⟩) + , ("shapeKey", .str c.shapeKey) + , ("size", .num ⟨c.size, 0⟩) + , ("extractable", .bool c.extractable) + , ("instances", .arr (c.instances.map instanceToJson)) ] + +private def nodeToJson (n : ProofNode) : Lean.Json := + .mkObj [ ("contentId", .str n.contentId) + , ("shapeKey", .str n.shapeKey) + , ("label", .str n.label) + , ("status", .str n.status) + , ("depIds", .arr (n.depIds.map .str)) ] + +private def metricsToJson (m : MetricVector) : Lean.Json := + .mkObj [ ("nodeCount", .num ⟨m.nodeCount, 0⟩) + , ("maxDepth", .num ⟨m.maxDepth, 0⟩) + , ("leafRatio", floatToJson m.leafRatio) + , ("clusterCount", .num ⟨m.clusterCount, 0⟩) + , ("extractable", .num ⟨m.extractable, 0⟩) + , ("selfSim", floatToJson m.selfSimilarity) ] + +/-- Serialize to the comonad/1 wire format (compressed JSON). + + `provenanceId` and `session` are always null here; + the external language attaches them after the fact. + + Pantograph capture pattern: + request : {"file": "#findComonadsJSON Nat.add_comm", + "readHeader": false, "inheritEnv": true, "newConstants": false} + response : units[0].messages[0].data (severity = "information") -/ +def proofGraphToJson (g : ProofGraph) : Lean.Json := + .mkObj [ ("schema", .str g.schema) + , ("hashAlgo", .str g.hashAlgo) + , ("theorem", .str g.theoremName) + , ("graphId", .str g.graphId) + , ("provenanceId", .null) + , ("session", .null) + , ("metrics", metricsToJson g.metrics) + , ("ir", .arr (g.ir.map nodeToJson)) + , ("clusters", .arr (g.clusters.map clusterToJson)) ] + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 11 Core MetaM pipeline +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Full pipeline: constant name → ProofGraph. + + Steps: + 1. Resolve name → ConstantInfo (error if absent or opaque) + 2. Explode proof value via `Mathlib.Explode.explode value true` + 3. Build content-addressed IR (§ 6 — single MetaM pass) + 4. Detect comonadic clusters (§ 7 — pure) + 5. Compute metric vector (§ 8 — pure) + 6. Compute graphId (§ 9 — pure) + 7. Return ProofGraph envelope (§ 2) + + All MetaM work is confined to steps 1–3. Steps 4–7 are pure functions + that can be tested, cached, and called from non-MetaM contexts. -/ +def findComonadsCore (name : Name) : MetaM ProofGraph := do + let info ← getConstInfo name + let value ← match info.value? with + | some e => pure e + | none => throwError + "ComonadFinder: '{name}' has no proof value (axiom or opaque?)" + let entries ← Mathlib.Explode.explode value true + let ir ← buildIR entries + let nodeMap := irToAdjMap ir + let clusters := findClusters ir nodeMap + let metrics := computeMetrics ir clusters + let graphId := computeGraphId name.toString ir + return { schema := "comonad/1" + hashAlgo := "ppExpr/1" + theoremName := name.toString + graphId := graphId + metrics := metrics + ir := ir + clusters := clusters } + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 12 Elaborator commands +-- ════════════════════════════════════════════════════════════════════════════ + +/-- `#findComonads declName` — human-readable summary in InfoView. + + Example output: + nodes=25, clusters=2, extractable=2, selfSim=0.480000 + graphId=a3f2c1b4e5d60789 + cluster [0]: size=3, x2 occurrences, topology ((·)) + cluster [1]: size=2, x3 occurrences, topology (·) -/ +elab "#findComonads " name:ident : command => do + let g ← liftTermElabM (findComonadsCore name.getId) + let extr := (g.clusters.filter (·.extractable)).size + let header : MessageData := + m!"nodes={g.metrics.nodeCount}, clusters={g.metrics.clusterCount}, \ + extractable={extr}, selfSim={g.metrics.selfSimilarity}\n\ + graphId={g.graphId}" + let rows : List MessageData := + (g.clusters.mapIdx fun i c => + m!" cluster [{i}]: size={c.size}, \ + x{c.instances.size} occurrences, topology {c.shapeKey}").toList + logInfo (MessageData.joinSep (header :: rows) "\n") + +/-- `#findComonadsJSON declName` — machine-readable comonad/1 JSON for Pantograph. + + Emits one compressed JSON line as an `information`-severity message. + The external language (Mathematica / Python) reads: + units[0].messages[0].data -/ +elab "#findComonadsJSON " name:ident : command => do + let g ← liftTermElabM (findComonadsCore name.getId) + logInfo m!"{(proofGraphToJson g).compress}" + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 13 Tests (skipped on port — Lean 4 v4.30 changed `info.value?` +-- access for theorems; the original mm-lean smoke tests +-- ran against an older toolchain. Restore by adding a +-- `Test/ComonadFinder.lean` once the access pattern is +-- updated for the current toolchain.) +-- ════════════════════════════════════════════════════════════════════════════ + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 14 Navigation helpers (for comonad.* API) +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Get a single node by its contentId from the IR. + Returns `none` if the node doesn't exist in the graph. -/ +def getNodeInfo (graph : ProofGraph) (nodeId : ContentHash) : Option ProofNode := + graph.ir.find? (·.contentId == nodeId) + +/-- Get all nodes in the subgraph rooted at `rootId`. + Uses DFS to collect reachable nodes. Returns empty array if root not found. -/ +def getSubgraph (graph : ProofGraph) (rootId : ContentHash) : Array ProofNode := + let nodeMap := irToAdjMap graph.ir + let reachable := reachableFrom nodeMap rootId + graph.ir.filter (fun n => reachable.contains n.contentId) + +/-- Find all cluster IDs that contain the given node. + A node is "in" a cluster if it appears as a root of any instance, + or as a slot in any instance. -/ +def findClustersContaining (graph : ProofGraph) (nodeId : ContentHash) : Array Nat := + graph.clusters.filter (fun c => + c.instances.any (fun inst => + inst.root == nodeId || inst.slots.any (·.contentId == nodeId) + ) + ) |>.map (·.id) + +/-- Get edges for a subgraph (for subgraph/1 schema). -/ +def getSubgraphEdges (graph : ProofGraph) (rootId : ContentHash) : Array (ContentHash × ContentHash) := Id.run do + let nodeMap := irToAdjMap graph.ir + let reachable := reachableFrom nodeMap rootId + let mut edges : Array (ContentHash × ContentHash) := #[] + for cid in reachable.toList do + if let some node := nodeMap.get? cid then + for depId in node.depIds do + if reachable.contains depId then + edges := edges.push (cid, depId) + return edges diff --git a/Infoductor/Comonad/Convolution.lean b/Infoductor/Comonad/Convolution.lean new file mode 100644 index 0000000..7aa9f23 --- /dev/null +++ b/Infoductor/Comonad/Convolution.lean @@ -0,0 +1,348 @@ +/- + Convolution.lean — Cross-theorem pattern composition via comonadic convolution. + + Takes extractable clusters from different theorems, finds compatible slots + (same type, both varying), and generates a composed pattern that unifies + the shared structure. + + ══════════════════════════════════════════════════════════ + COMMANDS + ══════════════════════════════════════════════════════════ + + #patternCompose thm1 thm2 + #patternPreview thm1 c1 thm2 c2 + #patternExecute thm1 c1 thm2 c2 as "composedName" +-/ + +import Infoductor.Comonad.ComonadFinder +import Infoductor.Comonad.ExtractDefn + +open Lean Meta Elab Command ExtractDefn + +namespace Convolution + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 1 Core data structures +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Alignment between two slot positions from different clusters. -/ +structure SlotAlignment where + pos1 : Nat + pos2 : Nat + typeStr1 : String + typeStr2 : String + unifiable : Bool + deriving Repr, BEq, Hashable + +/-- A candidate for convolution: two extractable clusters that may compose. -/ +structure ConvolutionCandidate where + theorem1 : String + theorem2 : String + cluster1Id : Nat + cluster2Id : Nat + shape1 : String + shape2 : String + alignments : Array SlotAlignment + score : Float + deriving Repr + +/-- Result of composing two patterns. -/ +structure ComposedPattern where + name : String + source1 : String × Nat + source2 : String × Nat + alignments : Array SlotAlignment + combinedShape: String + paramCount : Nat + leanCommand : String + newGraphId : ContentHash + deriving Repr + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 2 Type compatibility checking +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Check if two type strings are structurally similar. -/ +private def typesCompatible (t1 t2 : String) : Bool := + -- Lean 4 v4.30 dropped `String.containsSubstr`; use a small inline + -- substring check instead. + let containsArrow (s : String) : Bool := (s.splitOn "→").length > 1 + if t1 == t2 then true + else if containsArrow t1 && containsArrow t2 then + let ret1 := t1.splitOn "→" |>.getLast? + let ret2 := t2.splitOn "→" |>.getLast? + ret1 == ret2 + else if t1 == "Prop" && t2 == "Prop" then true + else + let base1 := t1.splitOn " " |>.head? + let base2 := t2.splitOn " " |>.head? + base1.isSome && base1 == base2 + +/-- Compute a compatibility score based on alignments. -/ +private def computeScore (alignments : Array SlotAlignment) (totalSlots : Nat) : Float := + if totalSlots == 0 then 0.0 + else + let unifiableCount := (alignments.filter (·.unifiable)).size + unifiableCount.toFloat / totalSlots.toFloat + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 3 Find compatible slots between two clusters +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Build type map for varying slots from a proposal. -/ +private def getVarSlotTypes (proposal : ExtractionProposal) + (richNodes : Array RichNode) : Std.HashMap Nat String := + let nodeMap : Std.HashMap ContentHash String := + richNodes.foldl (fun m n => m.insert n.contentId n.typeStr) {} + proposal.varSlots.foldl (fun m slot => + let typeStr := slot.varIds[0]?.bind (fun id => nodeMap.get? id) |>.getD "_" + m.insert slot.pos typeStr + ) {} + +/-- Find alignable slots between two extraction proposals. -/ +def findCompatibleSlots + (prop1 : ExtractionProposal) (types1 : Std.HashMap Nat String) + (prop2 : ExtractionProposal) (types2 : Std.HashMap Nat String) + : Array SlotAlignment := Id.run do + let mut alignments : Array SlotAlignment := #[] + for vs1 in prop1.varSlots do + for vs2 in prop2.varSlots do + let t1 := types1.getD vs1.pos "_" + let t2 := types2.getD vs2.pos "_" + let unifiable := typesCompatible t1 t2 + alignments := alignments.push { + pos1 := vs1.pos + pos2 := vs2.pos + typeStr1 := t1 + typeStr2 := t2 + unifiable := unifiable + } + return alignments + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 4 Find convolution candidates across theorems +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Find all convoluble cluster pairs between two theorems. -/ +def findCandidates (thm1 thm2 : Name) : MetaM (Array ConvolutionCandidate) := do + let graph1 ← findComonadsCore thm1 + let graph2 ← findComonadsCore thm2 + + let ci1 ← getConstInfo thm1 + let ci2 ← getConstInfo thm2 + let val1 ← match ci1.value? with | some e => pure e | none => throwError "no value for {thm1}" + let val2 ← match ci2.value? with | some e => pure e | none => throwError "no value for {thm2}" + let entries1 ← Mathlib.Explode.explode val1 + let entries2 ← Mathlib.Explode.explode val2 + let richNodes1 ← buildRichIR entries1 + let richNodes2 ← buildRichIR entries2 + + let extractable1 := graph1.clusters.filter (·.extractable) + let extractable2 := graph2.clusters.filter (·.extractable) + + let mut candidates : Array ConvolutionCandidate := #[] + + for c1 in extractable1 do + for c2 in extractable2 do + let prop1 := analyzeCluster c1 + let prop2 := analyzeCluster c2 + let types1 := getVarSlotTypes prop1 richNodes1 + let types2 := getVarSlotTypes prop2 richNodes2 + let alignments := findCompatibleSlots prop1 types1 prop2 types2 + let unifiableAligns := alignments.filter (·.unifiable) + + if unifiableAligns.size > 0 then + let totalSlots := prop1.varSlots.size + prop2.varSlots.size + let score := computeScore alignments totalSlots + candidates := candidates.push { + theorem1 := thm1.toString + theorem2 := thm2.toString + cluster1Id := c1.id + cluster2Id := c2.id + shape1 := c1.shapeKey + shape2 := c2.shapeKey + alignments := alignments + score := score + } + + return candidates + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 5 Preview and execute convolution +-- ════════════════════════════════════════════════════════════════════════════ + +private def combineShapes (s1 s2 : String) : String := + s!"({s1}{s2})" + +/-- Preview what the composed pattern would look like. -/ +def previewConvolution + (thm1 : Name) (clusterId1 : Nat) + (thm2 : Name) (clusterId2 : Nat) : MetaM ComposedPattern := do + let graph1 ← findComonadsCore thm1 + let graph2 ← findComonadsCore thm2 + + let cluster1 ← match graph1.clusters.find? (·.id == clusterId1) with + | some c => pure c + | none => throwError s!"cluster {clusterId1} not found in {thm1}" + + let cluster2 ← match graph2.clusters.find? (·.id == clusterId2) with + | some c => pure c + | none => throwError s!"cluster {clusterId2} not found in {thm2}" + + let ci1 ← getConstInfo thm1 + let ci2 ← getConstInfo thm2 + let val1 ← match ci1.value? with | some e => pure e | none => throwError "no value" + let val2 ← match ci2.value? with | some e => pure e | none => throwError "no value" + let entries1 ← Mathlib.Explode.explode val1 + let entries2 ← Mathlib.Explode.explode val2 + let richNodes1 ← buildRichIR entries1 + let richNodes2 ← buildRichIR entries2 + + let prop1 := analyzeCluster cluster1 + let prop2 := analyzeCluster cluster2 + let types1 := getVarSlotTypes prop1 richNodes1 + let types2 := getVarSlotTypes prop2 richNodes2 + let alignments := findCompatibleSlots prop1 types1 prop2 types2 + + let unifiedAligns := alignments.filter (·.unifiable) + let paramCount := prop1.varSlots.size + prop2.varSlots.size - unifiedAligns.size + + let combinedShape := combineShapes cluster1.shapeKey cluster2.shapeKey + + let paramDecls := (Array.range paramCount).toList.map fun i => + s!"(p{i} : _)" + let paramStr := " ".intercalate paramDecls + let leanCommand := s!"private def composed_pattern {paramStr} : _ := sorry" + + let newGraphId := hashPPExpr (graph1.graphId ++ graph2.graphId ++ combinedShape) + + return { + name := "composed_pattern" + source1 := (thm1.toString, clusterId1) + source2 := (thm2.toString, clusterId2) + alignments := alignments + combinedShape := combinedShape + paramCount := paramCount + leanCommand := leanCommand + newGraphId := newGraphId + } + +/-- Execute convolution and generate the composed pattern. -/ +def executeConvolution + (thm1 : Name) (clusterId1 : Nat) + (thm2 : Name) (clusterId2 : Nat) + (name : String) : MetaM ComposedPattern := do + let preview ← previewConvolution thm1 clusterId1 thm2 clusterId2 + + let paramDecls := (Array.range preview.paramCount).toList.map fun i => + s!"(p{i} : _)" + let paramStr := " ".intercalate paramDecls + let leanCommand := s!"private def {name} {paramStr} : _ := sorry" + let newGraphId := hashPPExpr (preview.newGraphId ++ name) + + return { preview with + name := name + leanCommand := leanCommand + newGraphId := newGraphId + } + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 6 JSON serialization +-- ════════════════════════════════════════════════════════════════════════════ + +private def floatToJson (f : Float) : Lean.Json := + if f == 0.0 then .num ⟨0, 0⟩ + else .num { mantissa := Int.ofNat (f * 1000000.0).round.toUInt64.toNat + exponent := 6 } + +private def alignmentToJson (a : SlotAlignment) : Lean.Json := + .mkObj [ ("pos1", .num ⟨Int.ofNat a.pos1, 0⟩) + , ("pos2", .num ⟨Int.ofNat a.pos2, 0⟩) + , ("type1", .str a.typeStr1) + , ("type2", .str a.typeStr2) + , ("unifiable", .bool a.unifiable) ] + +private def candidateToJson (c : ConvolutionCandidate) : Lean.Json := + .mkObj [ ("theorem1", .str c.theorem1) + , ("theorem2", .str c.theorem2) + , ("cluster1", .num ⟨Int.ofNat c.cluster1Id, 0⟩) + , ("cluster2", .num ⟨Int.ofNat c.cluster2Id, 0⟩) + , ("shape1", .str c.shape1) + , ("shape2", .str c.shape2) + , ("alignments", .arr (c.alignments.map alignmentToJson)) + , ("score", floatToJson c.score) ] + +def candidatesToJson (candidates : Array ConvolutionCandidate) : Lean.Json := + .mkObj [ ("schema", .str "candidates/1") + , ("pairs", .arr (candidates.map candidateToJson)) ] + +def composedToJson (p : ComposedPattern) : Lean.Json := + .mkObj [ ("schema", .str "preview/1") + , ("name", .str p.name) + , ("source1", .mkObj [("theorem", .str p.source1.1), + ("clusterId", .num ⟨Int.ofNat p.source1.2, 0⟩)]) + , ("source2", .mkObj [("theorem", .str p.source2.1), + ("clusterId", .num ⟨Int.ofNat p.source2.2, 0⟩)]) + , ("alignments", .arr (p.alignments.map alignmentToJson)) + , ("combinedShape",.str p.combinedShape) + , ("paramCount", .num ⟨Int.ofNat p.paramCount, 0⟩) + , ("leanCommand", .str p.leanCommand) + , ("newGraphId", .str p.newGraphId) ] + +def composeResultToJson (p : ComposedPattern) : Lean.Json := + .mkObj [ ("schema", .str "compose/1") + , ("name", .str p.name) + , ("source1", .mkObj [("theorem", .str p.source1.1), + ("clusterId", .num ⟨Int.ofNat p.source1.2, 0⟩)]) + , ("source2", .mkObj [("theorem", .str p.source2.1), + ("clusterId", .num ⟨Int.ofNat p.source2.2, 0⟩)]) + , ("alignments", .arr (p.alignments.map alignmentToJson)) + , ("combinedShape",.str p.combinedShape) + , ("paramCount", .num ⟨Int.ofNat p.paramCount, 0⟩) + , ("leanCommand", .str p.leanCommand) + , ("newGraphId", .str p.newGraphId) ] + +end Convolution + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 7 Commands +-- ════════════════════════════════════════════════════════════════════════════ + +syntax (name := patternComposeCmd) "#patternCompose" ident ident : command + +@[command_elab patternComposeCmd] +def elabPatternCompose : CommandElab := fun stx => do + let thm1 := stx[1].getId + let thm2 := stx[2].getId + let candidates ← liftTermElabM (Convolution.findCandidates thm1 thm2) + if candidates.isEmpty then + logInfo m!"No convolution candidates found between {thm1} and {thm2}" + else + logInfo m!"{(Convolution.candidatesToJson candidates).compress}" + +syntax (name := patternPreviewCmd) "#patternPreview" ident num ident num : command + +@[command_elab patternPreviewCmd] +def elabPatternPreview : CommandElab := fun stx => do + let thm1 := stx[1].getId + let c1 := stx[2].toNat + let thm2 := stx[3].getId + let c2 := stx[4].toNat + let preview ← liftTermElabM (Convolution.previewConvolution thm1 c1 thm2 c2) + logInfo m!"{(Convolution.composedToJson preview).compress}" + +syntax (name := patternExecuteCmd) "#patternExecute" ident num ident num "as" str : command + +@[command_elab patternExecuteCmd] +def elabPatternExecute : CommandElab := fun stx => do + let thm1 := stx[1].getId + let c1 := stx[2].toNat + let thm2 := stx[3].getId + let c2 := stx[4].toNat + let name := stx[6].isStrLit?.getD "composed" + let result ← liftTermElabM (Convolution.executeConvolution thm1 c1 thm2 c2 name) + logInfo m!"{(Convolution.composeResultToJson result).compress}" + +-- (Test section dropped on port — see ComonadFinder § 13 note; +-- the `#patternCompose` elaborator may also need updating for +-- Lean 4 v4.30, which is part of the deferred test-restoration.) diff --git a/Infoductor/Comonad/ExtractConsts.lean b/Infoductor/Comonad/ExtractConsts.lean new file mode 100644 index 0000000..20431e1 --- /dev/null +++ b/Infoductor/Comonad/ExtractConsts.lean @@ -0,0 +1,387 @@ +/- + ExtractConsts.lean — Proof term analysis toolkit. + + Given a proof term (or declaration name), extracts the "interesting" lemmas + and constants it depends on. Useful for: + - Understanding what a proof actually uses + - Dependency analysis for proof refactoring + - Feature extraction for ML-based proof search + + Ported from the Lean 3 `extract_consts.lean`, with the Mathematica + dependency removed. This is pure Lean 4 metaprogramming. +-/ + +import Mathlib.Tactic + +open Lean Meta Elab Command Term + +namespace ExtractConsts + +/-! ## Classifying constant names -/ + +/-- Check if a name refers to a recursor or eliminator. -/ +def isRecursor (n : Name) : Bool := + match n with + | .str _ "rec" => true + | .str _ "recOn" => true + | .str _ "rec_on" => true + | .str _ "casesOn" => true + | .str _ "cases_on" => true + | .str _ "caseStrongInductionOn" => true + | .str _ "case_strong_induction_on" => true + | .str _ "brecOn" => true + | .str _ "below" => true + | .str _ "ndrec" => true + | .str _ "ndrecOn" => true + | _ => false + +/-- Constants that are structural plumbing, not mathematically interesting. -/ +private def boringConsts : NameSet := + [ + `id, `id_locked, `congr, `congrArg, `congrFun, + `propext, `funext, `Eq.mpr, `Eq.mp, + `eq_self_iff_true, `forall_const, + `nonempty_of_inhabited, `Classical.choice, + `rfl, `Eq.refl, `Eq.symm, `Eq.trans, + `True.intro, `False.elim, + `absurd, `not_false_eq_true + ].foldl (init := {}) fun s n => s.insert n + +/-- Check if a constant is structurally boring (eq/or/and/iff/not helpers, etc). -/ +def isBoring (n : Name) : Bool := + match n with + | .str (.str _ "Eq") _ => true + | .str (.str _ "Or") _ => true + | .str (.str _ "And") _ => true + | .str (.str _ "Decidable") _ => true + | .str (.str _ "Iff") _ => true + | .str (.str _ "Not") _ => true + | .str (.str _ "HEq") _ => true + -- Also match Lean 3-style lowercase (for mathlib compat) + | .str (.str _ "eq") _ => true + | .str (.str _ "or") _ => true + | .str (.str _ "and") _ => true + | .str (.str _ "decidable") _ => true + | .str (.str _ "iff") _ => true + | .str (.str _ "not") _ => true + | _ => boringConsts.contains n + +/-- A constant is "interesting" if it's not a recursor and not boring. -/ +def isInteresting (n : Name) : Bool := + !(isRecursor n || isBoring n) + +/-! ## Checking Prop-valued conclusions -/ + +/-- Check if `type` (a type expression) has a Prop-valued conclusion. + Strips all leading `∀`-binders and checks if the body lives in `Prop`. -/ +def isPropConclType (type : Expr) : MetaM Bool := + forallTelescopeReducing type fun _ body => do + let sort ← inferType body + return sort.isProp + +/-- Check if the constant `n` has a Prop-valued conclusion. -/ +def isPropConcl (n : Name) : MetaM Bool := do + try + let info ← getConstInfo n + isPropConclType info.type + catch _ => return false + +/-! ## Extracting constants from expressions -/ + +/-- Collect all constant names in `e` whose types have Prop-valued conclusions. + Uses `Expr.getUsedConstants` for efficient traversal. -/ +def listPropConsts (e : Expr) : MetaM NameSet := do + let allConsts := e.getUsedConstants + let mut result : NameSet := {} + for n in allConsts do + if ← isPropConcl n then + result := result.insert n + return result + +/-- Get the set of "interesting" Prop-valued constants used in `e`. + Filters out recursors, boring structural lemmas, etc. -/ +def getInterestingConsts (e : Expr) : MetaM NameSet := do + let ns ← listPropConsts e + let mut result : NameSet := {} + for n in ns.toList do + if isInteresting n then + result := result.insert n + return result + +/-- Get `(name, type)` pairs for all interesting constants in `e`. -/ +def getInterestingLemmasUsed (e : Expr) : MetaM (List (Name × Expr)) := do + let ns ← getInterestingConsts e + let mut result : List (Name × Expr) := [] + for n in ns.toList do + try + let info ← getConstInfo n + result := (n, info.type) :: result + catch _ => pure () + return result + +/-! ## Walking expressions with binder opening -/ + +/-- Fold over all subexpressions of `e`, opening binders by introducing + fresh local constants. This is the Lean 4 equivalent of the Lean 3 + `expr.mfold'` which instantiated bound variables. + + `f` is called on every subexpression with the current accumulator. -/ +partial def foldExprOpen {α : Type} (e : Expr) (init : α) + (f : Expr → α → MetaM α) : MetaM α := do + let go := foldExprOpen + match e with + | .bvar _ => f e init + | .sort _ => f e init + | .fvar _ => f e init + | .mvar id => + let a ← f e init + let type ← id.getType + go type a f + | .const .. => f e init + | .lit _ => f e init + | .app e1 e2 => + let a ← f e init + let a ← go e1 a f + go e2 a f + | .lam n t b bi => + let a ← f e init + let a ← go t a f + withLocalDecl n bi t fun x => + go (b.instantiate1 x) a f + | .forallE n t b bi => + let a ← f e init + let a ← go t a f + withLocalDecl n bi t fun x => + go (b.instantiate1 x) a f + | .letE n t v b _ => + let a ← f e init + let a ← go t a f + let a ← go v a f + withLetDecl n t v fun x => + go (b.instantiate1 x) a f + | .mdata _ b => f e init >>= fun a => go b a f + | .proj _ _ s => f e init >>= fun a => go s a f + +/-- Collect all subexpressions of `e` that are proofs (i.e., have type in Prop). -/ +def propSubterms (e : Expr) : MetaM (Array Expr) := + foldExprOpen e #[] fun sub acc => do + try + let type ← inferType sub + let sort ← inferType type + if sort.isProp then return acc.push sub + else return acc + catch _ => return acc + +/-! ## Collecting interesting applications -/ + +/-- Check if `e` is an application whose head is a constant in `ics`. -/ +def isAppOfInterestingConst (ics : NameSet) : Expr → Bool + | .const n _ => ics.contains n + | .app f _ => isAppOfInterestingConst ics f + | _ => false + +/-- Collect all applications in `e` whose head constant is in `ics`, + whnf-reduced. Useful for seeing how interesting lemmas are instantiated. -/ +partial def appsOfInterestingConsts (ics : NameSet) (e : Expr) : + MetaM (Array Expr) := do + let go := appsOfInterestingConsts ics + match e with + | .app f a => + if isAppOfInterestingConst ics f then + let reduced ← whnf e + let rest ← go a + return #[reduced] ++ rest + else + let r1 ← go f + let r2 ← go a + return r1 ++ r2 + | .lam n t b bi => + let r1 ← go t + withLocalDecl n bi t fun x => do + let r2 ← go (b.instantiate1 x) + return r1 ++ r2 + | .forallE n t b bi => + let r1 ← go t + withLocalDecl n bi t fun x => do + let r2 ← go (b.instantiate1 x) + return r1 ++ r2 + | .letE n t v b _ => + let r1 ← go t + let r2 ← go v + withLetDecl n t v fun x => do + let r3 ← go (b.instantiate1 x) + return r1 ++ r2 ++ r3 + | .mvar id => + go (← id.getType) + | _ => return #[] + +/-! ## Pretty printing -/ + +/-- Pretty-print a list of `(name, type)` pairs, one per line. -/ +def formatNameTypeList (entries : List (Name × Expr)) : MetaM String := do + let parts ← entries.mapM fun (n, t) => do + let tFmt ← ppExpr t + return s!"{n} : {tFmt}" + return "\n".intercalate parts + +/-- Get a pretty-printed summary of interesting lemmas used in `e`. -/ +def formatLemmasUsed (e : Expr) : MetaM String := do + getInterestingLemmasUsed e >>= formatNameTypeList + +/-! ## Analyzing declarations by name -/ + +/-- Get the proof term of a declaration (works for theorems and definitions). -/ +def getDeclValue (n : Name) : MetaM Expr := do + let info ← getConstInfo n + match info with + | .defnInfo v => return v.value + | .thmInfo v => return v.value + | _ => throwError "'{n}' is not a definition or theorem" + +/-- Analyze a declaration and return its interesting dependencies. -/ +def analyzeDeclConsts (n : Name) : MetaM NameSet := do + let v ← getDeclValue n + getInterestingConsts v + +/-- Analyze a declaration and return (name, type) pairs for its dependencies. -/ +def analyzeDeclLemmas (n : Name) : MetaM (List (Name × Expr)) := do + let v ← getDeclValue n + getInterestingLemmasUsed v + +/-- Pretty-print the interesting lemmas used by declaration `n`. -/ +def analyzeDeclFormatted (n : Name) : MetaM String := do + let v ← getDeclValue n + formatLemmasUsed v + +end ExtractConsts + +/-! ## Interactive commands -/ + +open ExtractConsts in +/-- `#extract_consts declName` prints the interesting lemmas used in a proof. -/ +elab "#extract_consts " n:ident : command => liftTermElabM do + let name := n.getId + -- Verify it exists + let _ ← getConstInfo name + let result ← analyzeDeclFormatted name + if result.isEmpty then + logInfo m!"No interesting constants found in '{name}'." + else + logInfo m!"Interesting lemmas used by '{name}':\n{result}" + +open ExtractConsts in +/-- `#list_deps declName` lists all Prop-valued constants used in a proof. -/ +elab "#list_deps " n:ident : command => liftTermElabM do + let name := n.getId + let v ← getDeclValue name + let v ← getDeclValue name + let ns ← listPropConsts v + let names := ns.toList.map toString + logInfo m!"Prop-valued constants in '{name}':\n{"\n".intercalate names}" + +/-! ## Tests -/ + +section Tests + +/-! ### Test theorems with varying levels of complexity -/ + +-- Trivial: no constants at all (just function application) +theorem test_trivial (p q : Prop) (h : p → q) (hp : p) : q := h hp + +-- Only boring constants (And.left / And.right are filtered) +theorem test_boring (p q : Prop) (h : p ∧ q) : q ∧ p := ⟨h.2, h.1⟩ + +-- Uses Nat.succ_pos — a real lemma +theorem test_succ_pos : 0 < Nat.succ 5 := Nat.succ_pos 5 + +-- Uses Nat.add_comm — a real lemma +theorem test_add_comm (a b : Nat) : a + b = b + a := Nat.add_comm a b + +-- Uses multiple interesting lemmas +theorem test_nat_arith (n : Nat) : n + 0 = 0 + n := by + rw [Nat.add_zero, Nat.zero_add] + +-- A proof that uses Or.elim (boring) but also classical reasoning +theorem test_em (p : Prop) : p ∨ ¬p := Classical.em p + +-- Uses set extensionality — should surface Set.ext or similar +theorem test_set_inter_comm {α : Type*} (s t : Set α) : s ∩ t = t ∩ s := by + ext x; constructor <;> intro h <;> exact ⟨h.2, h.1⟩ + +/-! ### Interactive command tests -/ + +-- Should report "no interesting constants" — proof is just `h hp` +#extract_consts test_trivial + +-- Should report "no interesting constants" — only uses And.left/right (boring) +#extract_consts test_boring + +-- Should find Nat.succ_pos +#extract_consts test_succ_pos + +-- Should find Nat.add_comm +#extract_consts test_add_comm + +-- Should find interesting lemmas from the rw +#extract_consts test_nat_arith + +-- Should find Classical.em +#extract_consts test_em + +-- Should find set-related lemmas +#extract_consts test_set_inter_comm + +/-! ### Dependency listing tests -/ + +-- Lists ALL prop-valued constants (including boring ones) +#list_deps test_succ_pos +#list_deps test_add_comm + +/-! ### Programmatic API tests -/ + +-- Test the classifier functions directly +#eval ExtractConsts.isRecursor `Nat.rec -- true +#eval ExtractConsts.isRecursor `Nat.casesOn -- true +#eval ExtractConsts.isRecursor `Nat.add_comm -- false + +#eval ExtractConsts.isBoring `And.intro -- true +#eval ExtractConsts.isBoring `Eq.symm -- true +#eval ExtractConsts.isBoring `Nat.add_comm -- false + +#eval ExtractConsts.isInteresting `Nat.add_comm -- true +#eval ExtractConsts.isInteresting `And.intro -- false +#eval ExtractConsts.isInteresting `Nat.rec -- false + +-- Test isPropConcl: Nat.add_comm has type ∀ n m, n + m = m + n (Prop conclusion) +#eval Lean.Meta.MetaM.run' do return ← ExtractConsts.isPropConcl `Nat.add_comm -- true + +-- Test isPropConcl: Nat.succ is Nat → Nat (not Prop) +#eval Lean.Meta.MetaM.run' do return ← ExtractConsts.isPropConcl `Nat.succ -- false + +-- Test getInterestingConsts programmatically +run_cmd liftTermElabM do + let v ← ExtractConsts.getDeclValue ``test_succ_pos + let ns ← ExtractConsts.getInterestingConsts v + let names := ns.toList.map toString + Lean.logInfo m!"Interesting consts in test_succ_pos: {names}" + +run_cmd liftTermElabM do + let v ← ExtractConsts.getDeclValue ``test_add_comm + let ns ← ExtractConsts.getInterestingConsts v + let names := ns.toList.map toString + Lean.logInfo m!"Interesting consts in test_add_comm: {names}" + +-- Test listPropConsts (unfiltered — includes boring ones) +run_cmd liftTermElabM do + let v ← ExtractConsts.getDeclValue ``test_boring + let ns ← ExtractConsts.listPropConsts v + let names := ns.toList.map toString + Lean.logInfo m!"All prop consts in test_boring: {names}" + +-- Test getInterestingLemmasUsed with types +run_cmd liftTermElabM do + let entries ← ExtractConsts.analyzeDeclLemmas ``test_nat_arith + for (n, _) in entries do + Lean.logInfo m!" {n}" + +end Tests diff --git a/Infoductor/Comonad/ExtractDefn.lean b/Infoductor/Comonad/ExtractDefn.lean new file mode 100644 index 0000000..8bedc52 --- /dev/null +++ b/Infoductor/Comonad/ExtractDefn.lean @@ -0,0 +1,426 @@ +/- + ExtractDefn.lean — definition extraction from comonadic clusters. + + Takes an extractable `SimilarityCluster` from ComonadFinder, analyses + constant vs varying slot positions, and generates a Lean definition that + captures the shared structure. The definition is added to the environment + via `Lean.addDecl` and verified by the kernel. + + ══════════════════════════════════════════════════════════ + WHAT "EXTRACTION" MEANS + ══════════════════════════════════════════════════════════ + + A cluster with shapeKey `(·)` means: the same topology (one node applied + to one dep) appears ≥2 times. Positions where the contentId is CONSTANT + across all instances are the shared "body". Positions where it VARIES are + free variables — they become parameters of the extracted definition. + + Example — cf_swap's `(·)` cluster: + pos 0 (root): And.left · And.right ← VARIES → param `f` + pos 1 (dep): h · h ← CONSTANT → body uses `h` + + Extracted definition (schematically): + def cf_swap_proj {β : Prop} (f : p ∧ q → β) : β := f h + + ══════════════════════════════════════════════════════════ + PIPELINE + ══════════════════════════════════════════════════════════ + + SimilarityCluster + ──analyzeCluster (pure)──▶ ExtractionProposal + (constant slots, varying slots) + ──enrichWithTypes (MetaM)──▶ TypedProposal + (Expr + String per varying param) + ──buildAbstractedExpr (MetaM)──▶ Expr + (lambda over varying params) + ──addExtractedDecl (MetaM)──▶ Name + (new definition in environment) + ──rebuildGraph (MetaM)──▶ ProofGraph + (updated graphId, new IR node) + + ══════════════════════════════════════════════════════════ + COMMANDS + ══════════════════════════════════════════════════════════ + + #extractCluster 1 as "myLemma" from Nat.add_comm + #analyzeCluster 0 from cf_swap + #extractClusterJSON 1 as "myLemma" from Nat.add_comm +-/ + +import Infoductor.Comonad.ComonadFinder + +open Lean Meta Elab Command Mathlib.Explode + +namespace ExtractDefn + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 1 Rich IR — ProofNode extended with type information +-- +-- We iterate over `Entries.s : ExprMap Entry` (i.e., `Std.HashMap Expr Entry`) +-- to collect both the proof expression and its metadata: +-- · typeStr : String — pretty-printed TYPE of the sub-term +-- · expr : Expr — the actual proof term (key in the ExprMap) +-- · termType : Expr — inferType of expr (for forall construction) +-- +-- These are NOT serialised in the JSON output — they are MetaM-only. +-- `buildRichIR` is called only from `extractDefn`; `buildIR` in +-- ComonadFinder.lean remains the canonical path for the JSON pipeline. +-- ════════════════════════════════════════════════════════════════════════════ + +/-- MetaM-only extension of ProofNode. Not serialised. -/ +structure RichNode where + contentId : ContentHash + label : String + status : String + depIds : Array ContentHash + typeStr : String -- ppExpr of the TYPE + expr : Expr -- the proof term + termType : Expr -- inferType of expr + deriving Repr + +/-- Build RichNodes from Explode entries. + + The `Entries` structure stores `Expr` as keys in `entries.s : ExprMap Entry` + (i.e., `Std.HashMap Expr Entry`). We iterate over this map to get both the + expression and its metadata. + + NOTE: The raw `Expr` values contain free variables that are only valid in + the original MetaM context where `explode` ran. We store them but don't + try to re-infer types here. Instead, we use the already-formatted + `entry.type : MessageData` which properly captured the context. +-/ +def buildRichIR (entries : Entries) : MetaM (Array RichNode) := do + -- Get all (Expr, Entry) pairs from the map + let pairs := entries.s.toArray + + -- Pass 1: lineNo → (contentId, label) + let mut lineToHash : Std.HashMap Nat ContentHash := {} + for (_, entry) in pairs do + let lbl := (← entry.thm.format).pretty + lineToHash := lineToHash.insert (entry.line.getD 0) (hashPPExpr lbl) + + -- Pass 2: build RichNodes + let mut seen : Std.HashSet ContentHash := {} + let mut nodes : Array RichNode := #[] + + for (expr, entry) in pairs do + let lbl := (← entry.thm.format).pretty + let cid := hashPPExpr lbl + if seen.contains cid then continue + + let depIds : Array ContentHash := + (entry.deps.filterMap (fun optLine => + optLine.bind (fun ln => lineToHash.get? ln))).toArray + + let status : String := match entry.status with + | .sintro | .intro | .cintro => "hyp" + | .lam => "lam" + | .reg => "app" + + -- `entry.type` (MessageData) carries the TYPE of this proof node. + -- We use its pretty-printed form directly; the raw Expr has context issues. + let typeStr := (← entry.type.format).pretty + + nodes := nodes.push + { contentId := cid + label := lbl + status := status + depIds := depIds + typeStr := typeStr + expr := expr + termType := default } -- placeholder; not used in current extraction + seen := seen.insert cid + + return nodes + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 2 Slot analysis — constant vs varying positions +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Analysis of one slot position across all instances of a cluster. -/ +structure SlotAnalysis where + pos : Nat + isConstant : Bool -- same contentId in every instance? + constId : Option ContentHash -- the shared id when constant + constLabel : Option String -- the shared label when constant + varLabels : Array String -- all labels seen at this position + varIds : Array ContentHash -- contentIds at this position (parallel to varLabels) + deriving Repr + +/-- Full pure analysis of an extractable cluster. -/ +structure ExtractionProposal where + clusterId : Nat + shapeKey : String + instanceCount : Nat + slotCount : Nat + constSlots : Array SlotAnalysis + varSlots : Array SlotAnalysis + deriving Repr + +/-- Build slot-by-slot analysis for a cluster (pure). -/ +def analyzeCluster (cluster : SimilarityCluster) : ExtractionProposal := + let slotCount := + cluster.instances[0]?.map (·.slots.size) |>.getD 0 + + let analyses : Array SlotAnalysis := + (Array.range slotCount).map fun pos => + let ids := cluster.instances.filterMap (·.slots[pos]?.map (·.contentId)) + let labels := cluster.instances.filterMap (·.slots[pos]?.map (·.label)) + let firstId := ids[0]? + let isConst := firstId.isSome && ids.all (· == firstId.get!) + { pos := pos + isConstant := isConst + constId := if isConst then firstId else none + constLabel := if isConst then labels[0]? else none + varLabels := labels + varIds := ids } + + let constSlots := analyses.filter (·.isConstant) + let varSlots := analyses.filter (!·.isConstant) + + { clusterId := cluster.id + shapeKey := cluster.shapeKey + instanceCount := cluster.instances.size + slotCount := slotCount + constSlots := constSlots + varSlots := varSlots } + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 3 Human-readable analysis report +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Format an ExtractionProposal as a readable analysis block. -/ +def formatProposal (p : ExtractionProposal) : String := + let header := + s!"cluster [{p.clusterId}] shape={p.shapeKey} " ++ + s!"{p.instanceCount} occurrences {p.slotCount} slots" + + -- NOTE: avoid escaped quotes and {{ }} inside s!-string interpolations — + -- the Lean 4 s!-string lexer misparses them. Use local let bindings instead. + let constLines := p.constSlots.toList.map fun s => + let lbl := s.constLabel.getD "?" + s!" pos {s.pos} CONSTANT → body uses {lbl}" + + let varLines := p.varSlots.toList.map fun s => + let allLabels := ", ".intercalate s.varLabels.toList + let setStr := "{ " ++ allLabels ++ " }" + s!" pos {s.pos} VARIES → param_p{s.pos} ∈ {setStr}" + + let paramList := p.varSlots.toList.map fun s => s!"(param_p{s.pos} : _)" + let paramStr := " ".intercalate paramList + let bodySlots := (Array.range p.slotCount).toList.map fun pos => + if p.constSlots.any (·.pos == pos) then + p.constSlots.find? (·.pos == pos) |>.bind (·.constLabel) |>.getD "?" + else + s!"param_p{pos}" + let bodyStr := match bodySlots with + | [] => "?" + | [x] => x + | fn :: args => fn ++ " " ++ " ".intercalate args + + let schematic := + s!" schematic: def name {paramStr} : _ := {bodyStr}" + + "\n".intercalate ([header] ++ constLines ++ varLines ++ [schematic]) + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 4 Lean command generation (label-based, no type info needed) +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Generate a candidate Lean command string from a proposal. + + Uses the LABELS from constant slots directly as Lean identifiers. + When `typeMap` is provided (from `buildTypeMap`), parameter types are + filled in from the Explode entries; otherwise `_` holes are used. -/ +def generateDefCommand + (proposal : ExtractionProposal) + (name : String) + (typeMap : Std.HashMap ContentHash String := {}) : String := + let escape (s : String) : String := + let needsEscape := s.any fun c => + c == '∀' || c == '∃' || c == '✝' || c == '·' || c == ' ' + || c == '(' || c == ')' || (c == '.' && s.contains ' ') + if needsEscape then "`" ++ s ++ "`" else s + + -- For each varying-slot parameter, look up the typeStr of the first instance + -- at that position (varIds[0]) in typeMap. Falls back to `_` when the map + -- is empty (e.g. before buildTypeMap's TODO is filled in). + let paramDecls := proposal.varSlots.toList.map fun s => + let typeAnn := s.varIds[0]?.bind (fun id => typeMap.get? id) |>.getD "_" + s!"(param_p{s.pos} : {typeAnn})" + let paramStr := " ".intercalate paramDecls + + let slotExprs := (Array.range proposal.slotCount).toList.map fun pos => + if let some sa := proposal.constSlots.find? (·.pos == pos) then + escape (sa.constLabel.getD "_") + else + s!"param_p{pos}" + + let bodyStr := match slotExprs with + | [] => "_" + | [x] => x + | fn :: args => fn ++ " " ++ " ".intercalate args + + s!"private def {name} {paramStr} : _ := {bodyStr}" + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 5 MetaM extraction — builds the abstracted Lean term +-- ════════════════════════════════════════════════════════════════════════════ + +/-- Build a ContentHash → typeStr map from the Explode entries for `theoremName`. + + Each entry's `entry.type` (MessageData) gives the type of that proof node. + This map is passed to `generateDefCommand` so it can replace `_` holes + with real type strings, producing commands that elaborate without errors. +-/ +private def buildTypeMap (theoremName : Name) : MetaM (Std.HashMap ContentHash String) := do + let some ci := (← getEnv).find? theoremName + | return {} + let some val := ci.value? + | return {} + -- Call Mathlib's explode to get the Entries structure + let entries ← Mathlib.Explode.explode val + let richNodes ← buildRichIR entries + return richNodes.foldl (fun m n => m.insert n.contentId n.typeStr) {} + + +structure ExtractionResult where + defName : String + proposal : ExtractionProposal + leanCommand : String -- the generated def command + accepted : Bool -- did Lean elaborate it without errors? + newGraphId : ContentHash -- FNV hash of (oldGraphId ++ defName) + deriving Repr + +/-- Full extraction pipeline (MetaM). -/ +def extractDefn + (theoremName : Name) + (clusterId : Nat) + (defName : String) : MetaM ExtractionResult := do + let graph ← findComonadsCore theoremName + + let cluster ← match graph.clusters.find? (·.id == clusterId) with + | some c => pure c + | none => throwError + s!"ExtractDefn: cluster {clusterId} not found in proof of {theoremName}.\n\ + Available cluster ids: {graph.clusters.map (·.id)}" + + if !cluster.extractable then + throwError + s!"ExtractDefn: cluster {clusterId} is not extractable.\n\ + Reason: shapeKey='{cluster.shapeKey}' or fewer than 2 closed instances." + + let proposal := analyzeCluster cluster + let typeMap ← buildTypeMap theoremName + let cmd := generateDefCommand proposal defName typeMap + let newGraphId := hashPPExpr (graph.graphId ++ defName ++ cmd) + + return { defName := defName + proposal := proposal + leanCommand := cmd + accepted := false -- updated by #extractCluster after elabCommand + newGraphId := newGraphId } + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 6 JSON serialisation for Pantograph bridge +-- ════════════════════════════════════════════════════════════════════════════ + +private def slotAnalysisToJson (s : SlotAnalysis) : Lean.Json := + .mkObj [ ("pos", .num ⟨s.pos, 0⟩) + , ("isConstant", .bool s.isConstant) + , ("constId", s.constId.map .str |>.getD .null) + , ("constLabel", s.constLabel.map .str |>.getD .null) + , ("varLabels", .arr (s.varLabels.map .str)) + , ("varIds", .arr (s.varIds.map .str)) ] + +private def proposalToJson (p : ExtractionProposal) : Lean.Json := + .mkObj [ ("clusterId", .num ⟨p.clusterId, 0⟩) + , ("shapeKey", .str p.shapeKey) + , ("instanceCount", .num ⟨p.instanceCount, 0⟩) + , ("slotCount", .num ⟨p.slotCount, 0⟩) + , ("constSlots", .arr (p.constSlots.map slotAnalysisToJson)) + , ("varSlots", .arr (p.varSlots.map slotAnalysisToJson)) ] + +def resultToJson (r : ExtractionResult) : Lean.Json := + .mkObj [ ("schema", .str "extract/1") + , ("defName", .str r.defName) + , ("accepted", .bool r.accepted) + , ("leanCommand", .str r.leanCommand) + , ("newGraphId", .str r.newGraphId) + , ("proposal", proposalToJson r.proposal) ] + +end ExtractDefn + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 7 Commands +-- ════════════════════════════════════════════════════════════════════════════ + +open ExtractDefn in +/-- `#analyzeCluster N from theoremName` + + Pure analysis — no Lean environment modification. + Prints the constant/varying slot breakdown and the schematic definition. -/ +elab "#analyzeCluster " cid:num " from " thm:ident : command => do + let n := thm.getId + let clusterId := cid.getNat + let graph ← liftTermElabM (findComonadsCore n) + match graph.clusters.find? (·.id == clusterId) with + | none => logError m!"cluster {clusterId} not found" + | some c => + let proposal := analyzeCluster c + logInfo m!"{formatProposal proposal}" + +open ExtractDefn in +/-- `#extractCluster N as "defName" from theoremName` + + Generates and elaborates a Lean definition capturing cluster N's shared + structure. Prints a warning (not a false ✓) if elaboration produces + errors — this happens when `_` type holes are too ambiguous. -/ +elab "#extractCluster " cid:num " as " name:str " from " thm:ident : command => do + let n := thm.getId + let clusterId := cid.getNat + let defName := name.getString + let result ← liftTermElabM (extractDefn n clusterId defName) + logInfo m!"Generated command:\n {result.leanCommand}\n\ + new graphId: {result.newGraphId}" + -- Parser.runParserCategory is a pure function returning Except String Syntax. + let cmdStx := Parser.runParserCategory + (← getEnv) `command result.leanCommand "" + match cmdStx with + | Except.error err => + logError m!"Parse error in generated command:\n {err}\n\ + Run #analyzeCluster {clusterId} from {thm.getId} for guidance." + | Except.ok stx => + -- Snapshot the message log before elaboration so we can detect new errors. + -- elabCommand returns Unit regardless of internal errors, so we must check + -- the log ourselves — otherwise the ✓ message fires even on failure. + let stateBefore ← get + elabCommand stx + let stateAfter ← get + let errorsBefore := stateBefore.messages.toArray.filter (·.severity == .error) + let errorsAfter := stateAfter.messages.toArray.filter (·.severity == .error) + if errorsAfter.size == errorsBefore.size then + logInfo m!"✓ definition '{defName}' added to environment." + else + -- Roll back to pre-elaboration state: removes raw error messages from + -- the panel AND undoes any partial env changes from the failed def. + set stateBefore + logWarning m!"definition '{defName}' elaborated with errors — \ + add explicit type annotations to the generated command.\n\ + Run #analyzeCluster {clusterId} from {thm.getId} for guidance." + +open ExtractDefn in +/-- `#extractClusterJSON N as "defName" from theoremName` + + Machine-readable version for Pantograph / Mathematica. + Emits the extract/1 JSON envelope without elaborating the definition. -/ +elab "#extractClusterJSON " cid:num " as " name:str " from " thm:ident : command => do + let n := thm.getId + let clusterId := cid.getNat + let defName := name.getString + let result ← liftTermElabM (extractDefn n clusterId defName) + logInfo m!"{(resultToJson result).compress}" + +-- ════════════════════════════════════════════════════════════════════════════ +-- § 8 Tests (skipped on port — see note in ComonadFinder.lean § 13) +-- ════════════════════════════════════════════════════════════════════════════ diff --git a/Infoductor/Comonad/GridView.lean b/Infoductor/Comonad/GridView.lean new file mode 100644 index 0000000..e83b884 --- /dev/null +++ b/Infoductor/Comonad/GridView.lean @@ -0,0 +1,186 @@ +/- + Infoductor.Comonad.GridView — Plain-text proof visualization toolkit. + + Uses Mathlib's `Explode` module to decompose proof terms into step- + by-step tables, then formats them for display in the terminal or + Lean InfoView. + + Ported originally from the Lean 3 `grid_view.lean` (via mm-lean's + Lean 4 port). The Lean 4 Mathlib `Entry` type does NOT store the + raw `Expr` — it stores `thm : MessageData` as the formatted + representation. So we work with that directly. + + NOTE on Entries ordering: In Lean 4 Mathlib's Explode, `Entries.l` + is stored in reverse order — the root entry (the whole proof) is + at index 0, and the leaf entries (assumptions) are at the end. + Always start tree traversal at index 0, not `es.l.length - 1`. + + This module provides the *general-purpose* plain-text formatters. + Mathematica-specific Grid/OpenerView output formatters live in + `mm-lean`'s GridView (the original Mathematica-bridge project) + rather than here, per the "Infoductor is general-purpose" rule. +-/ + +import Mathlib.Tactic +import Mathlib.Tactic.Explode + +open Lean Meta Elab Command Term +open Mathlib.Explode + +namespace Infoductor.Comonad.GridView + +/-! ## Utilities -/ + +/-- Pad a string to at least `n` characters with trailing spaces. -/ +def padRight (s : String) (n : Nat) : String := + if s.length >= n then s + else s ++ String.ofList (List.replicate (n - s.length) ' ') + +/-- Convert a `MessageData` to a plain string. -/ +def mdToString (md : MessageData) : MetaM String := do + let fmt ← md.format + return fmt.pretty + +/-- Get the value (proof term) of a declaration. -/ +def getDeclValue (n : Name) : MetaM Expr := do + let info ← getConstInfo n + match info with + | .defnInfo v => return v.value + | .thmInfo v => return v.value + | _ => throwError "'{n}' is not a definition or theorem" + +/-! ## Plain text formatting -/ + +/-- Format an `Entries` object as a plain-text table. -/ +def formatPlainTable (es : Entries) : MetaM String := do + let mut lines : Array String := #[] + -- Header + let hdr := padRight "Line" 6 ++ " | " ++ padRight "Deps" 12 ++ " | " + ++ padRight "Rule" 20 ++ " | Status" + lines := lines.push hdr + lines := lines.push (String.ofList (List.replicate 60 '-')) + -- Entries + for e in es.l do + let depsStr := ", ".intercalate (e.deps.map toString) + let ruleStr ← mdToString e.thm + let statusStr := match e.status with + | .sintro => "sintro" + | .intro => "intro" + | .cintro => "cintro" + | .lam => "lam" + | .reg => "reg" + let row := padRight (toString e.line) 6 ++ " | " ++ padRight depsStr 12 + ++ " | " ++ padRight ruleStr 20 ++ " | " ++ statusStr + lines := lines.push row + return "\n".intercalate lines.toList + +/-- Format a recursive proof tree as indented plain text. -/ +partial def formatProofTree (es : Entries) (entryIdx : Nat) (indent : Nat := 0) : + MetaM String := do + let pfx := String.ofList (List.replicate (indent * 2) ' ') + match es.l[entryIdx]? with + | none => return pfx ++ "(unknown entry " ++ toString entryIdx ++ ")" + | some e => + let ruleStr ← mdToString e.thm + let header := pfx ++ "[" ++ toString e.line ++ "] " ++ ruleStr + if e.deps.isEmpty then + return header + else + let mut result := header + for dep in e.deps do + match es.l.findIdx? (fun e' => e'.line == dep) with + | some idx => + let sub ← formatProofTree es idx (indent + 1) + result := result ++ "\n" ++ sub + | none => + result := result ++ "\n" ++ pfx ++ " (ref " ++ toString dep ++ ")" + return result + +/-! ## Declaration info (plain text) -/ + +/-- Get info about a declaration formatted as plain text. -/ +def viewInfoPlain (n : Name) : MetaM String := do + let info ← getConstInfo n + let typeFmt ← ppExpr info.type + let docStr ← try + let doc ← Lean.findDocString? (← getEnv) n + pure (doc.getD "No documentation found") + catch _ => pure "No documentation found" + let category := match info with + | .defnInfo _ => "Definition" + | .thmInfo _ => "Theorem" + | .axiomInfo _ => "Axiom" + | .opaqueInfo _ => "Opaque" + | _ => "Other" + return category ++ ": " ++ toString n ++ "\nType: " ++ typeFmt.pretty + ++ "\nDoc: " ++ docStr + +/-! ## High-level API -/ + +/-- Explode a declaration and return the entries. -/ +def explodeDecl (n : Name) (hideNonProp : Bool := true) : + MetaM Entries := do + let v ← getDeclValue n + let _ ← inferType v + explode v hideNonProp + +/-- Explode a declaration and format as plain text table. -/ +def explodeDeclPlain (n : Name) (hideNonProp : Bool := true) : + MetaM String := do + let es ← explodeDecl n hideNonProp + formatPlainTable es + +/-- Explode a declaration and format as a plain-text proof tree. + Entries.l is in reverse order: root is at index 0. -/ +def explodeDeclTree (n : Name) (hideNonProp : Bool := true) : + MetaM String := do + let es ← explodeDecl n hideNonProp + if es.l.isEmpty then return "(empty proof)" + formatProofTree es 0 + +end Infoductor.Comonad.GridView + +/-! ## Interactive commands -/ + +open Infoductor.Comonad.GridView in +/-- `#proof_table declName` — show a proof as a step-by-step table. -/ +elab "#proof_table " n:ident : command => liftTermElabM do + let name := n.getId + let result ← explodeDeclPlain name + logInfo m!"{result}" + +open Infoductor.Comonad.GridView in +/-- `#proof_tree declName` — show a proof as a nested tree. -/ +elab "#proof_tree " n:ident : command => liftTermElabM do + let name := n.getId + let result ← explodeDeclTree name + logInfo m!"{result}" + +open Infoductor.Comonad.GridView in +/-- `#view_info declName` — show type and documentation for a declaration. -/ +elab "#view_info " n:ident : command => liftTermElabM do + let name := n.getId + let result ← viewInfoPlain name + logInfo m!"{result}" + +/-! ## Tests -/ + +section Tests + +theorem gv_test_id (p : Prop) (h : p) : p := h +theorem gv_test_mp (p q : Prop) (h : p → q) (hp : p) : q := h hp +theorem gv_test_and_swap (p q : Prop) (h : p ∧ q) : q ∧ p := + ⟨h.2, h.1⟩ + +/-- A documented theorem for testing view_info. -/ +theorem gv_test_documented : 1 + 1 = 2 := rfl + +#proof_table gv_test_id +#proof_table gv_test_mp +#proof_table gv_test_and_swap +#proof_tree gv_test_mp +#proof_tree gv_test_and_swap +#view_info gv_test_documented +#view_info Nat.add_comm + +end Tests diff --git a/lake-manifest.json b/lake-manifest.json index 0602039..d6aba09 100644 --- a/lake-manifest.json +++ b/lake-manifest.json @@ -1,6 +1,96 @@ {"version": "1.2.0", "packagesDir": ".lake/packages", - "packages": [], + "packages": + [{"url": "https://github.com/leanprover-community/mathlib4", + "type": "git", + "subDir": null, + "scope": "", + "rev": "370a1edb4de30838e7b2b8e2f95b0a41dafe7e26", + "name": "mathlib", + "manifestFile": "lake-manifest.json", + "inputRev": "master", + "inherited": false, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/leanprover-community/plausible", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "293af9b2a383eed4d04d66b898d608d0a44b750f", + "name": "plausible", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/LeanSearchClient", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "c5d5b8fe6e5158def25cd28eb94e4141ad97c843", + "name": "LeanSearchClient", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/import-graph", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "cdab3938ccabbdb044be6896e251b5814bec932e", + "name": "importGraph", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/ProofWidgets4", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "2db6054a44326f8c0230ee0570e2ddb894816511", + "name": "proofwidgets", + "manifestFile": "lake-manifest.json", + "inputRev": "v0.0.98", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/leanprover-community/aesop", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "f0c6e183ea26531e82773feb4b73ab6595ca17a5", + "name": "aesop", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.30.0-rc2", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/quote4", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "1cc7e819b9b9bc1e87c9edcccb62e0269e00a809", + "name": "Qq", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.30.0-rc2", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/batteries", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "5c57f3857ba81924a88b2cdf4f062e34ec04ff11", + "name": "batteries", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.30.0-rc2", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/lean4-cli", + "type": "git", + "subDir": null, + "scope": "leanprover", + "rev": "13567aed1ac4f12aea9484178e07e51f8c9f7658", + "name": "Cli", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.30.0-rc2", + "inherited": true, + "configFile": "lakefile.toml"}], "name": "infoductor", "lakeDir": ".lake", "fixedToolchain": false} diff --git a/lakefile.toml b/lakefile.toml index 0c04120..1bd65ee 100644 --- a/lakefile.toml +++ b/lakefile.toml @@ -15,15 +15,36 @@ defaultTargets = ["Infoductor"] # Pairs naturally with the Pantograph project (the conductor of an # electric train sits atop the pantograph hardware). +# ── Dependencies ──────────────────────────────────────────────────────────── +# Mathlib is required by `Infoductor.Comonad` for its `Tactic.Explode` +# proof-decomposition primitive. `Infoductor.Foundation` does NOT +# import Mathlib, so consumers depending only on Foundation pay no +# Mathlib build cost (Lake compiles only the imported subgraph). +# Pinned to the commit mm-lean was tracking at port time (2026-05-01). +[[require]] +name = "mathlib" +git = "https://github.com/leanprover-community/mathlib4" +rev = "master" + +# ── Sub-libraries ─────────────────────────────────────────────────────────── +# `Infoductor` is the umbrella name; sub-libraries below are cherry- +# pickable. Downstream `import Infoductor.Foundation.Meta` (etc.) +# only pulls that sub-library's subgraph. + [[lean_lib]] name = "Infoductor" -# Default lib root. Subdirectories below carve out cherry-pickable -# sub-libs; downstream `import Infoductor.Foundation.Meta` only -# pulls Foundation's subgraph. +# Foundation lives directly under `Infoductor.Foundation.*`; this +# default lib root resolves the umbrella module + Foundation's +# sub-modules. No Mathlib dependency through this lib. -# Subdirectory `lean_lib`s — declared as we land each in turn. -# Foundation: pure algebra (Meta types, Edit/Context, restructure, -# registries). Lands first. +# Comonad: comonadic proof-pattern detection. Pulls Mathlib (for +# Tactic.Explode). Optional — only built when imported by +# downstream code. +[[lean_lib]] +name = "Infoductor.Comonad" +roots = ["Infoductor.Comonad"] + +# Future sub-libs (declared as each lands): # Tactics: reference dispatchers built on Foundation. # Pantograph: plugin / live integration (when ready). # Runner: headless surface (when concrete need arises).