infoductor/Infoductor/Comonad/ExtractDefn.lean

/-
  ExtractDefn.lean — definition extraction from comonadic clusters.

  Takes an extractable `SimilarityCluster` from ComonadFinder, analyses
  constant vs varying slot positions, and generates a Lean definition that
  captures the shared structure.  The definition is added to the environment
  via `Lean.addDecl` and verified by the kernel.

  ══════════════════════════════════════════════════════════
  WHAT "EXTRACTION" MEANS
  ══════════════════════════════════════════════════════════

  A cluster with shapeKey `(·)` means: the same topology (one node applied
  to one dep) appears ≥2 times.  Positions where the contentId is CONSTANT
  across all instances are the shared "body".  Positions where it VARIES are
  free variables — they become parameters of the extracted definition.

  Example — cf_swap's `(·)` cluster:
    pos 0 (root): And.left  ·  And.right    ← VARIES  → param `f`
    pos 1 (dep):  h          ·  h            ← CONSTANT → body uses `h`

  Extracted definition (schematically):
    def cf_swap_proj {β : Prop} (f : p ∧ q → β) : β := f h

  ══════════════════════════════════════════════════════════
  PIPELINE
  ══════════════════════════════════════════════════════════

    SimilarityCluster
      ──analyzeCluster (pure)──▶  ExtractionProposal
                                  (constant slots, varying slots)
      ──enrichWithTypes (MetaM)──▶ TypedProposal
                                  (Expr + String per varying param)
      ──buildAbstractedExpr (MetaM)──▶ Expr
                                  (lambda over varying params)
      ──addExtractedDecl (MetaM)──▶ Name
                                  (new definition in environment)
      ──rebuildGraph (MetaM)──▶   ProofGraph
                                  (updated graphId, new IR node)

  ══════════════════════════════════════════════════════════
  COMMANDS
  ══════════════════════════════════════════════════════════

    #extractCluster 1 as "myLemma" from Nat.add_comm
    #analyzeCluster 0 from cf_swap
    #extractClusterJSON 1 as "myLemma" from Nat.add_comm
-/

import Infoductor.Comonad.ComonadFinder

open Lean Meta Elab Command Mathlib.Explode

namespace ExtractDefn

-- ════════════════════════════════════════════════════════════════════════════
-- § 1  Rich IR — ProofNode extended with type information
--
--   We iterate over `Entries.s : ExprMap Entry` (i.e., `Std.HashMap Expr Entry`)
--   to collect both the proof expression and its metadata:
--     · typeStr  : String   — pretty-printed TYPE of the sub-term
--     · expr     : Expr     — the actual proof term (key in the ExprMap)
--     · termType : Expr     — inferType of expr (for forall construction)
--
--   These are NOT serialised in the JSON output — they are MetaM-only.
--   `buildRichIR` is called only from `extractDefn`; `buildIR` in
--   ComonadFinder.lean remains the canonical path for the JSON pipeline.
-- ════════════════════════════════════════════════════════════════════════════

/-- MetaM-only extension of ProofNode.  Not serialised. -/
structure RichNode where
  contentId : ContentHash
  label     : String
  status    : String
  depIds    : Array ContentHash
  typeStr   : String             -- ppExpr of the TYPE
  expr      : Expr               -- the proof term
  termType  : Expr               -- inferType of expr
  deriving Repr

/-- Build RichNodes from Explode entries.

    The `Entries` structure stores `Expr` as keys in `entries.s : ExprMap Entry`
    (i.e., `Std.HashMap Expr Entry`).  We iterate over this map to get both the
    expression and its metadata.

    NOTE: The raw `Expr` values contain free variables that are only valid in
    the original MetaM context where `explode` ran. We store them but don't
    try to re-infer types here. Instead, we use the already-formatted
    `entry.type : MessageData` which properly captured the context.
-/
def buildRichIR (entries : Entries) : MetaM (Array RichNode) := do
  -- Get all (Expr, Entry) pairs from the map
  let pairs := entries.s.toArray

  -- Pass 1: lineNo → (contentId, label)
  let mut lineToHash : Std.HashMap Nat ContentHash := {}
  for (_, entry) in pairs do
    let lbl := (← entry.thm.format).pretty
    lineToHash := lineToHash.insert (entry.line.getD 0) (hashPPExpr lbl)

  -- Pass 2: build RichNodes
  let mut seen  : Std.HashSet ContentHash := {}
  let mut nodes : Array RichNode          := #[]

  for (expr, entry) in pairs do
    let lbl := (← entry.thm.format).pretty
    let cid := hashPPExpr lbl
    if seen.contains cid then continue

    let depIds : Array ContentHash :=
      (entry.deps.filterMap (fun optLine =>
        optLine.bind (fun ln => lineToHash.get? ln))).toArray

    let status : String := match entry.status with
      | .sintro | .intro | .cintro => "hyp"
      | .lam                       => "lam"
      | .reg                       => "app"

    -- `entry.type` (MessageData) carries the TYPE of this proof node.
    -- We use its pretty-printed form directly; the raw Expr has context issues.
    let typeStr  := (← entry.type.format).pretty

    nodes := nodes.push
      { contentId := cid
        label     := lbl
        status    := status
        depIds    := depIds
        typeStr   := typeStr
        expr      := expr
        termType  := default }  -- placeholder; not used in current extraction
    seen := seen.insert cid

  return nodes

-- ════════════════════════════════════════════════════════════════════════════
-- § 2  Slot analysis — constant vs varying positions
-- ════════════════════════════════════════════════════════════════════════════

/-- Analysis of one slot position across all instances of a cluster. -/
structure SlotAnalysis where
  pos         : Nat
  isConstant  : Bool               -- same contentId in every instance?
  constId     : Option ContentHash -- the shared id when constant
  constLabel  : Option String      -- the shared label when constant
  varLabels   : Array String       -- all labels seen at this position
  varIds      : Array ContentHash  -- contentIds at this position (parallel to varLabels)
  deriving Repr

/-- Full pure analysis of an extractable cluster. -/
structure ExtractionProposal where
  clusterId     : Nat
  shapeKey      : String
  instanceCount : Nat
  slotCount     : Nat
  constSlots    : Array SlotAnalysis
  varSlots      : Array SlotAnalysis
  deriving Repr

/-- Build slot-by-slot analysis for a cluster (pure). -/
def analyzeCluster (cluster : SimilarityCluster) : ExtractionProposal :=
  let slotCount :=
    cluster.instances[0]?.map (·.slots.size) |>.getD 0

  let analyses : Array SlotAnalysis :=
    (Array.range slotCount).map fun pos =>
      let ids    := cluster.instances.filterMap (·.slots[pos]?.map (·.contentId))
      let labels := cluster.instances.filterMap (·.slots[pos]?.map (·.label))
      let firstId := ids[0]?
      let isConst := firstId.isSome && ids.all (· == firstId.get!)
      { pos        := pos
        isConstant := isConst
        constId    := if isConst then firstId else none
        constLabel := if isConst then labels[0]? else none
        varLabels  := labels
        varIds     := ids }

  let constSlots := analyses.filter (·.isConstant)
  let varSlots   := analyses.filter (!·.isConstant)

  { clusterId     := cluster.id
    shapeKey      := cluster.shapeKey
    instanceCount := cluster.instances.size
    slotCount     := slotCount
    constSlots    := constSlots
    varSlots      := varSlots }

-- ════════════════════════════════════════════════════════════════════════════
-- § 3  Human-readable analysis report
-- ════════════════════════════════════════════════════════════════════════════

/-- Format an ExtractionProposal as a readable analysis block. -/
def formatProposal (p : ExtractionProposal) : String :=
  let header :=
    s!"cluster [{p.clusterId}]  shape={p.shapeKey}  " ++
    s!"{p.instanceCount} occurrences  {p.slotCount} slots"

  -- NOTE: avoid escaped quotes and {{ }} inside s!-string interpolations —
  -- the Lean 4 s!-string lexer misparses them.  Use local let bindings instead.
  let constLines := p.constSlots.toList.map fun s =>
    let lbl := s.constLabel.getD "?"
    s!"  pos {s.pos} CONSTANT  → body uses   {lbl}"

  let varLines := p.varSlots.toList.map fun s =>
    let allLabels := ", ".intercalate s.varLabels.toList
    let setStr    := "{ " ++ allLabels ++ " }"
    s!"  pos {s.pos} VARIES    → param_p{s.pos} ∈ {setStr}"

  let paramList := p.varSlots.toList.map fun s => s!"(param_p{s.pos} : _)"
  let paramStr  := " ".intercalate paramList
  let bodySlots := (Array.range p.slotCount).toList.map fun pos =>
    if p.constSlots.any (·.pos == pos) then
      p.constSlots.find? (·.pos == pos) |>.bind (·.constLabel) |>.getD "?"
    else
      s!"param_p{pos}"
  let bodyStr := match bodySlots with
    | []         => "?"
    | [x]        => x
    | fn :: args => fn ++ " " ++ " ".intercalate args

  let schematic :=
    s!"  schematic: def name {paramStr} : _ := {bodyStr}"

  "\n".intercalate ([header] ++ constLines ++ varLines ++ [schematic])

-- ════════════════════════════════════════════════════════════════════════════
-- § 4  Lean command generation (label-based, no type info needed)
-- ════════════════════════════════════════════════════════════════════════════

/-- Generate a candidate Lean command string from a proposal.

    Uses the LABELS from constant slots directly as Lean identifiers.
    When `typeMap` is provided (from `buildTypeMap`), parameter types are
    filled in from the Explode entries; otherwise `_` holes are used.  -/
def generateDefCommand
    (proposal : ExtractionProposal)
    (name     : String)
    (typeMap  : Std.HashMap ContentHash String := {}) : String :=
  let escape (s : String) : String :=
    let needsEscape := s.any fun c =>
      c == '∀' || c == '∃' || c == '✝' || c == '·' || c == ' '
      || c == '(' || c == ')' || (c == '.' && s.contains ' ')
    if needsEscape then "`" ++ s ++ "`" else s

  -- For each varying-slot parameter, look up the typeStr of the first instance
  -- at that position (varIds[0]) in typeMap.  Falls back to `_` when the map
  -- is empty (e.g. before buildTypeMap's TODO is filled in).
  let paramDecls := proposal.varSlots.toList.map fun s =>
    let typeAnn := s.varIds[0]?.bind (fun id => typeMap.get? id) |>.getD "_"
    s!"(param_p{s.pos} : {typeAnn})"
  let paramStr := " ".intercalate paramDecls

  let slotExprs := (Array.range proposal.slotCount).toList.map fun pos =>
    if let some sa := proposal.constSlots.find? (·.pos == pos) then
      escape (sa.constLabel.getD "_")
    else
      s!"param_p{pos}"

  let bodyStr := match slotExprs with
    | []         => "_"
    | [x]        => x
    | fn :: args => fn ++ " " ++ " ".intercalate args

  s!"private def {name} {paramStr} : _ := {bodyStr}"

-- ════════════════════════════════════════════════════════════════════════════
-- § 5  MetaM extraction — builds the abstracted Lean term
-- ════════════════════════════════════════════════════════════════════════════

/-- Build a ContentHash → typeStr map from the Explode entries for `theoremName`.

    Each entry's `entry.type` (MessageData) gives the type of that proof node.
    This map is passed to `generateDefCommand` so it can replace `_` holes
    with real type strings, producing commands that elaborate without errors.
-/
private def buildTypeMap (theoremName : Name) : MetaM (Std.HashMap ContentHash String) := do
  let some ci := (← getEnv).find? theoremName
    | return {}
  let some val := ci.value?
    | return {}
  -- Call Mathlib's explode to get the Entries structure
  let entries ← Mathlib.Explode.explode val
  let richNodes ← buildRichIR entries
  return richNodes.foldl (fun m n => m.insert n.contentId n.typeStr) {}


structure ExtractionResult where
  defName     : String
  proposal    : ExtractionProposal
  leanCommand : String        -- the generated def command
  accepted    : Bool          -- did Lean elaborate it without errors?
  newGraphId  : ContentHash   -- FNV hash of (oldGraphId ++ defName)
  deriving Repr

/-- Full extraction pipeline (MetaM). -/
def extractDefn
    (theoremName : Name)
    (clusterId   : Nat)
    (defName     : String) : MetaM ExtractionResult := do
  let graph ← findComonadsCore theoremName

  let cluster ← match graph.clusters.find? (·.id == clusterId) with
    | some c => pure c
    | none   => throwError
        s!"ExtractDefn: cluster {clusterId} not found in proof of {theoremName}.\n\
           Available cluster ids: {graph.clusters.map (·.id)}"

  if !cluster.extractable then
    throwError
      s!"ExtractDefn: cluster {clusterId} is not extractable.\n\
         Reason: shapeKey='{cluster.shapeKey}' or fewer than 2 closed instances."

  let proposal   := analyzeCluster cluster
  let typeMap    ← buildTypeMap theoremName
  let cmd        := generateDefCommand proposal defName typeMap
  let newGraphId := hashPPExpr (graph.graphId ++ defName ++ cmd)

  return { defName     := defName
           proposal    := proposal
           leanCommand := cmd
           accepted    := false   -- updated by #extractCluster after elabCommand
           newGraphId  := newGraphId }

-- ════════════════════════════════════════════════════════════════════════════
-- § 6  JSON serialisation for Pantograph bridge
-- ════════════════════════════════════════════════════════════════════════════

private def slotAnalysisToJson (s : SlotAnalysis) : Lean.Json :=
  .mkObj [ ("pos",        .num ⟨s.pos, 0⟩)
         , ("isConstant", .bool s.isConstant)
         , ("constId",    s.constId.map .str |>.getD .null)
         , ("constLabel", s.constLabel.map .str |>.getD .null)
         , ("varLabels",  .arr (s.varLabels.map .str))
         , ("varIds",     .arr (s.varIds.map .str)) ]

private def proposalToJson (p : ExtractionProposal) : Lean.Json :=
  .mkObj [ ("clusterId",     .num ⟨p.clusterId, 0⟩)
         , ("shapeKey",      .str p.shapeKey)
         , ("instanceCount", .num ⟨p.instanceCount, 0⟩)
         , ("slotCount",     .num ⟨p.slotCount, 0⟩)
         , ("constSlots",    .arr (p.constSlots.map slotAnalysisToJson))
         , ("varSlots",      .arr (p.varSlots.map slotAnalysisToJson)) ]

def resultToJson (r : ExtractionResult) : Lean.Json :=
  .mkObj [ ("schema",      .str "extract/1")
         , ("defName",     .str r.defName)
         , ("accepted",    .bool r.accepted)
         , ("leanCommand", .str r.leanCommand)
         , ("newGraphId",  .str r.newGraphId)
         , ("proposal",    proposalToJson r.proposal) ]

end ExtractDefn

-- ════════════════════════════════════════════════════════════════════════════
-- § 7  Commands
-- ════════════════════════════════════════════════════════════════════════════

open ExtractDefn in
/-- `#analyzeCluster N from theoremName`

    Pure analysis — no Lean environment modification.
    Prints the constant/varying slot breakdown and the schematic definition.  -/
elab "#analyzeCluster " cid:num " from " thm:ident : command => do
  let n         := thm.getId
  let clusterId := cid.getNat
  let graph     ← liftTermElabM (findComonadsCore n)
  match graph.clusters.find? (·.id == clusterId) with
  | none   => logError m!"cluster {clusterId} not found"
  | some c =>
    let proposal := analyzeCluster c
    logInfo m!"{formatProposal proposal}"

open ExtractDefn in
/-- `#extractCluster N as "defName" from theoremName`

    Generates and elaborates a Lean definition capturing cluster N's shared
    structure.  Prints a warning (not a false ✓) if elaboration produces
    errors — this happens when `_` type holes are too ambiguous.  -/
elab "#extractCluster " cid:num " as " name:str " from " thm:ident : command => do
  let n         := thm.getId
  let clusterId := cid.getNat
  let defName   := name.getString
  let result ← liftTermElabM (extractDefn n clusterId defName)
  logInfo m!"Generated command:\n  {result.leanCommand}\n\
             new graphId: {result.newGraphId}"
  -- Parser.runParserCategory is a pure function returning Except String Syntax.
  let cmdStx := Parser.runParserCategory
    (← getEnv) `command result.leanCommand "<extractCluster>"
  match cmdStx with
  | Except.error err =>
    logError m!"Parse error in generated command:\n  {err}\n\
                Run #analyzeCluster {clusterId} from {thm.getId} for guidance."
  | Except.ok stx =>
    -- Snapshot the message log before elaboration so we can detect new errors.
    -- elabCommand returns Unit regardless of internal errors, so we must check
    -- the log ourselves — otherwise the ✓ message fires even on failure.
    let stateBefore ← get
    elabCommand stx
    let stateAfter ← get
    let errorsBefore := stateBefore.messages.toArray.filter (·.severity == .error)
    let errorsAfter  := stateAfter.messages.toArray.filter  (·.severity == .error)
    if errorsAfter.size == errorsBefore.size then
      logInfo m!"✓ definition '{defName}' added to environment."
    else
      -- Roll back to pre-elaboration state: removes raw error messages from
      -- the panel AND undoes any partial env changes from the failed def.
      set stateBefore
      logWarning m!"definition '{defName}' elaborated with errors — \
                    add explicit type annotations to the generated command.\n\
                    Run #analyzeCluster {clusterId} from {thm.getId} for guidance."

open ExtractDefn in
/-- `#extractClusterJSON N as "defName" from theoremName`

    Machine-readable version for Pantograph / Mathematica.
    Emits the extract/1 JSON envelope without elaborating the definition.  -/
elab "#extractClusterJSON " cid:num " as " name:str " from " thm:ident : command => do
  let n         := thm.getId
  let clusterId := cid.getNat
  let defName   := name.getString
  let result ← liftTermElabM (extractDefn n clusterId defName)
  logInfo m!"{(resultToJson result).compress}"

-- ════════════════════════════════════════════════════════════════════════════
-- § 8  Tests (skipped on port — see note in ComonadFinder.lean § 13)
-- ════════════════════════════════════════════════════════════════════════════