feat: basic premise selection algorithm based on MePo (#7844)

This PR adds a simple implementation of MePo, from "Lightweight relevance filtering for machine-generated resolution problems" by Meng and Paulson. This needs tuning, but is already useful as a baseline or test case. --------- Co-authored-by: Thomas Zhu <thomas.zhu.sh@hotmail.com>
2025-09-23 16:40:22 +10:00 · 2025-09-23 16:40:22 +10:00 · 0807f73171
commit 0807f73171
parent 27fa5b0bb5
5 changed files with 302 additions and 133 deletions
--- a/src/Lean/Data/NameMap.lean
+++ b/src/Lean/Data/NameMap.lean
@ -8,5 +8,3 @@ module
 prelude
 public import Lean.Data.NameMap.Basic
 public import Lean.Data.NameMap.AdditionalOperations
-
-public section
--- a/src/Lean/PremiseSelection.lean
+++ b/src/Lean/PremiseSelection.lean
@ -6,134 +6,5 @@ Authors: Kim Morrison
 module

 prelude
-public import Lean.Elab.Command
-public import Lean.Meta.Eval
-public import Lean.Meta.CompletionName
-public import Init.Data.Random
-
-public section
-
-/-!
-# An API for premise selection algorithms.
-
-This module provides a basic API for premise selection algorithms,
-which are used to suggest identifiers that should be introduced in a proof.
-
-The core interface is the `Selector` type, which is a function from a metavariable
-and a configuration to a list of suggestions.
-
-The `Selector` is registered as an environment extension, and the trivial (no suggestions) implementation
-is `Lean.PremiseSelection.empty`.
-
-Lean does not provide a default premise selector, so this module is intended to be used in conjunction
-with a downstream package which registers a premise selector.
-/
-
-namespace Lean.PremiseSelection
-
-/--
-A `Suggestion` is essentially just an identifier and a confidence score that the identifier is relevant.
-If the premise selection request included information about the intended use (e.g. in the simplifier, in `grind`, etc.)
-the score may be adjusted for that application.
-
-/
-structure Suggestion where
-  name : Name
-  /--
-  The score of the suggestion, as a probability that this suggestion should be used.
-  -/
-  score : Float
-
-structure Config where
-  /--
-  The maximum number of suggestions to return.
-  -/
-  maxSuggestions : Option Nat := none
-  /--
-  The tactic that is calling the premise selection, e.g. `simp`, `grind`, or `aesop`.
-  This may be used to adjust the score of the suggestions
-  -/
-  caller : Option Name := none
-  /--
-  A filter on suggestions; only suggestions returning `true` should be returned.
-  (It can be better to filter on the premise selection side, to ensure that enough suggestions are returned.)
-  -/
-  filter : Name → MetaM Bool := fun _ => pure true
-  /--
-  An optional arbitrary "hint" to the premise selection algorithm.
-  There is no guarantee that the algorithm will make any use of the hint.
-
-  Potential use cases include a natural language comment provided by the user
-  (e.g. allowing use of the premise selector as a search engine)
-  or including context from the current proof and/or file.
-
-  We may later split these use cases into separate fields if necessary.
-  -/
-  hint : Option String := none
-
-abbrev Selector : Type := MVarId → Config → MetaM (Array Suggestion)
-
-/--
-The trivial premise selector, which returns no suggestions.
-/
-def empty : Selector := fun _ _ => pure #[]
-
-/-- A random premise selection algorithm, provided solely for testing purposes. -/
-def random (gen : StdGen := ⟨37, 59⟩) : Selector := fun _ cfg => do
-  IO.stdGenRef.set gen
-  let env ← getEnv
-  let max := cfg.maxSuggestions.getD 10
-  let consts := env.const2ModIdx.keysArray
-  let mut suggestions := #[]
-  while suggestions.size < max do
-    let i ← IO.rand 0 consts.size
-    let name := consts[i]!
-    if ! (`Lean).isPrefixOf name && Lean.Meta.allowCompletion env name then
-      suggestions := suggestions.push { name := name, score := 1.0 / consts.size.toFloat }
-  return suggestions
-
-initialize premiseSelectorExt : EnvExtension (Option Selector) ←
-  registerEnvExtension (pure none)
-
-/-- Generate premise suggestions for the given metavariable, using the currently registered premise selector. -/
-def select (m : MVarId) (c : Config := {}) : MetaM (Array Suggestion) := do
-  let some selector := premiseSelectorExt.getState (← getEnv) |
-    throwError "No premise selector registered. \
-      (Note the Lean does not provide a default premise selector, these must be installed by a downstream library.)"
-  selector m c
-
-/-!
-Currently the registration mechanism is just global state.
-This means that if multiple modules register premise selectors,
-the behaviour will be dependent on the order of loading modules.
-
-We should replace this with a mechanism so that
-premise selectors are configured via options in the `lakefile`, and
-commands are only used to override in a single declaration or file.
-/
-
-/-- Set the current premise selector.-/
-def registerPremiseSelector (selector : Selector) : CoreM Unit := do
-  modifyEnv fun env => premiseSelectorExt.setState env (some selector)
-
-open Lean Elab Command in
-@[builtin_command_elab setPremiseSelectorCmd, inherit_doc setPremiseSelectorCmd]
-def elabSetPremiseSelector : CommandElab
-  | `(command| set_premise_selector $selector) => do
-    let selector ← liftTermElabM do
-      try
-        let selectorTerm ← Term.elabTermEnsuringType selector (some (Expr.const ``Selector []))
-        unsafe Meta.evalExpr Selector (Expr.const ``Selector []) selectorTerm
-      catch _ =>
-        throwError "Failed to elaborate {selector} as a `MVarId → Config → MetaM (Array Suggestion)`."
-    liftCoreM (registerPremiseSelector selector)
-  | _ => throwUnsupportedSyntax
-
-open Lean.Elab.Tactic in
-@[builtin_tactic Lean.Parser.Tactic.suggestPremises] def evalSuggestPremises : Tactic := fun _ =>
-  liftMetaTactic1 fun mvarId => do
-    let suggestions ← select mvarId
-    logInfo m!"Premise suggestions: {suggestions.map (·.name)}"
-    return mvarId
-
-end Lean.PremiseSelection
+import Lean.PremiseSelection.Basic
+import Lean.PremiseSelection.MePo
--- a/src/Lean/PremiseSelection/Basic.lean
+++ b/src/Lean/PremiseSelection/Basic.lean
@ -0,0 +1,180 @@
+/-
+Copyright (c) 2025 Lean FRO, LLC. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: Kim Morrison
+-/
+module
+
+prelude
+public import Lean.Elab.Command
+public import Lean.Meta.Eval
+public import Lean.Meta.CompletionName
+public import Init.Data.Random
+
+/-!
+# An API for premise selection algorithms.
+
+This module provides a basic API for premise selection algorithms,
+which are used to suggest identifiers that should be introduced in a proof.
+
+The core interface is the `Selector` type, which is a function from a metavariable
+and a configuration to a list of suggestions.
+
+The `Selector` is registered as an environment extension, and the trivial (no suggestions) implementation
+is `Lean.PremiseSelection.empty`.
+
+Lean does not provide a default premise selector, so this module is intended to be used in conjunction
+with a downstream package which registers a premise selector.
+-/
+
+@[expose] public section
+
+namespace Lean.PremiseSelection
+
+/--
+A `Suggestion` is essentially just an identifier and a confidence score that the identifier is relevant.
+If the premise selection request included information about the intended use (e.g. in the simplifier, in `grind`, etc.)
+the score may be adjusted for that application.
+
+-/
+structure Suggestion where
+  name : Name
+  /--
+  The score of the suggestion, as a probability that this suggestion should be used.
+  -/
+  score : Float
+
+structure Config where
+  /--
+  The maximum number of suggestions to return.
+  -/
+  maxSuggestions : Option Nat := none
+  /--
+  The tactic that is calling the premise selection, e.g. `simp`, `grind`, or `aesop`.
+  This may be used to adjust the score of the suggestions
+  -/
+  caller : Option Name := none
+  /--
+  A filter on suggestions; only suggestions returning `true` should be returned.
+  (It can be better to filter on the premise selection side, to ensure that enough suggestions are returned.)
+  -/
+  filter : Name → MetaM Bool := fun _ => pure true
+  /--
+  An optional arbitrary "hint" to the premise selection algorithm.
+  There is no guarantee that the algorithm will make any use of the hint.
+
+  Potential use cases include a natural language comment provided by the user
+  (e.g. allowing use of the premise selector as a search engine)
+  or including context from the current proof and/or file.
+
+  We may later split these use cases into separate fields if necessary.
+  -/
+  hint : Option String := none
+
+abbrev Selector : Type := MVarId → Config → MetaM (Array Suggestion)
+
+section DenyList
+
+/-- Premises from a module whose name has one of the following components are not retrieved. -/
+builtin_initialize moduleDenyListExt : SimplePersistentEnvExtension String (List String) ←
+  registerSimplePersistentEnvExtension {
+    addEntryFn := (·.cons)
+    addImportedFn := mkStateFromImportedEntries (·.cons) []
+  }
+
+/-- A premise whose name has one of the following components is not retrieved. -/
+builtin_initialize nameDenyListExt : SimplePersistentEnvExtension String (List String) ←
+  registerSimplePersistentEnvExtension {
+    addEntryFn := (·.cons)
+    addImportedFn := mkStateFromImportedEntries (·.cons) []
+  }
+
+/-- A premise whose `type.getForallBody.getAppFn` is a constant that has one of these prefixes is not retrieved. -/
+builtin_initialize typePrefixDenyListExt : SimplePersistentEnvExtension Name (List Name) ←
+  registerSimplePersistentEnvExtension {
+    addEntryFn := (·.cons)
+    addImportedFn := mkStateFromImportedEntries (·.cons) []
+  }
+
+def isDeniedModule (env : Environment) (moduleName : Name) : Bool :=
+  (moduleDenyListExt.getState env).any fun p => moduleName.anyS (· == p)
+
+def isDeniedPremise (env : Environment) (name : Name) : Bool := Id.run do
+  if name == ``sorryAx then return true
+  if name.isInternalDetail then return true
+  if (nameDenyListExt.getState env).any (fun p => name.anyS (· == p)) then return true
+  if let some moduleIdx := env.getModuleIdxFor? name then
+    let moduleName := env.header.moduleNames[moduleIdx.toNat]!
+    if isDeniedModule env moduleName then
+      return true
+  let some ci := env.find? name | return true
+  if let .const fnName _ := ci.type.getForallBody.getAppFn then
+    if (typePrefixDenyListExt.getState env).any (fun p => p.isPrefixOf fnName) then return true
+  return false
+
+end DenyList
+
+/--
+The trivial premise selector, which returns no suggestions.
+-/
+def empty : Selector := fun _ _ => pure #[]
+
+/-- A random premise selection algorithm, provided solely for testing purposes. -/
+def random (gen : StdGen := ⟨37, 59⟩) : Selector := fun _ cfg => do
+  IO.stdGenRef.set gen
+  let env ← getEnv
+  let max := cfg.maxSuggestions.getD 10
+  let consts := env.const2ModIdx.keysArray
+  let mut suggestions := #[]
+  while suggestions.size < max do
+    let i ← IO.rand 0 consts.size
+    let name := consts[i]!
+    unless isDeniedPremise env name do
+      suggestions := suggestions.push { name := name, score := 1.0 / consts.size.toFloat }
+  return suggestions
+
+builtin_initialize premiseSelectorExt : EnvExtension (Option Selector) ←
+  registerEnvExtension (pure none)
+
+/-- Generate premise suggestions for the given metavariable, using the currently registered premise selector. -/
+def select (m : MVarId) (c : Config := {}) : MetaM (Array Suggestion) := do
+  let some selector := premiseSelectorExt.getState (← getEnv) |
+    throwError "No premise selector registered. \
+      (Note the Lean does not provide a default premise selector, these must be installed by a downstream library.)"
+  selector m c
+
+/-!
+Currently the registration mechanism is just global state.
+This means that if multiple modules register premise selectors,
+the behaviour will be dependent on the order of loading modules.
+
+We should replace this with a mechanism so that
+premise selectors are configured via options in the `lakefile`, and
+commands are only used to override in a single declaration or file.
+-/
+
+/-- Set the current premise selector.-/
+def registerPremiseSelector (selector : Selector) : CoreM Unit := do
+  modifyEnv fun env => premiseSelectorExt.setState env (some selector)
+
+open Lean Elab Command in
+@[builtin_command_elab setPremiseSelectorCmd, inherit_doc setPremiseSelectorCmd]
+def elabSetPremiseSelector : CommandElab
+  | `(command| set_premise_selector $selector) => do
+    let selector ← liftTermElabM do
+      try
+        let selectorTerm ← Term.elabTermEnsuringType selector (some (Expr.const ``Selector []))
+        unsafe Meta.evalExpr Selector (Expr.const ``Selector []) selectorTerm
+      catch _ =>
+        throwError "Failed to elaborate {selector} as a `MVarId → Config → MetaM (Array Suggestion)`."
+    liftCoreM (registerPremiseSelector selector)
+  | _ => throwUnsupportedSyntax
+
+open Lean.Elab.Tactic in
+@[builtin_tactic Lean.Parser.Tactic.suggestPremises] def evalSuggestPremises : Tactic := fun _ =>
+  liftMetaTactic1 fun mvarId => do
+    let suggestions ← select mvarId
+    logInfo m!"Premise suggestions: {suggestions.map (·.name)}"
+    return mvarId
+
+end Lean.PremiseSelection
--- a/src/Lean/PremiseSelection/MePo.lean
+++ b/src/Lean/PremiseSelection/MePo.lean
@ -0,0 +1,94 @@
+/-
+Copyright (c) 2025 Lean FRO, LLC. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: Kim Morrison
+-/
+module
+
+prelude
+public import Lean.PremiseSelection.Basic
+import Lean.Util.FoldConsts
+import Lean.Meta.Basic
+
+/-!
+# MePo premise selection
+
+This is a naive implement of the MePo premise selection algorithm, from
+"Lightweight relevance filtering for machine-generated resolution problems" by Meng and Paulson.
+
+It needs to be tuned and evaluated for Lean.
+-/
+
+open Lean
+
+namespace Lean.PremiseSelection.MePo
+
+def symbolFrequency (env : Environment) : NameMap Nat := Id.run do
+  let mut map := {}
+  for (_, ci) in env.constants do
+    for n' in ci.type.getUsedConstantsAsSet do
+      map := map.alter n' fun i? => some (i?.getD 0 + 1)
+  return map
+
+def weightedScore (weight : Name → Float) (relevant candidate : NameSet) : Float :=
+  let S := candidate
+  let R := relevant ∩ S
+  let R' := S \ R
+  let M := R.foldl (fun acc n => acc + weight n) 0
+  M / (M + R'.size.toFloat)
+
+-- This function is taken from the MePo paper and needs to be tuned.
+def weightFunction (n : Nat) := 1.0 + 2.0 / (n.log2.toFloat + 1.0)
+
+def frequencyScore (frequency : Name → Nat) (relevant candidate : NameSet) : Float :=
+  weightedScore (fun n => weightFunction (frequency n)) relevant candidate
+
+def unweightedScore (relevant candidate : NameSet) : Float := weightedScore (fun _ => 1) relevant candidate
+
+def mepo (initialRelevant : NameSet) (score : NameSet → NameSet → Float) (accept : ConstantInfo → CoreM Bool) (p : Float) (c : Float) : CoreM (Array (Name × Float)) := do
+  let env ← getEnv
+  let mut p := p
+  let mut candidates := #[]
+  let mut relevant := initialRelevant
+  let mut accepted : Array (Name × Float) := {}
+  for (n, ci) in env.constants do
+    if ← accept ci then
+      candidates := candidates.push (n, ci.type.getUsedConstantsAsSet)
+  while candidates.size > 0 do
+    let (newAccepted, candidates') := candidates.map (fun (n, c) => (n, c, score relevant c)) |>.partition fun (_, _, s) => p ≤ s
+    if newAccepted.isEmpty then return accepted
+    accepted := newAccepted.foldl (fun acc (n, _, s) => acc.push (n, s)) accepted
+    candidates := candidates'.map fun (n, c, _) => (n, c)
+    relevant := newAccepted.foldl (fun acc (_, ns, _) => acc ++ ns) relevant
+    p := p + (1 - p) / c
+  return accepted
+
+open Lean Meta MVarId in
+def _root_.Lean.MVarId.getConstants (g : MVarId) : MetaM NameSet := withContext g do
+  let mut c := (← g.getType).getUsedConstantsAsSet
+  for t in (← getLocalHyps) do
+    c := c ∪ (← inferType t).getUsedConstantsAsSet
+  return c
+
+end MePo
+
+open MePo
+
+-- The values of p := 0.6 and c := 2.4 are taken from the MePo paper, and need to be tuned.
+-- When retrieving ≤32 premises for use by downstream automation, Thomas Zhu suggests that c := 0.5 is optimal.
+public def mepoSelector (useRarity : Bool) (p : Float := 0.6) (c : Float := 2.4) : Selector := fun g config => do
+  let constants ← g.getConstants
+  let env ← getEnv
+  let score := if useRarity then
+    let frequency := symbolFrequency env
+    frequencyScore (frequency.getD · 0)
+  else
+    unweightedScore
+  let accept := fun ci => return !isDeniedPremise env ci.name
+  let suggestions ← mepo constants score accept p c
+  let suggestions := suggestions
+    |>.map (fun (n, s) => { name := n, score := s })
+    |>.reverse  -- we favor constants that appear at the end of `env.constants`
+  match config.maxSuggestions with
+  | none => return suggestions
+  | some k => return suggestions.take k
--- a/tests/lean/run/premise_selection_mepo.lean
+++ b/tests/lean/run/premise_selection_mepo.lean
@ -0,0 +1,26 @@
+import Lean.PremiseSelection.MePo
+
+set_premise_selector Lean.PremiseSelection.mepoSelector (useRarity := false)
+
+example (a b : Int) : a + b = b + a := by
+  suggest_premises
+  sorry
+
+-- #time
+example (x y z : List Int) : x ++ y ++ z = x ++ (y ++ z) := by
+  suggest_premises
+  sorry
+
+-- `useRarity` is too slow in practice: it requires analyzing all the types in the environment.
+-- It would need to be cached.
+
+-- set_premise_selector Lean.PremiseSelection.mepoSelector (useRarity := true)
+
+-- example (a b : Int) : a + b = b + a := by
+--   suggest_premises
+--   sorry
+
+-- #time
+-- example (x y z : List Int) : x ++ y ++ z = x ++ (y ++ z) := by
+--   suggest_premises
+--   sorry