lean4-htt/src/Lean/Data/EditDistance.lean
Markus Himmel 2c2fcff4f8
refactor: do not use String.Iterator (#11127)
This PR removes all uses of `String.Iterator` from core, preferring
`String.ValidPos` instead.

In an upcoming PR, `String.Iterator` will be renamed to
`String.Legacy.Iterator`.
2025-11-11 11:46:58 +00:00

57 lines
1.7 KiB
Text

/-
Copyright (c) 2024-2025 Lean FRO, LLC. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: David Thrane Christiansen
-/
module
prelude
public import Init.Data.String.Basic
import Init.Data.Vector.Basic
set_option linter.missingDocs true
namespace Lean.EditDistance
/--
Bounded Levenshtein (edit) distance between `str1` and `str2`.

The result is `none` only when the distance is guaranteed to exceed `cutoff`. The converse does not
hold: a `some d` result may still carry a distance `d` that is larger than `cutoff`, so callers that
need a strict bound must compare `d` against `cutoff` themselves.
-/
public def levenshtein (str1 str2 : String) (cutoff : Nat) : Option Nat := Id.run do
  let srcLen := str1.length
  let tgtLen := str2.length
  -- Two strings can never be closer than the gap between their lengths.
  let lengthGap := max srcLen tgtLen - min srcLen tgtLen
  if cutoff < lengthGap then return none
  -- Two-row table: `prev` holds the finished row, `curr` is the row currently being filled in.
  let mut prev := Vector.replicate (tgtLen + 1) 0
  let mut curr := prev
  -- Row zero: entry `k` starts out as `k`.
  for hk : k in [0:prev.size] do
    prev := prev.set k k
  let mut pos1 := str1.startValidPos
  let mut row := 0
  while hRow : ¬pos1.IsAtEnd do
    -- Column zero of the new row.
    curr := curr.set 0 (row + 1)
    let mut pos2 := str2.startValidPos
    -- `col` is a `Fin`, so `col + 1` is modular addition.
    -- NOTE(review): this relies on the inner loop running at most `tgtLen` times so the index
    -- never wraps; confirm that `String.length` counts exactly the positions this walk visits.
    let mut col : Fin (tgtLen + 1) := 0
    while hCol : ¬pos2.IsAtEnd do
      let colNext : Fin _ := col + 1
      let deletionCost := prev[colNext] + 1
      let insertionCost := curr[col] + 1
      -- Substitution is free when the two current characters agree.
      let mismatch : Nat := if pos1.get hRow == pos2.get hCol then 0 else 1
      let substCost := prev[col] + mismatch
      let best := min deletionCost (min insertionCost substCost)
      curr := curr.set colNext best
      pos2 := pos2.next hCol
      col := col + 1
    pos1 := pos1.next hRow
    row := row + 1
    -- Give up as soon as every entry of the freshly computed row is already above the cutoff.
    if curr.all (fun cost => cutoff < cost) then return none
    prev := curr
  return some prev[tgtLen]