diff --git a/src/Init/Data/String/Pattern/String.lean b/src/Init/Data/String/Pattern/String.lean index 75ccb1f2a4..b8dc9ce662 100644 --- a/src/Init/Data/String/Pattern/String.lean +++ b/src/Init/Data/String/Pattern/String.lean @@ -10,6 +10,7 @@ public import Init.Data.String.Pattern.Basic public import Init.Data.Iterators.Internal.Termination public import Init.Data.Iterators.Consumers.Monadic.Loop import Init.Data.String.Termination +public import Init.Data.Vector.Basic set_option doc.verso true @@ -22,67 +23,87 @@ public section namespace String.Slice.Pattern -inductive ForwardSliceSearcher (s : Slice) where +namespace ForwardSliceSearcher + +def buildTable (pat : Slice) : Vector Nat pat.utf8ByteSize := + if h : pat.utf8ByteSize = 0 then + #v[].cast h.symm + else + let arr := Array.emptyWithCapacity pat.utf8ByteSize + let arr' := arr.push 0 + go arr' (by simp [arr']) (by simp [arr', arr]; omega) (by simp [arr', arr]) +where + go (table : Array Nat) (ht₀ : 0 < table.size) (ht : table.size ≤ pat.utf8ByteSize) (h : ∀ (i : Nat) hi, table[i]'hi ≤ i) : + Vector Nat pat.utf8ByteSize := + if hs : table.size < pat.utf8ByteSize then + let patByte := pat.getUTF8Byte ⟨table.size⟩ hs + let dist := computeDistance patByte table ht h (table[table.size - 1]) + (by have := h (table.size - 1) (by omega); omega) + let dist' := if pat.getUTF8Byte ⟨dist.1⟩ (by simp [Pos.Raw.lt_iff]; omega) = patByte then dist.1 + 1 else dist + go (table.push dist') (by simp) (by simp; omega) (by + intro i hi + by_cases hi' : i = table.size + · subst hi' + simp [dist'] + have := dist.2 + split <;> omega + · rw [Array.getElem_push_lt] + · apply h + · simp at hi + omega) + else + Vector.mk table (by omega) + + computeDistance (patByte : UInt8) (table : Array Nat) + (ht : table.size ≤ pat.utf8ByteSize) + (h : ∀ (i : Nat) hi, table[i]'hi ≤ i) (guess : Nat) (hg : guess < table.size) : + { n : Nat // n < table.size } := + if h' : guess = 0 ∨ pat.getUTF8Byte ⟨guess⟩ (by simp [Pos.Raw.lt_iff]; omega) = patByte then + ⟨guess, hg⟩ + else + have : table[guess - 1] < guess := by have := h (guess - 1) (by omega); omega + computeDistance patByte table ht h table[guess - 1] (by omega) + +theorem getElem_buildTable_le (pat : Slice) (i : Nat) (hi) : (buildTable pat)[i]'hi ≤ i := by + rw [buildTable] + split <;> rename_i h + · simp [h] at hi + · simp only [Array.emptyWithCapacity_eq, List.push_toArray, List.nil_append] + suffices ∀ pat' table ht₀ ht h (i : Nat) hi, (buildTable.go pat' table ht₀ ht h)[i]'hi ≤ i from this .. + intro pat' table ht₀ ht h i hi + fun_induction buildTable.go with + | case1 => assumption + | case2 table ht₀ ht ht' ht'' => apply ht' + +inductive _root_.String.Slice.Pattern.ForwardSliceSearcher (s : Slice) where | emptyBefore (pos : s.Pos) | emptyAt (pos : s.Pos) (h : pos ≠ s.endPos) - | proper (needle : Slice) (table : Array String.Pos.Raw) (stackPos : String.Pos.Raw) (needlePos : String.Pos.Raw) + | proper (needle : Slice) (table : Vector Nat needle.utf8ByteSize) (ht : table = buildTable needle) + (stackPos : String.Pos.Raw) (needlePos : String.Pos.Raw) (hn : needlePos < needle.rawEndPos) | atEnd deriving Inhabited -namespace ForwardSliceSearcher - -partial def buildTable (pat : Slice) : Array String.Pos.Raw := - if pat.utf8ByteSize == 0 then - #[] - else - let arr := Array.emptyWithCapacity pat.utf8ByteSize - let arr := arr.push 0 - go ⟨1⟩ arr -where - go (pos : String.Pos.Raw) (table : Array String.Pos.Raw) := - if h : pos < pat.rawEndPos then - let patByte := pat.getUTF8Byte pos h - let distance := computeDistance table[table.size - 1]! patByte table - let distance := if patByte = pat.getUTF8Byte! distance then distance.inc else distance - go pos.inc (table.push distance) - else - table - - computeDistance (distance : String.Pos.Raw) (patByte : UInt8) (table : Array String.Pos.Raw) : - String.Pos.Raw := - if distance > 0 && patByte != pat.getUTF8Byte! distance then - computeDistance table[distance.byteIdx - 1]! patByte table - else - distance - @[inline] def iter (s : Slice) (pat : Slice) : Std.Iter (α := ForwardSliceSearcher s) (SearchStep s) := - if pat.utf8ByteSize == 0 then + if h : pat.utf8ByteSize = 0 then { internalState := .emptyBefore s.startPos } else - { internalState := .proper pat (buildTable pat) s.startPos.offset pat.startPos.offset } - -partial def backtrackIfNecessary (pat : Slice) (table : Array String.Pos.Raw) (stackByte : UInt8) - (needlePos : String.Pos.Raw) : String.Pos.Raw := - if needlePos != 0 && stackByte != pat.getUTF8Byte! needlePos then - backtrackIfNecessary pat table stackByte table[needlePos.byteIdx - 1]! - else - needlePos + { internalState := .proper pat (buildTable pat) rfl s.startPos.offset pat.startPos.offset + (by simp [Pos.Raw.lt_iff]; omega) } instance (s : Slice) : Std.Iterators.Iterator (ForwardSliceSearcher s) Id (SearchStep s) where IsPlausibleStep it - | .yield it' out => - match it.internalState with + | .yield it' out | .skip it' => + match it.internalState with | .emptyBefore pos => (∃ h, it'.internalState = .emptyAt pos h) ∨ it'.internalState = .atEnd | .emptyAt pos h => ∃ newPos, pos < newPos ∧ it'.internalState = .emptyBefore newPos - | .proper needle table stackPos needlePos => - (∃ newStackPos newNeedlePos, - stackPos < newStackPos ∧ - newStackPos ≤ s.rawEndPos ∧ - it'.internalState = .proper needle table newStackPos newNeedlePos) ∨ + | .proper needle table ht stackPos needlePos hn => + (∃ newStackPos newNeedlePos hn, + it'.internalState = .proper needle table ht newStackPos newNeedlePos hn ∧ + ((s.utf8ByteSize - newStackPos.byteIdx < s.utf8ByteSize - stackPos.byteIdx) ∨ + (newStackPos = stackPos ∧ newNeedlePos < needlePos))) ∨ it'.internalState = .atEnd | .atEnd => False - | .skip _ => False | .done => True step := fun ⟨iter⟩ => match iter with @@ -95,67 +116,102 @@ instance (s : Slice) : Std.Iterators.Iterator (ForwardSliceSearcher s) Id (Searc | .emptyAt pos h => let res := .rejected pos (pos.next h) pure (.deflate ⟨.yield ⟨.emptyBefore (pos.next h)⟩ res, by simp⟩) - | .proper needle table stackPos needlePos => - let rec findNext (startPos : String.Pos.Raw) - (currStackPos : String.Pos.Raw) (needlePos : String.Pos.Raw) (h : stackPos ≤ currStackPos) := - if h1 : currStackPos < s.rawEndPos then - let stackByte := s.getUTF8Byte currStackPos h1 - let needlePos := backtrackIfNecessary needle table stackByte needlePos - let patByte := needle.getUTF8Byte! needlePos - if stackByte != patByte then - let nextStackPos := s.findNextPos currStackPos h1 |>.offset - let res := .rejected (s.pos! startPos) (s.pos! nextStackPos) - have hiter := by - left - exists nextStackPos - have haux := lt_offset_findNextPos h1 - simp only [String.Pos.Raw.lt_iff, proper.injEq, true_and, exists_and_left, exists_eq', and_true, - nextStackPos] - constructor - · simp [String.Pos.Raw.le_iff, String.Pos.Raw.lt_iff] at h haux ⊢ - omega - · apply Pos.Raw.IsValidForSlice.le_utf8ByteSize - apply Pos.isValidForSlice - .deflate ⟨.yield ⟨.proper needle table nextStackPos needlePos⟩ res, hiter⟩ + | .proper needle table htable stackPos needlePos hn => + -- **Invariant 1:** we have already covered everything up until `stackPos - needlePos` (exclusive), + -- with matches and rejections. + -- **Invariant 2:** `stackPos - needlePos` is a valid position + -- **Invariant 3:** the range from from `stackPos - needlePos` to `stackPos` (exclusive) is a + -- prefix of the pattern. + if h₁ : stackPos < s.rawEndPos then + let stackByte := s.getUTF8Byte stackPos h₁ + let patByte := needle.getUTF8Byte needlePos hn + if stackByte = patByte then + let nextStackPos := stackPos.inc + let nextNeedlePos := needlePos.inc + if h : nextNeedlePos = needle.rawEndPos then + -- Safety: the section from `nextStackPos.decreaseBy needle.utf8ByteSize` to `nextStackPos` + -- (exclusive) is exactly the needle, so it must represent a valid range. + let res := .matched (s.pos! (nextStackPos.decreaseBy needle.utf8ByteSize)) (s.pos! nextStackPos) + -- Invariants still satisfied + pure (.deflate ⟨.yield ⟨.proper needle table htable nextStackPos 0 + (by simp [Pos.Raw.lt_iff] at hn ⊢; omega)⟩ res, + by simpa using ⟨_, _, ⟨rfl, rfl⟩, by simp [Pos.Raw.lt_iff] at hn ⊢; omega, + Or.inl (by simp [nextStackPos, Pos.Raw.lt_iff] at h₁ ⊢; omega)⟩⟩) else - let needlePos := needlePos.inc - if needlePos == needle.rawEndPos then - let nextStackPos := currStackPos.inc - let res := .matched (s.pos! startPos) (s.pos! nextStackPos) - have hiter := by - left - exists nextStackPos - simp only [Pos.Raw.byteIdx_inc, proper.injEq, true_and, exists_and_left, - exists_eq', and_true, nextStackPos, String.Pos.Raw.lt_iff] - constructor - · simp [String.Pos.Raw.le_iff] at h ⊢ - omega - · simp [String.Pos.Raw.le_iff, String.Pos.Raw.lt_iff] at h1 ⊢ - omega - .deflate ⟨.yield ⟨.proper needle table nextStackPos 0⟩ res, hiter⟩ - else - have hinv := by - simp [String.Pos.Raw.le_iff] at h ⊢ - omega - findNext startPos currStackPos.inc needlePos hinv + -- Invariants still satisfied + pure (.deflate ⟨.skip ⟨.proper needle table htable nextStackPos nextNeedlePos + (by simp [Pos.Raw.lt_iff, nextNeedlePos, Pos.Raw.ext_iff] at h hn ⊢; omega)⟩, + by simpa using ⟨_, _, ⟨rfl, rfl⟩, by simp [nextNeedlePos, Pos.Raw.lt_iff, Pos.Raw.ext_iff] at h hn ⊢; omega, + Or.inl (by simp [nextStackPos, Pos.Raw.lt_iff] at h₁ ⊢; omega)⟩⟩) else - if startPos != s.rawEndPos then - let res := .rejected (s.pos! startPos) (s.pos! currStackPos) - .deflate ⟨.yield ⟨.atEnd⟩ res, by simp⟩ + if hnp : needlePos.byteIdx = 0 then + -- Safety: by invariant 2 + let basePos := s.pos! stackPos + -- Since we report (mis)matches by code point and not by byte, missing in the first byte + -- means that we should skip ahead to the next code point. + let nextStackPos := s.findNextPos stackPos h₁ + let res := .rejected basePos nextStackPos + -- Invariants still satisfied + pure (.deflate ⟨.yield ⟨.proper needle table htable nextStackPos.offset 0 + (by simp [Pos.Raw.lt_iff] at hn ⊢; omega)⟩ res, + by simpa using ⟨_, _, ⟨rfl, rfl⟩, by simp [Pos.Raw.lt_iff] at hn ⊢; omega, + Or.inl (by + have := lt_offset_findNextPos h₁ + have t₀ := (findNextPos _ _ h₁).isValidForSlice.le_utf8ByteSize + simp [nextStackPos, Pos.Raw.lt_iff] at this ⊢; omega)⟩⟩) else - .deflate ⟨.done, by simp⟩ - termination_by s.utf8ByteSize - currStackPos.byteIdx - decreasing_by - simp [String.Pos.Raw.lt_iff] at h1 ⊢ - omega - - findNext stackPos stackPos needlePos (by simp) + let newNeedlePos := table[needlePos.byteIdx - 1]'(by simp [Pos.Raw.lt_iff] at hn; omega) + if newNeedlePos = 0 then + -- Safety: by invariant 2 + let basePos := s.pos! (stackPos.unoffsetBy needlePos) + -- Since we report (mis)matches by code point and not by byte, missing in the first byte + -- means that we should skip ahead to the next code point. + let nextStackPos := (s.pos? stackPos).getD (s.findNextPos stackPos h₁) + let res := .rejected basePos nextStackPos + -- Invariants still satisfied + pure (.deflate ⟨.yield ⟨.proper needle table htable nextStackPos.offset 0 + (by simp [Pos.Raw.lt_iff] at hn ⊢; omega)⟩ res, + by simpa using ⟨_, _, ⟨rfl, rfl⟩, by simp [Pos.Raw.lt_iff] at hn ⊢; omega, by + simp only [pos?, Pos.Raw.isValidForSlice_eq_true_iff, nextStackPos] + split + · exact Or.inr (by simp [Pos.Raw.lt_iff]; omega) + · refine Or.inl ?_ + have := lt_offset_findNextPos h₁ + have t₀ := (findNextPos _ _ h₁).isValidForSlice.le_utf8ByteSize + simp [Pos.Raw.lt_iff] at this ⊢; omega⟩⟩) + else + let oldBasePos := s.pos! (stackPos.decreaseBy needlePos.byteIdx) + let newBasePos := s.pos! (stackPos.decreaseBy newNeedlePos) + let res := .rejected oldBasePos newBasePos + -- Invariants still satisfied by definition of the prefix table + pure (.deflate ⟨.yield ⟨.proper needle table htable stackPos ⟨newNeedlePos⟩ + (by + subst htable + have := getElem_buildTable_le needle (needlePos.byteIdx - 1) (by simp [Pos.Raw.lt_iff] at hn; omega) + simp [newNeedlePos, Pos.Raw.lt_iff] at hn ⊢ + omega)⟩ res, + by + simp only [proper.injEq, heq_eq_eq, true_and, exists_and_left, exists_prop, + reduceCtorEq, or_false] + refine ⟨_, _, ⟨rfl, rfl⟩, ?_, Or.inr ⟨rfl, ?_⟩⟩ + all_goals + subst htable + have := getElem_buildTable_le needle (needlePos.byteIdx - 1) (by simp [Pos.Raw.lt_iff] at hn; omega) + simp [newNeedlePos, Pos.Raw.lt_iff] at hn ⊢ + omega⟩) + else + if 0 < needlePos then + let basePos := stackPos.unoffsetBy needlePos + let res := .rejected (s.pos! basePos) s.endPos + pure (.deflate ⟨.yield ⟨.atEnd⟩ res, by simp⟩) + else + pure (.deflate ⟨.done, by simp⟩) | .atEnd => pure (.deflate ⟨.done, by simp⟩) private def toOption : ForwardSliceSearcher s → Option (Nat × Nat) | .emptyBefore pos => some (pos.remainingBytes, 1) | .emptyAt pos _ => some (pos.remainingBytes, 0) - | .proper _ _ sp _ => some (s.utf8ByteSize - sp.byteIdx, 0) + | .proper _ _ _ sp np _ => some (s.utf8ByteSize - sp.byteIdx, np.byteIdx) | .atEnd => none private instance : WellFoundedRelation (ForwardSliceSearcher s) where @@ -173,7 +229,8 @@ private def finitenessRelation : simp_wf obtain ⟨step, h, h'⟩ := h cases step - · cases h + all_goals try + cases h revert h' simp only [Std.Iterators.IterM.IsPlausibleStep, Std.Iterators.Iterator.IsPlausibleStep] match it.internalState with @@ -184,21 +241,21 @@ private def finitenessRelation : intro x hx h simpa [h, ForwardSliceSearcher.toOption, Option.lt, Prod.lex_def, ← Pos.lt_iff_remainingBytes_lt] - | .proper needle table stackPos needlePos => - simp only [exists_and_left] - rintro (⟨newStackPos, h₁, h₂, ⟨x, hx⟩⟩|h) - · simp [hx, ForwardSliceSearcher.toOption, Option.lt, Prod.lex_def, Pos.Raw.lt_iff, - Pos.Raw.le_iff] at ⊢ h₁ h₂ - omega + | .proper needle table ht stackPos needlePos hn => + rintro (⟨newStackPos, newNeedlePos, h₁, h₂, (h|⟨rfl, h⟩)⟩|h) + · simp [h₂, ForwardSliceSearcher.toOption, Option.lt, Prod.lex_def, h] + · simpa [h₂, ForwardSliceSearcher.toOption, Option.lt, Prod.lex_def, Pos.Raw.lt_iff] · simp [h, ForwardSliceSearcher.toOption, Option.lt] | .atEnd .. => simp - · cases h' · cases h @[no_expose] instance : Std.Iterators.Finite (ForwardSliceSearcher s) Id := .of_finitenessRelation finitenessRelation +instance : Std.Iterators.IteratorCollect (ForwardSliceSearcher s) Id Id := + .defaultImplementation + instance : Std.Iterators.IteratorLoop (ForwardSliceSearcher s) Id Id := .defaultImplementation diff --git a/tests/lean/run/string_kmp.lean b/tests/lean/run/string_kmp.lean new file mode 100644 index 0000000000..4bff62eb8a --- /dev/null +++ b/tests/lean/run/string_kmp.lean @@ -0,0 +1,39 @@ +module + +inductive S where + | m (b e : Nat) + | r (b e : Nat) +deriving Repr, BEq, DecidableEq + +def run (s pat : String) : List S := + String.Slice.Pattern.ForwardSliceSearcher.iter s.toSlice pat.toSlice + |>.map (fun | .matched b e => S.m b.offset.byteIdx e.offset.byteIdx | .rejected b e => S.r b.offset.byteIdx e.offset.byteIdx) + |>.toList + +-- 𝔸 is [240,157,148,184] +-- 𝕸 is [240,157,149,184] + +#guard run "aababaab" "a" = [.m 0 1, .m 1 2, .r 2 3, .m 3 4, .r 4 5, .m 5 6, .m 6 7, .r 7 8] +#guard run "aab" "ab" = [.r 0 1, .m 1 3] +#guard run "aababacab" "ab" = [.r 0 1, .m 1 3, .m 3 5, .r 5 6, .r 6 7, .m 7 9] +#guard run "aaab" "aab" = [.r 0 1, .m 1 4] +#guard run "aaaaa" "aa" = [.m 0 2, .m 2 4, .r 4 5] +#guard run "abcabd" "abd" = [.r 0 2, .r 2 3, .m 3 6] +#guard run "αβ" "β" = [.r 0 2, .m 2 4] +#guard run "𝔸" "𝕸" = [.r 0 4] +#guard run "𝔸𝕸" "𝕸" = [.r 0 4, .m 4 8] +#guard run "α𝔸€α𝔸₭" "α𝔸₭" = [.r 0 9, .m 9 18] +#guard run "α𝔸𝕸α𝔸₭" "α𝔸₭" = [.r 0 6, .r 6 10, .m 10 19] +#guard run "𝕸𝔸𝕸𝔸₭" "𝕸𝔸₭" = [.r 0 8, .m 8 19] +#guard run "𝕸𝔸𝕸β₭" "𝕸𝔸₭" = [.r 0 8, .r 8 12, .r 12 14, .r 14 17] +#guard run "𝔸𝔸𝔸𝔸𝕸𝔸𝔸𝔸𝕸" "𝔸𝔸𝕸" = [.r 0 4, .r 4 8, .m 8 20, .r 20 24, .m 24 36] +#guard run "𝔸b" "𝕸" = [.r 0 4, .r 4 5] +#guard run "𝔸bb𝕸β" "𝕸" = [.r 0 4, .r 4 5, .r 5 6, .m 6 10, .r 10 12] +#guard run "𝔸bbββαβαββββ𝕸β" "ββ𝕸" = [.r 0 4, .r 4 5, .r 5 6, .r 6 8, .r 8 10, .r 10 12, .r 12 14, .r 14 16, .r 16 18, .r 18 20, .m 20 28, .r 28 30] +#guard run "𝔸β𝕸" "𝕸" = [.r 0 4, .r 4 6, .m 6 10] +#guard run "𝔸b𝕸xu∅" "𝕸x" = [.r 0 4, .r 4 5, .m 5 10, .r 10 11, .r 11 14] +#guard run "é" "ù" = [.r 0 2] +#guard run "éB" "ù" = [.r 0 2, .r 2 3] +#guard run "abcabdabcabcabcabe" "abcabdabcabe" = [.r 0 6, .r 6 9, .r 9 12, .r 12 15, .r 15 17, .r 17 18] +#guard run "abcabdabcxabcabdabcabe" "abcabdabcabe" = [.r 0 6, .r 6 9, .r 9 10, .m 10 22] +#guard run "€α𝕸€α𝔸€α𝕸€α𝕸€α𝕸€αù" "€α𝕸€α𝔸€α𝕸€αù" = [.r 0 18, .r 18 27, .r 27 36, .r 36 45, .r 45 50, .r 50 52]