This PR renames `String.getUtf8Byte` to `String.getUTF8Byte` in order to adhere to the standard library naming convention.
273 lines
9.2 KiB
Text
273 lines
9.2 KiB
Text
/-
|
||
Copyright (c) 2025 Lean FRO, LLC. All rights reserved.
|
||
Released under Apache 2.0 license as described in the file LICENSE.
|
||
Authors: Henrik Böving
|
||
-/
|
||
module
|
||
|
||
prelude
|
||
public import Init.Data.String.Pattern.Basic
|
||
public import Init.Data.Iterators.Internal.Termination
|
||
public import Init.Data.Iterators.Consumers.Monadic.Loop
|
||
|
||
set_option doc.verso true
|
||
|
||
/-!
|
||
This module defines the necessary instances to register {name}`String` and {name}`String.Slice`
|
||
with the pattern framework.
|
||
-/
|
||
|
||
public section
|
||
|
||
namespace String.Slice.Pattern
|
||
|
||
inductive ForwardSliceSearcher (s : Slice) where
|
||
| empty (pos : s.Pos)
|
||
| proper (needle : Slice) (table : Array String.Pos.Raw) (stackPos : String.Pos.Raw) (needlePos : String.Pos.Raw)
|
||
| atEnd
|
||
deriving Inhabited
|
||
|
||
namespace ForwardSliceSearcher
|
||
|
||
partial def buildTable (pat : Slice) : Array String.Pos.Raw :=
|
||
if pat.utf8ByteSize == 0 then
|
||
#[]
|
||
else
|
||
let arr := Array.emptyWithCapacity pat.utf8ByteSize.byteIdx
|
||
let arr := arr.push 0
|
||
go ⟨1⟩ arr
|
||
where
|
||
go (pos : String.Pos.Raw) (table : Array String.Pos.Raw) :=
|
||
if h : pos < pat.utf8ByteSize then
|
||
let patByte := pat.getUTF8Byte pos h
|
||
let distance := computeDistance table[table.size - 1]! patByte table
|
||
let distance := if patByte = pat.getUTF8Byte! distance then distance.inc else distance
|
||
go pos.inc (table.push distance)
|
||
else
|
||
table
|
||
|
||
computeDistance (distance : String.Pos.Raw) (patByte : UInt8) (table : Array String.Pos.Raw) :
|
||
String.Pos.Raw :=
|
||
if distance > 0 && patByte != pat.getUTF8Byte! distance then
|
||
computeDistance table[distance.byteIdx - 1]! patByte table
|
||
else
|
||
distance
|
||
|
||
@[inline]
|
||
def iter (s : Slice) (pat : Slice) : Std.Iter (α := ForwardSliceSearcher s) (SearchStep s) :=
|
||
if pat.utf8ByteSize == 0 then
|
||
{ internalState := .empty s.startPos }
|
||
else
|
||
{ internalState := .proper pat (buildTable pat) s.startPos.offset pat.startPos.offset }
|
||
|
||
partial def backtrackIfNecessary (pat : Slice) (table : Array String.Pos.Raw) (stackByte : UInt8)
|
||
(needlePos : String.Pos.Raw) : String.Pos.Raw :=
|
||
if needlePos != 0 && stackByte != pat.getUTF8Byte! needlePos then
|
||
backtrackIfNecessary pat table stackByte table[needlePos.byteIdx - 1]!
|
||
else
|
||
needlePos
|
||
|
||
instance (s : Slice) : Std.Iterators.Iterator (ForwardSliceSearcher s) Id (SearchStep s) where
|
||
IsPlausibleStep it
|
||
| .yield it' out =>
|
||
match it.internalState with
|
||
| .empty pos =>
|
||
(∃ newPos, pos.offset < newPos.offset ∧ it'.internalState = .empty newPos) ∨
|
||
it'.internalState = .atEnd
|
||
| .proper needle table stackPos needlePos =>
|
||
(∃ newStackPos newNeedlePos,
|
||
stackPos < newStackPos ∧
|
||
newStackPos ≤ s.utf8ByteSize ∧
|
||
it'.internalState = .proper needle table newStackPos newNeedlePos) ∨
|
||
it'.internalState = .atEnd
|
||
| .atEnd => False
|
||
| .skip _ => False
|
||
| .done => True
|
||
step := fun ⟨iter⟩ =>
|
||
match iter with
|
||
| .empty pos =>
|
||
let res := .matched pos pos
|
||
if h : pos ≠ s.endPos then
|
||
pure ⟨.yield ⟨.empty (pos.next h)⟩ res, by simp [Char.utf8Size_pos]⟩
|
||
else
|
||
pure ⟨.yield ⟨.atEnd⟩ res, by simp⟩
|
||
| .proper needle table stackPos needlePos =>
|
||
let rec findNext (startPos : String.Pos.Raw)
|
||
(currStackPos : String.Pos.Raw) (needlePos : String.Pos.Raw) (h : stackPos ≤ currStackPos) :=
|
||
if h1 : currStackPos < s.utf8ByteSize then
|
||
let stackByte := s.getUTF8Byte currStackPos h1
|
||
let needlePos := backtrackIfNecessary needle table stackByte needlePos
|
||
let patByte := needle.getUTF8Byte! needlePos
|
||
if stackByte != patByte then
|
||
let nextStackPos := s.findNextPos currStackPos h1 |>.offset
|
||
let res := .rejected (s.pos! startPos) (s.pos! nextStackPos)
|
||
have hiter := by
|
||
left
|
||
exists nextStackPos
|
||
have haux := lt_offset_findNextPos h1
|
||
simp only [pos_lt_eq, proper.injEq, true_and, exists_and_left, exists_eq', and_true,
|
||
nextStackPos]
|
||
constructor
|
||
· simp [String.Pos.Raw.le_iff] at h haux ⊢
|
||
omega
|
||
· apply Pos.Raw.IsValidForSlice.le_utf8ByteSize
|
||
apply Pos.isValidForSlice
|
||
⟨.yield ⟨.proper needle table nextStackPos needlePos⟩ res, hiter⟩
|
||
else
|
||
let needlePos := needlePos.inc
|
||
if needlePos == needle.utf8ByteSize then
|
||
let nextStackPos := currStackPos.inc
|
||
let res := .matched (s.pos! startPos) (s.pos! nextStackPos)
|
||
have hiter := by
|
||
left
|
||
exists nextStackPos
|
||
simp only [pos_lt_eq, Pos.Raw.byteIdx_inc, proper.injEq, true_and, exists_and_left,
|
||
exists_eq', and_true, nextStackPos]
|
||
constructor
|
||
· simp [String.Pos.Raw.le_iff] at h ⊢
|
||
omega
|
||
· simp [String.Pos.Raw.le_iff] at h1 ⊢
|
||
omega
|
||
⟨.yield ⟨.proper needle table nextStackPos 0⟩ res, hiter⟩
|
||
else
|
||
have hinv := by
|
||
simp [String.Pos.Raw.le_iff] at h ⊢
|
||
omega
|
||
findNext startPos currStackPos.inc needlePos hinv
|
||
else
|
||
if startPos != s.utf8ByteSize then
|
||
let res := .rejected (s.pos! startPos) (s.pos! currStackPos)
|
||
⟨.yield ⟨.atEnd⟩ res, by simp⟩
|
||
else
|
||
⟨.done, by simp⟩
|
||
termination_by s.utf8ByteSize.byteIdx - currStackPos.byteIdx
|
||
decreasing_by
|
||
simp at h1 ⊢
|
||
omega
|
||
|
||
findNext stackPos stackPos needlePos (by simp)
|
||
| .atEnd => pure ⟨.done, by simp⟩
|
||
|
||
private def toPair : ForwardSliceSearcher s → (Nat × Nat)
|
||
| .empty pos => (1, s.utf8ByteSize.byteIdx - pos.offset.byteIdx)
|
||
| .proper _ _ sp _ => (1, s.utf8ByteSize.byteIdx - sp.byteIdx)
|
||
| .atEnd => (0, 0)
|
||
|
||
private instance : WellFoundedRelation (ForwardSliceSearcher s) where
|
||
rel s1 s2 := Prod.Lex (· < ·) (· < ·) s1.toPair s2.toPair
|
||
wf := by
|
||
apply InvImage.wf
|
||
apply (Prod.lex _ _).wf
|
||
|
||
private def finitenessRelation :
|
||
Std.Iterators.FinitenessRelation (ForwardSliceSearcher s) Id where
|
||
rel := InvImage WellFoundedRelation.rel (fun it => it.internalState)
|
||
wf := InvImage.wf _ WellFoundedRelation.wf
|
||
subrelation {it it'} h := by
|
||
simp_wf
|
||
obtain ⟨step, h, h'⟩ := h
|
||
cases step
|
||
· cases h
|
||
simp only [Std.Iterators.IterM.IsPlausibleStep, Std.Iterators.Iterator.IsPlausibleStep] at h'
|
||
split at h'
|
||
· next heq =>
|
||
rw [heq]
|
||
rcases h' with ⟨np, h1', h2'⟩ | h'
|
||
· rw [h2']
|
||
apply Prod.Lex.right'
|
||
· simp
|
||
· have haux := np.isValidForSlice.le_utf8ByteSize
|
||
simp [String.Pos.Raw.le_iff] at h1' haux ⊢
|
||
omega
|
||
· apply Prod.Lex.left
|
||
simp [h']
|
||
· next heq =>
|
||
rw [heq]
|
||
rcases h' with ⟨np, sp, h1', h2', h3'⟩ | h'
|
||
· rw [h3']
|
||
apply Prod.Lex.right'
|
||
· simp
|
||
· simp [String.Pos.Raw.le_iff] at h1' h2' ⊢
|
||
omega
|
||
· apply Prod.Lex.left
|
||
simp [h']
|
||
· contradiction
|
||
· cases h'
|
||
· cases h
|
||
|
||
@[no_expose]
|
||
instance : Std.Iterators.Finite (ForwardSliceSearcher s) Id :=
|
||
.of_finitenessRelation finitenessRelation
|
||
|
||
instance : Std.Iterators.IteratorLoop (ForwardSliceSearcher s) Id Id :=
|
||
.defaultImplementation
|
||
|
||
instance : ToForwardSearcher Slice ForwardSliceSearcher where
|
||
toSearcher := iter
|
||
|
||
@[inline]
|
||
def startsWith (s : Slice) (pat : Slice) : Bool :=
|
||
if h : pat.utf8ByteSize ≤ s.utf8ByteSize then
|
||
have hs := by
|
||
simp [Pos.Raw.le_iff] at h ⊢
|
||
omega
|
||
have hp := by
|
||
simp [Pos.Raw.le_iff]
|
||
Internal.memcmp s pat s.startPos.offset pat.startPos.offset pat.utf8ByteSize hs hp
|
||
else
|
||
false
|
||
|
||
@[inline]
|
||
def dropPrefix? (s : Slice) (pat : Slice) : Option Slice :=
|
||
if startsWith s pat then
|
||
some <| s.replaceStart <| s.pos! <| s.startPos.offset + pat.utf8ByteSize
|
||
else
|
||
none
|
||
|
||
instance : ForwardPattern Slice where
|
||
startsWith := startsWith
|
||
dropPrefix? := dropPrefix?
|
||
|
||
instance : ToForwardSearcher String ForwardSliceSearcher where
|
||
toSearcher slice pat := iter slice pat.toSlice
|
||
|
||
instance : ForwardPattern String where
|
||
startsWith s pat := startsWith s pat.toSlice
|
||
dropPrefix? s pat := dropPrefix? s pat.toSlice
|
||
|
||
end ForwardSliceSearcher
|
||
|
||
namespace BackwardSliceSearcher
|
||
|
||
@[inline]
|
||
def endsWith (s : Slice) (pat : Slice) : Bool :=
|
||
if h : pat.utf8ByteSize ≤ s.utf8ByteSize then
|
||
let sStart := s.endPos.offset - pat.utf8ByteSize
|
||
let patStart := pat.startPos.offset
|
||
have hs := by
|
||
simp [sStart, Pos.Raw.le_iff] at h ⊢
|
||
omega
|
||
have hp := by
|
||
simp [patStart, Pos.Raw.le_iff] at h ⊢
|
||
Internal.memcmp s pat sStart patStart pat.utf8ByteSize hs hp
|
||
else
|
||
false
|
||
|
||
@[inline]
|
||
def dropSuffix? (s : Slice) (pat : Slice) : Option Slice :=
|
||
if endsWith s pat then
|
||
some <| s.replaceEnd <| s.pos! <| s.endPos.offset - pat.utf8ByteSize
|
||
else
|
||
none
|
||
|
||
instance : BackwardPattern Slice where
|
||
endsWith := endsWith
|
||
dropSuffix? := dropSuffix?
|
||
|
||
instance : BackwardPattern String where
|
||
endsWith s pat := endsWith s pat.toSlice
|
||
dropSuffix? s pat := dropSuffix? s pat.toSlice
|
||
|
||
end BackwardSliceSearcher
|
||
|
||
end String.Slice.Pattern
|