lean4-htt/src/Lean/Data/Trie.lean
Sebastian Ullrich ddfeca1b1b
fix: do not allow access to private primitives in public scope (#9890)
This PR addresses a missing check in the module system where private
names that remain in the public environment map for technical reasons
(e.g. inductive constructors generated by the kernel and relied on by
the code generator) accidentally were accessible in the public scope.
2025-08-14 15:34:54 +00:00

208 lines
6 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/-
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Sebastian Ullrich, Leonardo de Moura, Joachim Breitner
A string trie data structure, used for tokenizing the Lean language
-/
module
prelude
public import Lean.Data.Format
public import Init.Data.Option.Coe
public section
namespace Lean
namespace Data
/-
## Implementation notes
Tries typically have many nodes with small degree, where a linear scan
through the (compact) `ByteArray` is faster than using binary search or
search trees like `Std.TreeMap`.
Moreover, many nodes have degree 1, which justifies the special-case `node1`
constructor.
The code would be a bit less repetitive if we used something like the following
```
mutual
@[expose] def Trie α := Option α × ByteAssoc α
inductive ByteAssoc α where
| leaf : ByteAssoc α
| node1 : UInt8 → Trie α → ByteAssoc α
| node : ByteArray → Array (Trie α) → ByteAssoc α
end
```
but that would come at the cost of extra indirections.
-/
/-- A Trie is a key-value store where the keys are of type `String`,
and the internal structure is a tree that branches on the bytes of the string. -/
inductive Trie (α : Type) where
/-- A node without children; carries the optional value stored at this position. -/
| leaf : Option α → Trie α
/-- A node with exactly one child: the optional value stored at this position, the byte
labelling the single outgoing edge, and the subtrie reached through that byte. -/
| node1 : Option α → UInt8 → Trie α → Trie α
/-- A node with several children: the optional value stored at this position, the edge bytes,
and the child subtries. The byte at index `i` of the `ByteArray` labels the edge leading to the
child at index `i` of the `Array`; the operations in this file push to and index both in lockstep. -/
| node : Option α → ByteArray → Array (Trie α) → Trie α
namespace Trie
variable {α : Type}
/-- The `Trie` that stores no values: a single leaf without a value. -/
def empty : Trie α := .leaf .none
/-- `{}` / `∅` denotes the empty trie. -/
instance : EmptyCollection (Trie α) where
  emptyCollection := empty
/-- The default trie is the empty one. -/
instance : Inhabited (Trie α) := ⟨empty⟩
/--
Inserts or updates the value at the given key `s`.

`f` is applied to the value currently stored at `s` (`none` if there is none), and its result
becomes the new value at `s`. The key is consumed byte by byte via `s.getUtf8Byte`.
-/
partial def upsert (t : Trie α) (s : String) (f : Option α → α) : Trie α :=
  -- Builds the fresh chain of `node1` nodes for the bytes of `s` from index `i` onwards,
  -- ending in a leaf that holds `f none`.
  let rec insertEmpty (i : Nat) : Trie α :=
    if h : i < s.utf8ByteSize then
      let c := s.getUtf8Byte i h
      let t := insertEmpty (i + 1)
      node1 none c t
    else
      leaf (f .none)
  -- Walks the existing trie along the bytes of `s`, starting at byte index `i`.
  let rec loop
    | i, leaf v =>
      if h : i < s.utf8ByteSize then
        let c := s.getUtf8Byte i h
        let t := insertEmpty (i + 1)
        node1 v c t
      else
        leaf (f v)
    | i, node1 v c' t' =>
      if h : i < s.utf8ByteSize then
        let c := s.getUtf8Byte i h
        if c == c'
        then node1 v c' (loop (i + 1) t')
        else
          -- A second, distinct edge byte: promote the `node1` to a general `node`.
          let t := insertEmpty (i + 1)
          node v (.mk #[c, c']) #[t, t']
      else
        node1 (f v) c' t'
    | i, node v cs ts =>
      if h : i < s.utf8ByteSize then
        let c := s.getUtf8Byte i h
        match cs.findIdx? (· == c) with
        | none =>
          -- Unknown byte: append to both arrays so that their indices stay aligned.
          let t := insertEmpty (i + 1)
          node v (cs.push c) (ts.push t)
        | some idx =>
          node v cs (ts.modify idx (loop (i + 1)))
      else
        node (f v) cs ts
  loop 0 t
/--
Inserts a value at the given key `s`, overriding an existing value if present.

Implemented as `upsert` with an update function that ignores the previous value. The definition
is not recursive, so it does not need to be marked `partial`.
-/
def insert (t : Trie α) (s : String) (val : α) : Trie α :=
  upsert t s (fun _ => val)
/-- Returns the value stored at exactly the key `s`, if there is one. -/
partial def find? (t : Trie α) (s : String) : Option α :=
  -- `pos` is the index of the next byte of `s` to be matched against the trie.
  let rec loop (pos : Nat) : Trie α → Option α
    | leaf stored =>
      -- A leaf has no children, so any remaining key bytes cannot be matched.
      if pos < s.utf8ByteSize then none else stored
    | node1 stored edge child =>
      if h : pos < s.utf8ByteSize then
        if s.getUtf8Byte pos h == edge then loop (pos + 1) child else none
      else
        stored
    | node stored edges children =>
      if h : pos < s.utf8ByteSize then
        let b := s.getUtf8Byte pos h
        if let some k := edges.findIdx? (· == b) then
          loop (pos + 1) children[k]!
        else
          none
      else
        stored
  loop 0 t
/-- Collects every value stored in the trie into an `Array`, in no particular order. -/
partial def values (t : Trie α) : Array α :=
  ((go t).run #[]).2
where
  /-- Pushes the value of the current node (if any) onto the state, then visits all children. -/
  go : Trie α → StateM (Array α) Unit
    | leaf a? => do
      match a? with
      | some a => modify fun acc => acc.push a
      | none => pure ()
    | node1 a? _ child => do
      match a? with
      | some a => modify fun acc => acc.push a
      | none => pure ()
      go child
    | node a? _ children => do
      match a? with
      | some a => modify fun acc => acc.push a
      | none => pure ()
      children.forM go
/-- Returns all values whose keys have the given string `pre` as a prefix, in no particular order. -/
partial def findPrefix (t : Trie α) (pre : String) : Array α := go t 0
where
  /-- Descends along the bytes of `pre` starting at byte index `pos`; once `pre` is exhausted,
  every value in the remaining subtrie is returned. -/
  go (cur : Trie α) (pos : Nat) : Array α :=
    if h : pos < pre.utf8ByteSize then
      let b := pre.getUtf8Byte pos h
      match cur with
      | leaf _ => #[]
      | node1 _ edge child =>
        if b == edge then go child (pos + 1) else #[]
      | node _ edges children =>
        if let some k := edges.findIdx? (· == b) then
          go children[k]! (pos + 1)
        else
          #[]
    else
      cur.values
/-- Finds the longest _key_ in the trie that is contained in the given string `s` at position `i`,
and returns the associated value. -/
partial def matchPrefix (s : String) (t : Trie α) (i : String.Pos) : Option α :=
  -- `best` is the value of the longest key matched so far; `pos` is the next byte index in `s`.
  let rec loop : Trie α → Nat → Option α → Option α
    | leaf v, _, best =>
      if v.isNone then best else v
    | node1 v edge child, pos, best =>
      let best := if v.isNone then best else v
      if h : pos < s.utf8ByteSize then
        if s.getUtf8Byte pos h == edge then loop child (pos + 1) best else best
      else
        best
    | node v edges children, pos, best =>
      let best := if v.isNone then best else v
      if h : pos < s.utf8ByteSize then
        let b := s.getUtf8Byte pos h
        if let some k := edges.findIdx? (· == b) then
          loop children[k]! (pos + 1) best
        else
          best
      else
        best
  loop t i.byteIdx none
/-- Renders the edges of a trie as a flat list of `Format` items: for every edge, the `repr` of
its byte followed by the grouped, nested rendering of the subtrie behind it. Stored values are
not rendered. -/
private partial def toStringAux {α : Type} : Trie α → List Format
  | leaf _ => []
  | node1 _ c t =>
    [format (repr c), Format.group (Format.nest 4 (Format.joinSep (toStringAux t) Format.line))]
  | node _ cs ts =>
    (List.zipWith
      (fun c t =>
        [format (repr c), Format.group (Format.nest 4 (Format.joinSep (toStringAux t) Format.line))])
      cs.toList ts.toList).flatten
-- NOTE(review): the `private` wrapper in the body presumably lets this public instance refer to
-- the private helper `toStringAux` — confirm against the module-system documentation.
/-- Renders a trie by joining the `Format` items produced by `toStringAux` with `Format.line`
and pretty-printing the result. -/
instance {α : Type} : ToString (Trie α) where
toString t := private (flip Format.joinSep Format.line $ toStringAux t).pretty
end Trie
end Data
end Lean