feat: add useful functions in Parsec, add error variant and Std.Data.ByteSlice (#9599)

This PR adds the type `Std.Internal.Parsec.Error`, which contains the
constructors `.eof` (useful for checking if parsing failed due to not
having enough input and then retrying when more input arrives that is
useful in the HTTP server) and `.other`, which describes other errors.
It also adds documentation to many functions, along with some new
functions to the `ByteArray` Parsec, such as `peekWhen?`, `octDigit`,
`takeWhile`, `takeUntil`, `skipWhile`, and `skipUntil`.
This commit is contained in:
Sofia Rodrigues 2025-09-11 11:53:41 -03:00 committed by GitHub
parent 5c88a2bf56
commit a966ce64ca
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 1072 additions and 48 deletions

View file

@ -93,7 +93,7 @@ where
upToWs (nonempty : Bool) : Parser String := fun it =>
let it' := it.find fun c => c.isWhitespace
if nonempty && it'.pos == it.pos then
.error it' "Expected a nonempty string"
.error it' (.other "Expected a nonempty string")
else
.success it' (it.extract it')
@ -180,7 +180,7 @@ private abbrev ValidationM := Parsec ValidationState
private def ValidationM.run (p : ValidationM α) (input : String) : Except (Nat × String) α :=
match p (.ofSource input) with
| .success _ res => Except.ok res
| .error s err => Except.error (s.getLineNumber, err)
| .error s err => Except.error (s.getLineNumber, toString err)
/--
Matches `p` as many times as possible, followed by EOF. If `p` cannot be matched prior to the end
@ -290,7 +290,7 @@ where
labelingExampleErrors {α} (header : String) (x : ValidationM α) : ValidationM α := fun s =>
match x s with
| res@(.success ..) => res
| .error s' msg => .error s' s!"Example '{header}' is malformed: {msg}"
| .error s' msg => .error s' (.other s!"Example '{header}' is malformed: {msg}")
/--
If `line` is a level-`level` header and, if `title?` is non-`none`, its title is `title?`,

View file

@ -30,5 +30,6 @@ public import Std.Data.TreeMap.Raw
public import Std.Data.TreeSet.Raw
public import Std.Data.Iterators
public import Std.Data.ByteSlice
@[expose] public section

338
src/Std/Data/ByteSlice.lean Normal file
View file

@ -0,0 +1,338 @@
/-
Copyright (c) 2025 Lean FRO, LLC. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Sofia Rodrigues
-/
module
prelude
public import Init.Core
public import Init.GetElem
public import Init.Data.ByteArray
public import Init.Data.Slice.Basic
public import Init.Data.Slice.Notation
public import Init.Data.Range.Polymorphic.Nat
/-!
This module defines the `ByteSlice` structure. It's a modified version of the `SubArray` code,
with some functions specific to `ByteSlice`.
-/
public section
set_option linter.indexVariables true
set_option linter.missingDocs true
universe u v w
/--
Internal representation of `ByteSlice`, which is an abbreviation for `Slice ByteSliceData`.
-/
structure Std.Slice.Internal.ByteSliceData where
private mk ::
/--
The underlying byte array.
-/
byteArray : ByteArray
/--
The starting index of the region of interest (inclusive).
-/
start : Nat
/--
The ending index of the region of interest (exclusive).
-/
stop : Nat
/--
The starting index is no later than the ending index.
The ending index is exclusive. If the starting and ending indices are equal, then the byte slice is
empty.
-/
start_le_stop : start ≤ stop
/-- The stopping index is no later than the end of the byte array.
The ending index is exclusive. If it is equal to the size of the byte array, then the last byte of
the byte array is in the byte slice.
-/
stop_le_size_byteArray : stop ≤ byteArray.size
open Std.Slice
/--
A region of some underlying byte array.
A byte slice contains a byte array together with the start and end indices of a region of interest.
Byte slices can be used to avoid copying or allocating space, while being more convenient than
tracking the bounds by hand. The region of interest consists of every index that is both greater
than or equal to `start` and strictly less than `stop`.
-/
abbrev ByteSlice := Std.Slice Internal.ByteSliceData
instance : Self (Std.Slice Internal.ByteSliceData) ByteSlice where
namespace ByteSlice
@[always_inline, inline, expose, inherit_doc Internal.ByteSliceData.byteArray]
def byteArray (xs : @& ByteSlice) : ByteArray :=
xs.internalRepresentation.byteArray
@[always_inline, inline, expose, inherit_doc Internal.ByteSliceData.start]
def start (xs : @& ByteSlice) : Nat :=
xs.internalRepresentation.start
@[always_inline, inline, expose, inherit_doc Internal.ByteSliceData.stop]
def stop (xs : @& ByteSlice) : Nat :=
xs.internalRepresentation.stop
@[inherit_doc Internal.ByteSliceData.start_le_stop]
theorem start_le_stop (xs : ByteSlice) : xs.start ≤ xs.stop :=
xs.internalRepresentation.start_le_stop
@[inherit_doc Internal.ByteSliceData.stop_le_size_byteArray]
theorem stop_le_size_byteArray (xs : ByteSlice) : xs.stop ≤ xs.byteArray.size :=
xs.internalRepresentation.stop_le_size_byteArray
/--
Computes the size of the byte slice.
-/
def size (s : ByteSlice) : Nat :=
s.stop - s.start
theorem size_le_size_byteArray {s : ByteSlice} : s.size ≤ s.byteArray.size := by
let ⟨{byteArray, start, stop, start_le_stop, stop_le_size_byteArray}⟩ := s
simp only [size, ge_iff_le]
apply Nat.le_trans (Nat.sub_le stop start)
assumption
/--
Extracts a byte from the byte slice.
The index is relative to the start of the byte slice, rather than the underlying byte array.
-/
def get (s : ByteSlice) (i : Fin s.size) : UInt8 :=
have : s.start + i.val < s.byteArray.size := by
apply Nat.lt_of_lt_of_le _ s.stop_le_size_byteArray
have := i.isLt
simp only [size] at this
rw [Nat.add_comm]
exact Nat.add_lt_of_lt_sub this
s.byteArray[s.start + i.val]
instance : GetElem ByteSlice Nat UInt8 fun xs i => i < xs.size where
getElem xs i h := xs.get ⟨i, h⟩
/--
Extracts a byte from the byte slice, or returns a default value `v₀` when the index is out of
bounds.
The index is relative to the start and end of the byte slice, rather than the underlying byte array.
-/
@[inline] def getD (s : ByteSlice) (i : Nat) (v₀ : UInt8) : UInt8 :=
if h : i < s.size then s[i] else v₀
/--
Extracts a byte from the byte slice, or returns a default value when the index is out of bounds.
The index is relative to the start and end of the byte slice, rather than the underlying byte array. The
default value is 0.
-/
abbrev get! (s : ByteSlice) (i : Nat) : UInt8 :=
getD s i 0
/--
The empty byte slice.
This empty byte slice is backed by an empty byte array.
-/
protected def empty : ByteSlice := ⟨{
byteArray := ByteArray.mk #[]
start := 0
stop := 0
start_le_stop := Nat.le_refl 0
stop_le_size_byteArray := Nat.le_refl 0
}⟩
/--
Creates a new ByteSlice of a ByteArray
-/
protected def ofByteArray (ba : ByteArray) : ByteSlice := ⟨{
byteArray := ba
start := 0
stop := ba.size
start_le_stop := Nat.zero_le _
stop_le_size_byteArray := Nat.le_refl _
}⟩
instance : EmptyCollection (ByteSlice) :=
⟨ByteSlice.empty⟩
instance : Inhabited (ByteSlice) :=
⟨{}⟩
/--
Converts a byte slice back to a byte array by copying the relevant portion.
-/
def toByteArray (s : ByteSlice) : ByteArray :=
s.byteArray.extract s.start (s.start + s.size)
/--
Comparison function
-/
@[extern "lean_byteslice_beq"]
protected def beq (a b : ByteSlice) : Bool :=
a.toByteArray == b.toByteArray
instance : BEq ByteSlice := ⟨ByteSlice.beq⟩
/--
Folds a monadic operation from right to left over the bytes in a byte slice.
An accumulator of type `β` is constructed by starting with `init` and monadically combining each
byte of the byte slice with the current accumulator value in turn, moving from the end to the
start. The monad in question may permit early termination or repetition.
Examples:
```lean example
#eval (ByteArray.mk #[1, 2, 3]).toByteSlice.foldrM (init := 0) fun x acc =>
some x.toNat + acc
```
```output
some 6
```
-/
@[inline]
def foldrM {β : Type v} {m : Type v → Type w} [Monad m] (f : UInt8 → β → m β) (init : β) (as : ByteSlice) : m β :=
let rec loop (i : Nat) (acc : β) : m β :=
if h : i < as.size then do
let newAcc ← f as[as.size - 1 - i] acc
loop (i + 1) newAcc
else
pure acc
loop 0 init
/--
Runs a monadic action on each byte of a byte slice.
The bytes are processed starting at the lowest index and moving up.
-/
@[inline]
def forM {m : Type v → Type w} [Monad m] (f : UInt8 → m PUnit) (as : ByteSlice) : m PUnit :=
let rec loop (i : Nat) : m PUnit :=
if h : i < as.size then do
f as[i]
loop (i + 1)
else
pure ⟨⟩
loop 0
/--
Folds an operation from right to left over the bytes in a byte slice.
An accumulator of type `β` is constructed by starting with `init` and combining each byte of the
byte slice with the current accumulator value in turn, moving from the end to the start.
Examples:
* `(ByteArray.mk #[1, 2, 3]).toByteSlice.foldr (·.toNat + ·) 0 = 6`
* `(ByteArray.mk #[1, 2, 3]).toByteSlice.popFront.foldr (·.toNat + ·) 0 = 5`
-/
@[inline]
def foldr {β : Type v} (f : UInt8 → β → β) (init : β) (as : ByteSlice) : β :=
Id.run <| as.foldrM (pure <| f · ·) (init := init)
/--
Creates a sub-slice of the byte slice with the given bounds.
If `start` or `stop` are not valid bounds for a sub-slice, then they are clamped to the slice's size.
Additionally, the starting index is clamped to the ending index.
The indices are relative to the current slice, not the underlying byte array.
-/
def slice (s : ByteSlice) (start : Nat := 0) (stop : Nat := s.size) : ByteSlice :=
let actualStart := s.start + min start s.size
let actualStop := s.start + min stop s.size
if h₂ : actualStop ≤ s.byteArray.size then
if h₁ : actualStart ≤ actualStop then
⟨{ byteArray := s.byteArray,
start := actualStart,
stop := actualStop,
start_le_stop := h₁,
stop_le_size_byteArray := h₂ }⟩
else
⟨{ byteArray := s.byteArray,
start := actualStop,
stop := actualStop,
start_le_stop := Nat.le_refl _,
stop_le_size_byteArray := h₂ }⟩
else
⟨{ byteArray := s.byteArray,
start := s.start,
stop := s.start,
start_le_stop := Nat.le_refl _,
stop_le_size_byteArray := Nat.le_trans (Nat.le_refl _) (Nat.le_trans s.start_le_stop s.stop_le_size_byteArray) }⟩
/--
Checks if the byte slice contains a specific byte value.
Returns `true` if any byte in the slice equals the given value, `false` otherwise.
-/
def contains (s : ByteSlice) (byte : UInt8) : Bool :=
let rec loop (i : Nat) : Bool :=
if h : i < s.size then
if s[i] == byte then true
else loop (i + 1)
else
false
loop 0
end ByteSlice
namespace ByteArray
/--
Returns a byte slice of a byte array, with the given bounds.
If `start` or `stop` are not valid bounds for a byte slice, then they are clamped to byte array's size.
Additionally, the starting index is clamped to the ending index.
-/
def toByteSlice (as : ByteArray) (start : Nat := 0) (stop : Nat := as.size) : ByteSlice :=
if h₂ : stop ≤ as.size then
if h₁ : start ≤ stop then
⟨{ byteArray := as, start := start, stop := stop,
start_le_stop := h₁, stop_le_size_byteArray := h₂ }⟩
else
⟨{ byteArray := as, start := stop, stop := stop,
start_le_stop := Nat.le_refl _, stop_le_size_byteArray := h₂ }⟩
else
if h₁ : start ≤ as.size then
⟨{ byteArray := as,
start := start,
stop := as.size,
start_le_stop := h₁,
stop_le_size_byteArray := Nat.le_refl _ }⟩
else
⟨{ byteArray := as,
start := as.size,
stop := as.size,
start_le_stop := Nat.le_refl _,
stop_le_size_byteArray := Nat.le_refl _ }⟩
end ByteArray
open Std Slice PRange
variable {shape : RangeShape} {α : Type u}
instance [ClosedOpenIntersection shape Nat] : Sliceable shape ByteArray Nat ByteSlice where
mkSlice xs range :=
let halfOpenRange := ClosedOpenIntersection.intersection range 0...<xs.size
(xs.toByteSlice halfOpenRange.lower halfOpenRange.upper)
instance [ClosedOpenIntersection shape Nat] : Sliceable shape ByteSlice Nat ByteSlice where
mkSlice xs range :=
let halfOpenRange := ClosedOpenIntersection.intersection range 0...<xs.size
(xs.slice halfOpenRange.lower halfOpenRange.upper)

View file

@ -12,21 +12,60 @@ import Init.Data.String.Basic
public section
namespace Std.Internal
namespace Std
namespace Internal
namespace Parsec
/--
Represents an error that can occur during parsing.
-/
inductive Error where
/--
Indicates that the parser reached the end of the input unexpectedly.
-/
| eof
/--
Represents any other kind of parsing error with an associated message.
-/
| other (s : String)
deriving Repr
instance : ToString Error where
toString
| .eof => "unexpected end of input"
| .other s => s
/--
The result of parsing some string.
-/
inductive ParseResult (α : Type) (ι : Type) where
/--
Parsing succeeded, returning the new position `pos` and the parsed result `res`.
-/
| success (pos : ι) (res : α)
| error (pos : ι) (err : String)
deriving Repr
/--
Parsing failed, returning the position `pos` where the error occurred and the error `err`.
-/
| error (pos : ι) (err : Error)
deriving Repr
end Parsec
@[expose] def Parsec (ι : Type) (α : Type) : Type := ι → Parsec.ParseResult α ι
/--
A `Parsec ι α` represents a parser that consumes input of type `ι` and, produces a
`ParseResult` containing a value of type `α` (the result of parsing) and the remaining input.
-/
@[expose]
def Parsec (ι : Type) (α : Type) : Type :=
ι → Parsec.ParseResult α ι
namespace Parsec
/--
Interface for an input iterator with position tracking and lookahead support.
-/
class Input (ι : Type) (elem : outParam Type) (idx : outParam Type) [DecidableEq idx] [DecidableEq elem] where
pos : ι → idx
next : ιι
@ -39,27 +78,28 @@ variable {α : Type} {ι : Type} {elem : Type} {idx : Type}
variable [DecidableEq idx] [DecidableEq elem] [Input ι elem idx]
instance : Inhabited (Parsec ι α) where
default := fun it => .error it ""
default := fun it => ParseResult.error it (.other "")
@[always_inline, inline]
protected def pure (a : α) : Parsec ι α := fun it =>
.success it a
@[always_inline, inline]
def bind {α β : Type} (f : Parsec ι α) (g : α → Parsec ι β) : Parsec ι β := fun it =>
protected def bind {α β : Type} (f : Parsec ι α) (g : α → Parsec ι β) : Parsec ι β := fun it =>
match f it with
| .success rem a => g a rem
| .error pos msg => .error pos msg
@[always_inline]
instance : Monad (Parsec ι) where
pure := Parsec.pure
bind := Parsec.bind
/--
Parser that always fails with the given error message.
-/
@[always_inline, inline]
def fail (msg : String) : Parsec ι α := fun it =>
.error it msg
.error it (.other msg)
/--
Try `p`, then decide what to do based on success or failure without consuming input on failure.
-/
@[inline]
def tryCatch (p : Parsec ι α) (csuccess : α → Parsec ι β) (cerror : Unit → Parsec ι β)
: Parsec ι β := fun it =>
@ -69,10 +109,21 @@ def tryCatch (p : Parsec ι α) (csuccess : α → Parsec ι β) (cerror : Unit
-- We assume that it.s never changes as the `Parsec` monad only modifies `it.pos`.
if Input.pos it = Input.pos rem then cerror () rem else .error rem err
@[always_inline]
instance : Monad (Parsec ι) where
pure := Parsec.pure
bind := Parsec.bind
/--
Try `p`, and if it fails without consuming input, run `q ()` instead.
-/
@[always_inline, inline]
def orElse (p : Parsec ι α) (q : Unit → Parsec ι α) : Parsec ι α :=
tryCatch p pure q
/--
Attempt to parse with `p`, but don't consume input on failure.
-/
@[always_inline, inline]
def attempt (p : Parsec ι α) : Parsec ι α := fun it =>
match p it with
@ -84,12 +135,13 @@ instance : Alternative (Parsec ι) where
failure := fail ""
orElse := orElse
def expectedEndOfInput := "expected end of input"
/--
Succeeds only if input is at end-of-file.
-/
@[inline]
def eof : Parsec ι Unit := fun it =>
if Input.hasNext it then
.error it expectedEndOfInput
.error it (.other "expected end of input")
else
.success it ()
@ -107,8 +159,9 @@ def many (p : Parsec ι α) : Parsec ι <| Array α := manyCore p #[]
@[inline]
def many1 (p : Parsec ι α) : Parsec ι <| Array α := do manyCore p #[← p]
def unexpectedEndOfInput := "unexpected end of input"
/--
Gets the next input element.
-/
@[inline]
def any : Parsec ι elem := fun it =>
if h : Input.hasNext it then
@ -116,19 +169,28 @@ def any : Parsec ι elem := fun it =>
let it' := Input.next' it h
.success it' c
else
.error it unexpectedEndOfInput
.error it .eof
/--
Checks if the next input element matches some condition.
-/
@[inline]
def satisfy (p : elem → Bool) : Parsec ι elem := attempt do
let c ← any
if p c then return c else fail "condition not satisfied"
/--
Fails if `p` succeeds, otherwise succeeds without consuming input.
-/
@[inline]
def notFollowedBy (p : Parsec ι α) : Parsec ι Unit := fun it =>
match p it with
| .success _ _ => .error it ""
| .success _ _ => .error it (.other "")
| .error _ _ => .success it ()
/--
Peeks at the next element, returns `some` if exists else `none`, does not consume input.
-/
@[inline]
def peek? : Parsec ι (Option elem) := fun it =>
if h : Input.hasNext it then
@ -136,13 +198,32 @@ def peek? : Parsec ι (Option elem) := fun it =>
else
.success it none
/--
Peeks at the next element, returns `some elem` if it satisfies `p`, else `none`. Does not consume input.
-/
@[inline]
def peekWhen? (p : elem → Bool) : Parsec ι (Option elem) := do
let some data ← peek?
| return none
if p data then
return some data
else
return none
/--
Peeks at the next element, errors on EOF, does not consume input.
-/
@[inline]
def peek! : Parsec ι elem := fun it =>
if h : Input.hasNext it then
.success it (Input.curr' it h)
else
.error it unexpectedEndOfInput
.error it .eof
/--
Peeks at the next element or returns a default if at EOF, does not consume input.
-/
@[inline]
def peekD (default : elem) : Parsec ι elem := fun it =>
if h : Input.hasNext it then
@ -150,23 +231,37 @@ def peekD (default : elem) : Parsec ι elem := fun it =>
else
.success it default
/--
Consumes one element if available, otherwise errors on EOF.
-/
@[inline]
def skip : Parsec ι Unit := fun it =>
if h : Input.hasNext it then
.success (Input.next' it h) ()
else
.error it unexpectedEndOfInput
.error it .eof
/--
Parses zero or more chars with `p`, accumulates into a string.
-/
@[specialize]
partial def manyCharsCore (p : Parsec ι Char) (acc : String) : Parsec ι String :=
tryCatch p (manyCharsCore p <| acc.push ·) (fun _ => pure acc)
/--
Parses zero or more chars with `p` into a string.
-/
@[inline]
def manyChars (p : Parsec ι Char) : Parsec ι String := manyCharsCore p ""
def manyChars (p : Parsec ι Char) : Parsec ι String := do
manyCharsCore p ""
/--
Parses one or more chars with `p` into a string, errors if none.
-/
@[inline]
def many1Chars (p : Parsec ι Char) : Parsec ι String := do manyCharsCore p (← p).toString
def many1Chars (p : Parsec ι Char) : Parsec ι String := do
manyCharsCore p (← p).toString
end Parsec
end Std.Internal
end Internal
end Std

View file

@ -9,10 +9,12 @@ prelude
public import Std.Internal.Parsec.Basic
public import Init.Data.ByteArray.Basic
public import Init.Data.String.Extra
public import Std.Data.ByteSlice
public section
namespace Std.Internal
namespace Std
namespace Internal
namespace Parsec
namespace ByteArray
@ -24,29 +26,52 @@ instance : Input ByteArray.Iterator UInt8 Nat where
next' it := it.next'
curr' it := it.curr'
/--
`Parser α` is a parser that consumes a `ByteArray` input using a `ByteArray.Iterator` and returns a result of type `α`.
-/
abbrev Parser (α : Type) : Type := Parsec ByteArray.Iterator α
/--
Run a `Parser` on a `ByteArray`, returns either the result or an error string with offset.
-/
protected def Parser.run (p : Parser α) (arr : ByteArray) : Except String α :=
match p arr.iter with
| .success _ res => Except.ok res
| .error it err => Except.error s!"offset {repr it.pos}: {err}"
| .error it err => Except.error s!"offset {repr it.pos}: {err}"
/--
Parse a single byte equal to `b`, fails if different.
-/
@[inline]
def pbyte (b : UInt8) : Parser UInt8 := attempt do
if (← any) = b then pure b else fail s!"expected: '{b}'"
/--
Skip a single byte equal to `b`, fails if different.
-/
@[inline]
def skipByte (b : UInt8) : Parser Unit := pbyte b *> pure ()
def skipByte (b : UInt8) : Parser Unit :=
pbyte b *> pure ()
/--
Skip a sequence of bytes equal to the given `ByteArray`.
-/
def skipBytes (arr : ByteArray) : Parser Unit := do
for b in arr do
skipByte b
/--
Parse a string by matching its UTF-8 bytes, returns the string on success.
-/
@[inline]
def pstring (s : String) : Parser String := do
skipBytes s.toUTF8
return s
/--
Skip a string by matching its UTF-8 bytes.
-/
@[inline]
def skipString (s : String) : Parser Unit := pstring s *> pure ()
@ -63,14 +88,24 @@ Skip a `Char` that can be represented in 1 byte. If `c` uses more than 1 byte it
@[inline]
def skipByteChar (c : Char) : Parser Unit := skipByte c.toUInt8
/--
Parse an ASCII digit `0-9` as a `Char`.
-/
@[inline]
def digit : Parser Char := attempt do
let b ← any
if '0'.toUInt8 ≤ b ∧ b ≤ '9'.toUInt8 then return Char.ofUInt8 b else fail s!"digit expected"
/--
Convert a byte representing `'0'..'9'` to a `Nat`.
-/
@[inline]
private def digitToNat (b : UInt8) : Nat := (b - '0'.toUInt8).toNat
private def digitToNat (b : UInt8) : Nat :=
(b - '0'.toUInt8).toNat
/--
Parse zero or more ASCII digits into a `Nat`, continuing until non-digit or EOF.
-/
@[inline]
private partial def digitsCore (acc : Nat) : Parser Nat := fun it =>
/-
@ -92,11 +127,17 @@ where
else
(acc, it)
/--
Parse one or more ASCII digits into a `Nat`.
-/
@[inline]
def digits : Parser Nat := do
let d ← digit
digitsCore (digitToNat d.toUInt8)
/--
Parse a hex digit `0-9`, `a-f`, or `A-F` as a `Char`.
-/
@[inline]
def hexDigit : Parser Char := attempt do
let b ← any
@ -104,6 +145,20 @@ def hexDigit : Parser Char := attempt do
('a'.toUInt8 ≤ b ∧ b ≤ 'f'.toUInt8)
('A'.toUInt8 ≤ b ∧ b ≤ 'F'.toUInt8) then return Char.ofUInt8 b else fail s!"hex digit expected"
/--
Parse an octal digit `0-7` as a `Char`.
-/
@[inline]
def octDigit : Parser Char := attempt do
let b ← any
if '0'.toUInt8 ≤ b ∧ b ≤ '7'.toUInt8 then
return Char.ofUInt8 b
else
fail s!"octal digit expected"
/--
Parse an ASCII letter `a-z` or `A-Z` as a `Char`.
-/
@[inline]
def asciiLetter : Parser Char := attempt do
let b ← any
@ -122,17 +177,125 @@ private partial def skipWs (it : ByteArray.Iterator) : ByteArray.Iterator :=
else
it
/--
Skip whitespace: tabs, newlines, carriage returns, and spaces.
-/
@[inline]
def ws : Parser Unit := fun it =>
.success (skipWs it) ()
def take (n : Nat) : Parser ByteArray := fun it =>
let subarr := it.array.extract it.idx (it.idx + n)
if subarr.size != n then
.error it s!"expected: {n} bytes"
/--
Parse `n` bytes from the input into a `ByteSlice`, errors if not enough bytes.
-/
def take (n : Nat) : Parser ByteSlice := fun it =>
if it.remainingBytes < n then
.error it .eof
else
.success (it.forward n) subarr
.success (it.forward n) (it.array[it.idx...(it.idx+n)])
/--
Parses while a predicate is satisfied.
-/
@[inline]
partial def takeWhile (pred : UInt8 → Bool) : Parser ByteSlice :=
fun it =>
let rec findEnd (count : Nat) (iter : ByteArray.Iterator) : Nat × ByteArray.Iterator :=
if ¬iter.hasNext then (count, iter)
else if pred iter.curr then findEnd (count + 1) iter.next
else (count, iter)
let (length, newIt) := findEnd 0 it
.success newIt (it.array[it.idx...(it.idx + length)])
/--
Parses until a predicate is satisfied (exclusive).
-/
@[inline]
def takeUntil (pred : UInt8 → Bool) : Parser ByteSlice :=
takeWhile (fun b => ¬pred b)
/--
Skips while a predicate is satisfied.
-/
@[inline]
partial def skipWhile (pred : UInt8 → Bool) : Parser Unit :=
fun it =>
let rec findEnd (count : Nat) (iter : ByteArray.Iterator) : ByteArray.Iterator :=
if ¬iter.hasNext then iter
else if pred iter.curr then findEnd (count + 1) iter.next
else iter
.success (findEnd 0 it) ()
/--
Skips until a predicate is satisfied.
-/
@[inline]
def skipUntil (pred : UInt8 → Bool) : Parser Unit :=
skipWhile (fun b => ¬pred b)
/--
Parses while a predicate is satisfied, up to a given limit.
-/
@[inline]
partial def takeWhileUpTo (pred : UInt8 → Bool) (limit : Nat) : Parser ByteSlice :=
fun it =>
let rec findEnd (count : Nat) (iter : ByteArray.Iterator) : Nat × ByteArray.Iterator :=
if count ≥ limit then (count, iter)
else if ¬iter.hasNext then (count, iter)
else if pred iter.curr then findEnd (count + 1) iter.next
else (count, iter)
let (length, newIt) := findEnd 0 it
.success newIt (it.array[it.idx...(it.idx + length)])
/--
Parses while a predicate is satisfied, up to a given limit, requiring at least one byte.
-/
@[inline]
def takeWhileUpTo1 (pred : UInt8 → Bool) (limit : Nat) : Parser ByteSlice :=
fun it =>
let rec findEnd (count : Nat) (iter : ByteArray.Iterator) : Nat × ByteArray.Iterator :=
if count ≥ limit then (count, iter)
else if ¬iter.hasNext then (count, iter)
else if pred iter.curr then findEnd (count + 1) iter.next
else (count, iter)
let (length, newIt) := findEnd 0 it
if length = 0 then
.error it (if newIt.atEnd then .eof else .other "expected at least one char")
else
.success newIt (it.array[it.idx...(it.idx + length)])
/--
Parses until a predicate is satisfied (exclusive), up to a given limit.
-/
@[inline]
def takeUntilUpTo (pred : UInt8 → Bool) (limit : Nat) : Parser ByteSlice :=
takeWhileUpTo (fun b => ¬pred b) limit
/--
Skips while a predicate is satisfied, up to a given limit.
-/
@[inline]
partial def skipWhileUpTo (pred : UInt8 → Bool) (limit : Nat) : Parser Unit :=
fun it =>
let rec findEnd (count : Nat) (iter : ByteArray.Iterator) : ByteArray.Iterator :=
if count ≥ limit then iter
else if ¬iter.hasNext then iter
else if pred iter.curr then findEnd (count + 1) iter.next
else iter
.success (findEnd 0 it) ()
/--
Skips until a predicate is satisfied, up to a given limit.
-/
@[inline]
def skipUntilUpTo (pred : UInt8 → Bool) (limit : Nat) : Parser Unit :=
skipWhileUpTo (fun b => ¬pred b) limit
end ByteArray
end Parsec
end Std.Internal
end Internal
end Std

View file

@ -23,36 +23,59 @@ instance : Input String.Iterator Char String.Pos where
next' it := it.next'
curr' it := it.curr'
/--
`Parser α` is a parser that consumes a `String` input using a `String.Iterator` and returns a result of type `α`.
-/
abbrev Parser (α : Type) : Type := Parsec String.Iterator α
/--
Run a `Parser` on a `String`, returns either the result or an error string with offset.
-/
protected def Parser.run (p : Parser α) (s : String) : Except String α :=
match p s.mkIterator with
| .success _ res => Except.ok res
| .error it err => Except.error s!"offset {repr it.i.byteIdx}: {err}"
| .error it err => Except.error s!"offset {repr it.pos.byteIdx}: {err}"
/-- Parses the given string. -/
/--
Parses the given string.
-/
def pstring (s : String) : Parser String := fun it =>
let substr := it.extract (it.forward s.length)
if substr = s then
.success (it.forward s.length) substr
else
.error it s!"expected: {s}"
.error it (.other s!"expected: {s}")
/--
Skips the given string.
-/
@[inline]
def skipString (s : String) : Parser Unit := pstring s *> pure ()
/--
Parses the given char.
-/
@[inline]
def pchar (c : Char) : Parser Char := attempt do
if (← any) = c then pure c else fail s!"expected: '{c}'"
/--
Skips the given char.
-/
@[inline]
def skipChar (c : Char) : Parser Unit := pchar c *> pure ()
/--
Parse an ASCII digit `0-9` as a `Char`.
-/
@[inline]
def digit : Parser Char := attempt do
let c ← any
if '0' ≤ c ∧ c ≤ '9' then return c else fail s!"digit expected"
/--
Convert a byte representing `'0'..'9'` to a `Nat`.
-/
@[inline]
private def digitToNat (b : Char) : Nat := b.toNat - '0'.toNat
@ -77,11 +100,17 @@ where
else
(acc, it)
/--
Parse one or more ASCII digits into a `Nat`.
-/
@[inline]
def digits : Parser Nat := do
let d ← digit
digitsCore (digitToNat d)
/--
Parse a hex digit `0-9`, `a-f`, or `A-F` as a `Char`.
-/
@[inline]
def hexDigit : Parser Char := attempt do
let c ← any
@ -89,6 +118,9 @@ def hexDigit : Parser Char := attempt do
('a' ≤ c ∧ c ≤ 'f')
('A' ≤ c ∧ c ≤ 'F') then return c else fail s!"hex digit expected"
/--
Parse an ASCII letter `a-z` or `A-Z` as a `Char`.
-/
@[inline]
def asciiLetter : Parser Char := attempt do
let c ← any
@ -104,14 +136,20 @@ private partial def skipWs (it : String.Iterator) : String.Iterator :=
else
it
/--
Skip whitespace: tabs, newlines, carriage returns, and spaces.
-/
@[inline]
def ws : Parser Unit := fun it =>
.success (skipWs it) ()
/--
Takes a fixed amount of chars from the iterator.
-/
def take (n : Nat) : Parser String := fun it =>
let substr := it.extract (it.forward n)
if substr.length != n then
.error it s!"expected: {n} codepoints"
.error it .eof
else
.success (it.forward n) substr

View file

@ -205,10 +205,10 @@ private def manyN (n : Nat) (p : Parser α) : Parser (Array α) := do
result := result.push x
return result
private def pu64 : Parser UInt64 := ByteArray.toUInt64LE! <$> take 8
private def pi64 : Parser Int64 := toInt64 <$> take 8
private def pu32 : Parser UInt32 := toUInt32 <$> take 4
private def pi32 : Parser Int32 := toInt32 <$> take 4
private def pu64 : Parser UInt64 := ByteArray.toUInt64LE! <$> ByteSlice.toByteArray <$> take 8
private def pi64 : Parser Int64 := toInt64 <$> ByteSlice.toByteArray <$> take 8
private def pu32 : Parser UInt32 := toUInt32 <$> ByteSlice.toByteArray <$> take 4
private def pi32 : Parser Int32 := toInt32 <$> ByteSlice.toByteArray <$> take 4
private def pu8 : Parser UInt8 := any
private def pbool : Parser Bool := (· != 0) <$> pu8

View file

@ -1,6 +1,6 @@
set(RUNTIME_OBJS debug.cpp thread.cpp mpz.cpp utf8.cpp
object.cpp apply.cpp exception.cpp interrupt.cpp memory.cpp
stackinfo.cpp compact.cpp init_module.cpp io.cpp hash.cpp
stackinfo.cpp compact.cpp init_module.cpp io.cpp hash.cpp byteslice.cpp
platform.cpp alloc.cpp allocprof.cpp sharecommon.cpp stack_overflow.cpp
process.cpp object_ref.cpp mpn.cpp mutex.cpp libuv.cpp uv/net_addr.cpp uv/event_loop.cpp
uv/timer.cpp uv/tcp.cpp uv/udp.cpp uv/dns.cpp uv/system.cpp)

38
src/runtime/byteslice.cpp Normal file
View file

@ -0,0 +1,38 @@
/*
Copyright (c) 2025 Lean FRO. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Sofia Rodrigues
*/
#include <lean/lean.h>
#include "runtime/byteslice.h"
#include "runtime/io.h"
#include "runtime/object.h"
#include "runtime/thread.h"
namespace lean {
extern "C" LEAN_EXPORT uint8_t lean_byteslice_beq(b_obj_arg a, b_obj_arg b) {
if (a == b) { return true; }
lean_object* bytearray_a = lean_ctor_get(a, 0);
size_t start_a = lean_unbox(lean_ctor_get(a, 1));
size_t end_a = lean_unbox(lean_ctor_get(a, 2));
lean_object* bytearray_b = lean_ctor_get(b, 0);
size_t start_b = lean_unbox(lean_ctor_get(b, 1));
size_t end_b = lean_unbox(lean_ctor_get(b, 2));
size_t size_a = end_a - start_a;
size_t size_b = end_b - start_b;
if (size_a != size_b) { return false; }
if (size_a == 0) { return true; }
const uint8_t* ptr_a = lean_sarray_cptr(bytearray_a) + start_a;
const uint8_t* ptr_b = lean_sarray_cptr(bytearray_b) + start_b;
return memcmp(ptr_a, ptr_b, size_a) == 0;
}
}

8
src/runtime/byteslice.h Normal file
View file

@ -0,0 +1,8 @@
/*
Copyright (c) 2025 Lean FRO. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Sofia Rodrigues
*/
#pragma once
#include <string.h>

View file

@ -0,0 +1,343 @@
import Std.Data.ByteSlice
-- Test data
def ba := ByteArray.mk #[1,2,3,4,5,6,7,8,9,10]
def bb := ByteArray.mk #[1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10]
def emptyArray := ByteArray.mk #[]
def singleByte := ByteArray.mk #[42]
-- Basic slicing tests (existing)
/--
info: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-/
#guard_msgs in
#eval bb[10...20] |>.toByteArray
/--
info: [3, 4]
-/
#guard_msgs in
#eval bb[10...20][2...4] |>.toByteArray
/--
info: 4
-/
#guard_msgs in
#eval bb[10...20][2...4] |>.get! 1
/--
info: true
-/
#guard_msgs in
#eval bb[10...20][2...4] |>.contains 4
-- Size function tests
/--
info: 10
-/
#guard_msgs in
#eval ba.toByteSlice.size
/--
info: 0
-/
#guard_msgs in
#eval emptyArray.toByteSlice.size
/--
info: 1
-/
#guard_msgs in
#eval singleByte.toByteSlice.size
/--
info: 5
-/
#guard_msgs in
#eval ba[2...7].size
-- Element access tests
/--
info: 1
-/
#guard_msgs in
#eval ba[0...5][0]!
/--
info: 5
-/
#guard_msgs in
#eval ba[0...5][4]!
-- getD (get with default) tests
/--
info: 3
-/
#guard_msgs in
#eval ba.toByteSlice.getD 2 99
/--
info: 99
-/
#guard_msgs in
#eval ba.toByteSlice.getD 15 99
/--
info: 255
-/
#guard_msgs in
#eval emptyArray.toByteSlice.getD 0 255
-- get! tests (with bounds checking)
/--
info: 0
-/
#guard_msgs in
#eval ba.toByteSlice.get! 15
/--
info: 0
-/
#guard_msgs in
#eval emptyArray.toByteSlice.get! 0
/--
info: 42
-/
#guard_msgs in
#eval singleByte.toByteSlice.get! 0
-- Contains function tests
/--
info: true
-/
#guard_msgs in
#eval ba.toByteSlice.contains 5
/--
info: false
-/
#guard_msgs in
#eval ba.toByteSlice.contains 15
/--
info: false
-/
#guard_msgs in
#eval emptyArray.toByteSlice.contains 1
/--
info: true
-/
#guard_msgs in
#eval bb[5...15].contains 10
-- Equality tests
/--
info: true
-/
#guard_msgs in
#eval ba[0...5] == ba[0...5]
/--
info: true
-/
#guard_msgs in
#eval ba[0...3].toByteArray == bb[0...3].toByteArray
/--
info: false
-/
#guard_msgs in
#eval ba[0...3] == ba[1...4]
/--
info: true
-/
#guard_msgs in
#eval ByteSlice.empty == emptyArray.toByteSlice
-- Edge case: empty slices
/--
info: 0
-/
#guard_msgs in
#eval ByteSlice.empty.size
/--
info: []
-/
#guard_msgs in
#eval ByteSlice.empty.toByteArray
/--
info: false
-/
#guard_msgs in
#eval ByteSlice.empty.contains 0
-- Edge case: slice bounds clamping
/--
info: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-/
#guard_msgs in
#eval ba[0...100].toByteArray
/--
info: []
-/
#guard_msgs in
#eval ba[100...200].toByteArray
/--
info: [6, 7, 8, 9, 10]
-/
#guard_msgs in
#eval ba[5...100].toByteArray
-- Foldr function tests
/--
info: 55
-/
#guard_msgs in
#eval ba.toByteSlice.foldr (fun b acc => b.toNat + acc) 0
/--
info: 0
-/
#guard_msgs in
#eval ByteSlice.empty.foldr (fun b acc => b.toNat + acc) 0
/--
info: 42
-/
#guard_msgs in
#eval singleByte.toByteSlice.foldr (fun b acc => b.toNat + acc) 0
-- Nested slicing with element access
/--
info: 7
-/
#guard_msgs in
#eval bb[0...10][5...8][1]!
/--
info: true
-/
#guard_msgs in
#eval bb[0...10][5...8].contains 6
/--
info: false
-/
#guard_msgs in
#eval bb[0...10][5...8].contains 5
-- Size consistency tests
/--
info: 3
-/
#guard_msgs in
#eval bb[0...10][5...8].size
/--
info: 5
-/
#guard_msgs in
#eval bb[5...15][0...5].size
-- Boundary testing with single elements
/--
info: [10]
-/
#guard_msgs in
#eval ba[9...10].toByteArray
/--
info: 1
-/
#guard_msgs in
#eval ba[9...10].size
/--
info: 10
-/
#guard_msgs in
#eval ba[9...10][0]!
-- Out of bounds slice handling
/--
info: []
-/
#guard_msgs in
#eval ba[15...20].toByteArray
/--
info: 0
-/
#guard_msgs in
#eval ba[15...20].size
/--
info: []
-/
#guard_msgs in
#eval ba.toByteSlice.slice 8 3 |>.toByteArray
/--
info: 0
-/
#guard_msgs in
#eval ba.toByteSlice.slice 8 3 |>.size
/--
info: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-/
#guard_msgs in
#eval ba.toByteSlice.slice 0 10 |>.toByteArray
/--
info: true
-/
#guard_msgs in
#eval bb[0...10] == bb[10...20]
/--
info: [1, 2, 3]
-/
#guard_msgs in
#eval bb[10...13].toByteArray
-- Testing with different default values in getD
/--
info: 128
-/
#guard_msgs in
#eval ba[0...3].getD 10 128
/--
info: 2
-/
#guard_msgs in
#eval ba[0...3].getD 1 128
def maxByteArray := ByteArray.mk #[255, 0, 128, 1]
/--
info: true
-/
#guard_msgs in
#eval maxByteArray.toByteSlice.contains 255
/--
info: 384
-/
#guard_msgs in
#eval maxByteArray.toByteSlice.foldr (fun b acc => b.toNat + acc) 0
/--
info: [255, 0]
-/
#guard_msgs in
#eval maxByteArray[0...2].toByteArray