doc: docstring review for Substring (#7635)

This PR adds missing docstrings for `Substring` and makes the style of
`Substring` docstrings consistent.
This commit is contained in:
David Thrane Christiansen 2025-03-25 08:57:55 +01:00 committed by GitHub
parent 1465c23e12
commit b26516e33c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 235 additions and 20 deletions

View file

@ -905,7 +905,7 @@ termination_by stopPos.1 - i.1
/--
Folds a function over a string from the left, accumulating a value starting with `init`. The
accumulated value is combined with character in order, using `f`.
accumulated value is combined with each character in order, using `f`.
Examples:
* `"coffee tea water".foldl (fun n c => if c.isWhitespace then n + 1 else n) 0 = 2`
@ -927,7 +927,7 @@ termination_by i.1
/--
Folds a function over a string from the right, accumulating a value starting with `init`. The
accumulated value is combined with the each character in reverse order, using `f`.
accumulated value is combined with each character in reverse order, using `f`.
Examples:
* `"coffee tea water".foldr (fun c n => if c.isWhitespace then n + 1 else n) 0 = 2`
@ -1166,21 +1166,46 @@ end String
namespace Substring
/--
Checks whether a substring is empty.
A substring is empty if its start and end positions are the same.
-/
@[inline] def isEmpty (ss : Substring) : Bool :=
ss.bsize == 0
/--
Copies the region of the underlying string pointed to by a substring into a fresh string.
-/
@[inline] def toString : Substring → String
| ⟨s, b, e⟩ => s.extract b e
/--
Returns an iterator into the underlying string, at the substring's starting position. The ending
position is discarded, so the iterator alone cannot be used to determine whether its current
position is within the original substring.
-/
@[inline] def toIterator : Substring → String.Iterator
| ⟨s, b, _⟩ => ⟨s, b⟩
/-- Return the codepoint at the given offset into the substring. -/
/--
Returns the character at the given position in the substring.
The position is relative to the substring, rather than the underlying string, and no bounds checking
is performed with respect to the substring's end position. If the relative position is not a valid
position in the underlying string, the fallback value `(default : Char)`, which is `'A'`, is
returned. Does not panic.
-/
@[inline] def get : Substring → String.Pos → Char
| ⟨s, b, _⟩, p => s.get (b+p)
/-- Given an offset of a codepoint into the substring,
return the offset there of the next codepoint. -/
/--
Returns the next position in a substring after the given position. If the position is at the end of
the substring, it is returned unmodified.
Both the input position and the returned position are interpreted relative to the substring's start
position, not the underlying string.
-/
@[inline] def next : Substring → String.Pos → String.Pos
| ⟨s, b, e⟩, p =>
let absP := b+p
@ -1195,48 +1220,124 @@ theorem lt_next (s : Substring) (i : String.Pos) (h : i.1 < s.bsize) :
apply Nat.lt_sub_of_add_lt
rw [Nat.add_comm]; apply String.lt_next
/-- Given an offset of a codepoint into the substring,
return the offset there of the previous codepoint. -/
/--
Returns the previous position in a substring, just prior to the given position. If the position is
at the beginning of the substring, it is returned unmodified.
Both the input position and the returned position are interpreted relative to the substring's start
position, not the underlying string.
-/
@[inline] def prev : Substring → String.Pos → String.Pos
| ⟨s, b, _⟩, p =>
let absP := b+p
if absP = b then p else { byteIdx := (s.prev absP).byteIdx - b.byteIdx }
/--
Returns the position that's the specified number of characters forward from the given position in a
substring. If the end position of the substring is reached, it is returned.
Both the input position and the returned position are interpreted relative to the substring's start
position, not the underlying string.
-/
def nextn : Substring → Nat → String.Pos → String.Pos
| _, 0, p => p
| ss, i+1, p => ss.nextn i (ss.next p)
/--
Returns the position that's the specified number of characters prior to the given position in a
substring. If the start position of the substring is reached, it is returned.
Both the input position and the returned position are interpreted relative to the substring's start
position, not the underlying string.
-/
def prevn : Substring → Nat → String.Pos → String.Pos
| _, 0, p => p
| ss, i+1, p => ss.prevn i (ss.prev p)
/--
Returns the first character in the substring.
If the substring is empty, but the substring's start position is a valid position in the underlying
string, then the character at the start position is returned. If the substring's start position is
not a valid position in the string, the fallback value `(default : Char)`, which is `'A'`, is
returned. Does not panic.
-/
@[inline] def front (s : Substring) : Char :=
s.get 0
/-- Return the offset into `s` of the first occurrence of `c` in `s`,
or `s.bsize` if `c` doesn't occur. -/
/--
Returns the substring-relative position of the first occurrence of `c` in `s`, or `s.bsize` if `c`
doesn't occur.
-/
@[inline] def posOf (s : Substring) (c : Char) : String.Pos :=
match s with
| ⟨s, b, e⟩ => { byteIdx := (String.posOfAux s c e b).byteIdx - b.byteIdx }
/--
Removes the specified number of characters (Unicode code points) from the beginning of a substring
by advancing its start position.
If the substring's end position is reached, the start position is not advanced past it.
-/
@[inline] def drop : Substring → Nat → Substring
| ss@⟨s, b, e⟩, n => ⟨s, b + ss.nextn n 0, e⟩
/--
Removes the specified number of characters (Unicode code points) from the end of a substring
by moving its end position towards its start position.
If the substring's start position is reached, the end position is not retracted past it.
-/
@[inline] def dropRight : Substring → Nat → Substring
| ss@⟨s, b, _⟩, n => ⟨s, b, b + ss.prevn n ⟨ss.bsize⟩⟩
/--
Retains only the specified number of characters (Unicode code points) at the beginning of a
substring, by moving its end position towards its start position.
If the substring's start position is reached, the end position is not retracted past it.
-/
@[inline] def take : Substring → Nat → Substring
| ss@⟨s, b, _⟩, n => ⟨s, b, b + ss.nextn n 0⟩
/--
Retains only the specified number of characters (Unicode code points) at the end of a substring, by
moving its start position towards its end position.
If the substring's end position is reached, the start position is not advanced past it.
-/
@[inline] def takeRight : Substring → Nat → Substring
| ss@⟨s, b, e⟩, n => ⟨s, b + ss.prevn n ⟨ss.bsize⟩, e⟩
/--
Checks whether a position in a substring is precisely equal to its ending position.
The position is understood relative to the substring's starting position, rather than the underlying
string's starting position.
-/
@[inline] def atEnd : Substring → String.Pos → Bool
| ⟨_, b, e⟩, p => b + p == e
/--
Returns the region of the substring delimited by the provided start and stop positions, as a
substring. The positions are interpreted with respect to the substring's start position, rather than
the underlying string.
If the resulting substring is empty, then the resulting substring is a substring of the empty string
`""`. Otherwise, the underlying string is that of the input substring with the beginning and end
positions adjusted.
-/
@[inline] def extract : Substring → String.Pos → String.Pos → Substring
| ⟨s, b, e⟩, b', e' => if b' ≥ e' then ⟨"", 0, 0⟩ else ⟨s, e.min (b+b'), e.min (b+e')⟩
/--
Splits a substring `s` on occurrences of the separator string `sep`. The default separator is `" "`.
When `sep` is empty, the result is `[s]`. When `sep` occurs in overlapping patterns, the first match
is taken. There will always be exactly `n+1` elements in the returned list if there were `n`
non-overlapping matches of `sep` in the string. The separators are not included in the returned
substrings, which are all substrings of `s`'s string.
-/
def splitOn (s : Substring) (sep : String := " ") : List Substring :=
if sep == "" then
[s]
@ -1262,21 +1363,42 @@ def splitOn (s : Substring) (sep : String := " ") : List Substring :=
termination_by s.bsize - i.1
loop 0 0 0 []
/--
Folds a function over a substring from the left, accumulating a value starting with `init`. The
accumulated value is combined with each character in order, using `f`.
-/
@[inline] def foldl {α : Type u} (f : α → Char → α) (init : α) (s : Substring) : α :=
match s with
| ⟨s, b, e⟩ => String.foldlAux f s e b init
/--
Folds a function over a substring from the right, accumulating a value starting with `init`. The
accumulated value is combined with each character in reverse order, using `f`.
-/
@[inline] def foldr {α : Type u} (f : Char → αα) (init : α) (s : Substring) : α :=
match s with
| ⟨s, b, e⟩ => String.foldrAux f init s e b
/--
Checks whether the Boolean predicate `p` returns `true` for any character in a substring.
Short-circuits at the first character for which `p` returns `true`.
-/
@[inline] def any (s : Substring) (p : Char → Bool) : Bool :=
match s with
| ⟨s, b, e⟩ => String.anyAux s e p b
/--
Checks whether the Boolean predicate `p` returns `true` for every character in a substring.
Short-circuits at the first character for which `p` returns `false`.
-/
@[inline] def all (s : Substring) (p : Char → Bool) : Bool :=
!s.any (fun c => !p c)
/--
Checks whether a substring contains the specified character.
-/
@[inline] def contains (s : Substring) (c : Char) : Bool :=
s.any (fun a => a == c)
@ -1289,11 +1411,21 @@ def splitOn (s : Substring) (sep : String := " ") : List Substring :=
else i
termination_by stopPos.1 - i.1
/--
Retains only the longest prefix of a substring in which a Boolean predicate returns `true` for all
characters by moving the substring's end position towards its start position.
-/
@[inline] def takeWhile : Substring → (Char → Bool) → Substring
| ⟨s, b, e⟩, p =>
let e := takeWhileAux s e p b;
⟨s, b, e⟩
/--
Removes the longest prefix of a substring in which a Boolean predicate returns `true` for all
characters by moving the substring's start position. The start position is moved to the position of
the first character for which the predicate returns `false`, or to the substring's end position if
the predicate always returns `true`.
-/
@[inline] def dropWhile : Substring → (Char → Bool) → Substring
| ⟨s, b, e⟩, p =>
let b := takeWhileAux s e p b;
@ -1310,28 +1442,75 @@ termination_by stopPos.1 - i.1
else i
termination_by i.1
/--
Retains only the longest suffix of a substring in which a Boolean predicate returns `true` for all
characters by moving the substring's start position towards its end position.
-/
@[inline] def takeRightWhile : Substring → (Char → Bool) → Substring
| ⟨s, b, e⟩, p =>
let b := takeRightWhileAux s b p e
⟨s, b, e⟩
/--
Removes the longest suffix of a substring in which a Boolean predicate returns `true` for all
characters by moving the substring's end position. The end position is moved just after the position
of the last character for which the predicate returns `false`, or to the substring's start position
if the predicate always returns `true`.
-/
@[inline] def dropRightWhile : Substring → (Char → Bool) → Substring
| ⟨s, b, e⟩, p =>
let e := takeRightWhileAux s b p e
⟨s, b, e⟩
/--
Removes leading whitespace from a substring by moving its start position to the first non-whitespace
character, or to its end position if there is no non-whitespace character.
“Whitespace” is defined as characters for which `Char.isWhitespace` returns `true`.
-/
@[inline] def trimLeft (s : Substring) : Substring :=
s.dropWhile Char.isWhitespace
/--
Removes trailing whitespace from a substring by moving its end position to the last non-whitespace
character, or to its start position if there is no non-whitespace character.
“Whitespace” is defined as characters for which `Char.isWhitespace` returns `true`.
-/
@[inline] def trimRight (s : Substring) : Substring :=
s.dropRightWhile Char.isWhitespace
/--
Removes leading and trailing whitespace from a substring by first moving its start position to the
first non-whitespace character, and then moving its end position to the last non-whitespace
character.
If the substring consists only of whitespace, then the resulting substring's start position is moved
to its end position.
“Whitespace” is defined as characters for which `Char.isWhitespace` returns `true`.
Examples:
* `" red green blue ".toSubstring.trim.toString = "red green blue"`
* `" red green blue ".toSubstring.trim.startPos = ⟨1⟩`
* `" red green blue ".toSubstring.trim.stopPos = ⟨15⟩`
* `" ".toSubstring.trim.startPos = ⟨5⟩`
-/
@[inline] def trim : Substring → Substring
| ⟨s, b, e⟩ =>
let b := takeWhileAux s e Char.isWhitespace b
let e := takeRightWhileAux s b Char.isWhitespace e
⟨s, b, e⟩
/--
Checks whether the substring can be interpreted as the decimal representation of a natural number.
A substring can be interpreted as a decimal natural number if it is not empty and all the characters
in it are digits.
Use `Substring.toNat?` to convert such a string to a natural number.
-/
@[inline] def isNat (s : Substring) : Bool :=
s.all fun c => c.isDigit
@ -1341,18 +1520,29 @@ def toNat? (s : Substring) : Option Nat :=
else
none
/--
Checks whether two substrings represent equal strings. Usually accessed via the `==` operator.
Two substrings do not need to have the same underlying string or the same start and end positions;
instead, they are equal if they contain the same sequence of characters.
-/
def beq (ss1 ss2 : Substring) : Bool :=
ss1.bsize == ss2.bsize && ss1.str.substrEq ss1.startPos ss2.str ss2.startPos ss1.bsize
instance hasBeq : BEq Substring := ⟨beq⟩
/-- Checks whether two substrings have the same position and content. -/
/--
Checks whether two substrings have the same position and content.
The two substrings do not need to have the same underlying string for this check to succeed.
-/
def sameAs (ss1 ss2 : Substring) : Bool :=
ss1.startPos == ss2.startPos && ss1 == ss2
/--
Returns the longest common prefix of two substrings.
The returned substring will use the same underlying string as `s`.
The returned substring uses the same underlying string as `s`.
-/
def commonPrefix (s t : Substring) : Substring :=
{ s with stopPos := loop s.startPos t.startPos }
@ -1371,7 +1561,8 @@ where
/--
Returns the longest common suffix of two substrings.
The returned substring will use the same underlying string as `s`.
The returned substring uses the same underlying string as `s`.
-/
def commonSuffix (s t : Substring) : Substring :=
{ s with startPos := loop s.stopPos t.stopPos }
@ -1391,7 +1582,11 @@ where
termination_by spos.byteIdx
/--
If `pre` is a prefix of `s`, i.e. `s = pre ++ t`, returns the remainder `t`.
If `pre` is a prefix of `s`, returns the remainder. Returns `none` otherwise.
The substring `pre` is a prefix of `s` if there exists a `t : Substring` such that
`s.toString = pre.toString ++ t.toString`. If so, the result is the substring of `s` without the
prefix.
-/
def dropPrefix? (s : Substring) (pre : Substring) : Option Substring :=
let t := s.commonPrefix pre
@ -1401,7 +1596,11 @@ def dropPrefix? (s : Substring) (pre : Substring) : Option Substring :=
none
/--
If `suff` is a suffix of `s`, i.e. `s = t ++ suff`, returns the remainder `t`.
If `suff` is a suffix of `s`, returns the remainder. Returns `none` otherwise.
The substring `suff` is a suffix of `s` if there exists a `t : Substring` such that
`s.toString = t.toString ++ suff.toString`. If so, the result the substring of `s` without the
suffix.
-/
def dropSuffix? (s : Substring) (suff : Substring) : Option Substring :=
let t := s.commonSuffix suff

View file

@ -1009,6 +1009,13 @@ private partial def splitNameLitAux (ss : Substring) (acc : List Substring) : Li
def splitNameLit (ss : Substring) : List Substring :=
splitNameLitAux ss [] |>.reverse
/--
Converts a substring to the Lean compiler's representation of names. The resulting name is
hierarchical, and the string is split at the dots (`'.'`).
`"a.b".toSubstring.toName` is the name `a.b`, not `«a.b»`. For the latter, use
`Name.mkSimple ∘ Substring.toString`.
-/
def _root_.Substring.toName (s : Substring) : Name :=
match splitNameLitAux s [] with
| [] => .anonymous

View file

@ -2709,13 +2709,20 @@ instance : DecidableEq String.Pos :=
| isFalse h => isFalse (fun he => String.Pos.noConfusion he fun he => absurd he h)
/--
A `Substring` is a view into some subslice of a `String`.
The actual string slicing is deferred because this would require copying the
string; here we only store a reference to the original string for
garbage collection purposes.
A region or slice of some underlying string.
A substring contains an string together with the start and end byte positions of a region of
interest. Actually extracting a substring requires copying and memory allocation, while many
substrings of the same underlying string may exist with very little overhead, and they are more
convenient than tracking the bounds by hand.
Using its constructor explicitly, it is possible to construct a `Substring` in which one or both of
the positions is invalid for the string. Many operations will return unexpected or confusing results
if the start and stop positions are not valid. Instead, it's better to use API functions that ensure
the validity of the positions in a substring to create and manipulate them.
-/
structure Substring where
/-- The underlying string to slice. -/
/-- The underlying string. -/
str : String
/-- The byte position of the start of the string slice. -/
startPos : String.Pos
@ -2725,7 +2732,9 @@ structure Substring where
instance : Inhabited Substring where
default := ⟨"", {}, {}⟩
/-- The byte length of the substring. -/
/--
The number of bytes used by the string's UTF-8 encoding.
-/
@[inline] def Substring.bsize : Substring → Nat
| ⟨_, b, e⟩ => e.byteIdx.sub b.byteIdx