/- Copyright (c) 2016 Microsoft Corporation. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. Author: Leonardo de Moura, Mario Carneiro -/ module prelude public import Init.Data.ByteArray.Basic public import Init.Data.String.PosRaw import Init.Data.ByteArray.Lemmas /-! # Preliminary developments for strings This file contains the material about strings which we can write down without the results in `Init.Data.String.Decode`, i.e., without knowing about the bijection between `String` and `List Char` given by UTF-8 decoding and encoding. Note that this file, despite being called `Defs`, contains quite a few lemmas. -/ public section @[simp] theorem List.utf8Encode_nil : List.utf8Encode [] = ByteArray.empty := by simp [utf8Encode] theorem List.utf8Encode_singleton {c : Char} : [c].utf8Encode = (String.utf8EncodeChar c).toByteArray := by simp [utf8Encode] @[simp] theorem List.utf8Encode_append {l l' : List Char} : (l ++ l').utf8Encode = l.utf8Encode ++ l'.utf8Encode := by simp [utf8Encode] theorem List.utf8Encode_cons {c : Char} {l : List Char} : (c :: l).utf8Encode = [c].utf8Encode ++ l.utf8Encode := by rw [← singleton_append, List.utf8Encode_append] @[simp] theorem String.utf8EncodeChar_ne_nil {c : Char} : String.utf8EncodeChar c ≠ [] := by fun_cases String.utf8EncodeChar with simp @[simp] theorem List.utf8Encode_eq_empty {l : List Char} : l.utf8Encode = ByteArray.empty ↔ l = [] := by simp [utf8Encode, ← List.eq_nil_iff_forall_not_mem] theorem ByteArray.isValidUTF8_utf8Encode {l : List Char} : IsValidUTF8 l.utf8Encode := .intro l rfl @[simp] theorem ByteArray.isValidUTF8_empty : IsValidUTF8 ByteArray.empty := .intro [] (by simp) theorem Char.isValidUTF8_toByteArray_utf8EncodeChar {c : Char} : ByteArray.IsValidUTF8 (String.utf8EncodeChar c).toByteArray := .intro [c] (by simp [List.utf8Encode_singleton]) theorem ByteArray.IsValidUTF8.append {b b' : ByteArray} (h : IsValidUTF8 b) (h' : IsValidUTF8 b') : IsValidUTF8 (b ++ b') := by rcases h with ⟨m, rfl⟩ rcases h' with ⟨m', rfl⟩ exact .intro (m ++ m') (by simp) /-- Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into the corresponding string. -/ @[inline, expose] def String.fromUTF8 (a : @& ByteArray) (h : a.IsValidUTF8) : String := .ofByteArray a h /-- Encodes a string in UTF-8 as an array of bytes. -/ @[extern "lean_string_to_utf8"] def String.toUTF8 (a : @& String) : ByteArray := a.bytes @[simp] theorem String.toUTF8_eq_bytes {s : String} : s.toUTF8 = s.bytes := (rfl) @[simp] theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl) /-- Appends two strings. Usually accessed via the `++` operator. The internal implementation will perform destructive updates if the string is not shared. Examples: * `"abc".append "def" = "abcdef"` * `"abc" ++ "def" = "abcdef"` * `"" ++ "" = ""` -/ @[extern "lean_string_append", expose] def String.append (s : String) (t : @& String) : String where bytes := s.bytes ++ t.bytes isValidUTF8 := s.isValidUTF8.append t.isValidUTF8 instance : Append String where append s t := s.append t @[simp] theorem String.bytes_append {s t : String} : (s ++ t).bytes = s.bytes ++ t.bytes := (rfl) theorem String.bytes_inj {s t : String} : s.bytes = t.bytes ↔ s = t := by refine ⟨fun h => ?_, (· ▸ rfl)⟩ rcases s with ⟨s⟩ rcases t with ⟨t⟩ subst h rfl @[simp] theorem List.bytes_asString {l : List Char} : l.asString.bytes = l.utf8Encode := by simp [List.asString, String.mk] theorem String.exists_eq_asString (s : String) : ∃ l : List Char, s = l.asString := by rcases s with ⟨_, ⟨l, rfl⟩⟩ refine ⟨l, by simp [← String.bytes_inj]⟩ @[simp] theorem String.utf8ByteSize_empty : "".utf8ByteSize = 0 := (rfl) @[simp] theorem String.utf8ByteSize_append {s t : String} : (s ++ t).utf8ByteSize = s.utf8ByteSize + t.utf8ByteSize := by simp [utf8ByteSize] @[simp] theorem String.size_bytes {s : String} : s.bytes.size = s.utf8ByteSize := rfl @[simp] theorem String.bytes_push {s : String} {c : Char} : (s.push c).bytes = s.bytes ++ [c].utf8Encode := by simp [push] namespace String @[deprecated rawEndPos (since := "2025-10-20")] def endPos (s : String) : String.Pos.Raw := s.rawEndPos /-- The start position of the string, as a `String.Pos.Raw.` -/ def rawStartPos (_s : String) : String.Pos.Raw := 0 @[simp] theorem rawStartPos_eq {s : String} : s.rawStartPos = 0 := (rfl) @[simp] theorem byteIdx_rawEndPos {s : String} : s.rawEndPos.byteIdx = s.utf8ByteSize := rfl @[deprecated byteIdx_rawEndPos (since := "2025-10-20")] theorem byteIdx_endPos {s : String} : s.rawEndPos.byteIdx = s.utf8ByteSize := rfl @[simp] theorem utf8ByteSize_ofByteArray {b : ByteArray} {h} : (String.ofByteArray b h).utf8ByteSize = b.size := rfl @[simp] theorem bytes_singleton {c : Char} : (String.singleton c).bytes = [c].utf8Encode := by simp [singleton] theorem singleton_eq_asString {c : Char} : String.singleton c = [c].asString := by simp [← String.bytes_inj] @[simp] theorem append_singleton {s : String} {c : Char} : s ++ singleton c = s.push c := by simp [← bytes_inj] @[simp] theorem append_left_inj {s₁ s₂ : String} (t : String) : s₁ ++ t = s₂ ++ t ↔ s₁ = s₂ := by simp [← bytes_inj] theorem append_assoc {s₁ s₂ s₃ : String} : s₁ ++ s₂ ++ s₃ = s₁ ++ (s₂ ++ s₃) := by simp [← bytes_inj, ByteArray.append_assoc] @[simp] theorem utf8ByteSize_eq_zero_iff {s : String} : s.utf8ByteSize = 0 ↔ s = "" := by refine ⟨fun h => ?_, fun h => h ▸ utf8ByteSize_empty⟩ simpa [← bytes_inj, ← ByteArray.size_eq_zero_iff] using h theorem rawEndPos_eq_zero_iff {b : String} : b.rawEndPos = 0 ↔ b = "" := by simp @[deprecated rawEndPos_eq_zero_iff (since := "2025-10-20")] theorem endPos_eq_zero_iff {b : String} : b.rawEndPos = 0 ↔ b = "" := rawEndPos_eq_zero_iff /-- Adds multiple repetitions of a character to the end of a string. Returns `s`, with `n` repetitions of `c` at the end. Internally, the implementation repeatedly calls `String.push`, so the string is modified in-place if there is a unique reference to it. Examples: * `"indeed".pushn '!' 2 = "indeed!!"` * `"indeed".pushn '!' 0 = "indeed"` * `"".pushn ' ' 4 = " "` -/ @[inline] def pushn (s : String) (c : Char) (n : Nat) : String := n.repeat (fun s => s.push c) s theorem pushn_eq_repeat_push {s : String} {c : Char} {n : Nat} : s.pushn c n = n.repeat (fun s => s.push c) s := (rfl) @[export lean_string_pushn] def Internal.pushnImpl (s : String) (c : Char) (n : Nat) : String := String.pushn s c n /-- Checks whether a string is empty. Empty strings are equal to `""` and have length and end position `0`. Examples: * `"".isEmpty = true` * `"empty".isEmpty = false` * `" ".isEmpty = false` -/ @[inline] def isEmpty (s : String) : Bool := s.utf8ByteSize == 0 @[export lean_string_isempty] def Internal.isEmptyImpl (s : String) : Bool := String.isEmpty s /-- Appends all the strings in a list of strings, in order. Use `String.intercalate` to place a separator string between the strings in a list. Examples: * `String.join ["gr", "ee", "n"] = "green"` * `String.join ["b", "", "l", "", "ue"] = "blue"` * `String.join [] = ""` -/ @[inline] def join (l : List String) : String := l.foldl (fun r s => r ++ s) "" /-- Appends the strings in a list of strings, placing the separator `s` between each pair. Examples: * `", ".intercalate ["red", "green", "blue"] = "red, green, blue"` * `" and ".intercalate ["tea", "coffee"] = "tea and coffee"` * `" | ".intercalate ["M", "", "N"] = "M | | N"` -/ def intercalate (s : String) : List String → String | [] => "" | a :: as => go a s as where go (acc : String) (s : String) : List String → String | a :: as => go (acc ++ s ++ a) s as | [] => acc @[export lean_string_intercalate] def Internal.intercalateImpl (s : String) : List String → String := String.intercalate s /-- Predicate for validity of positions inside a `String`. There are multiple equivalent definitions for validity. We say that a position is valid if the string obtained by taking all of the bytes up to, but excluding, the given position, is valid UTF-8; see `Pos.isValid_iff_isValidUTF8_extract_zero`. Similarly, a position is valid if the string obtained by taking all of the bytes starting at the given position is valid UTF-8; see `Pos.isValid_iff_isValidUTF8_extract_utf8ByteSize`. An equivalent condition is that the position is the length of the UTF-8 encoding of some prefix of the characters of the string; see `Pos.isValid_iff_exists_append` and `Pos.isValid_iff_exists_take_data`. Another equivalent condition that can be checked efficiently is that the position is either the end position or strictly smaller than the end position and the byte at the position satisfies `UInt8.IsUTF8FirstByte`; see `Pos.isValid_iff_isUTF8FirstByte`. Examples: * `String.Pos.IsValid "abc" ⟨0⟩` * `String.Pos.IsValid "abc" ⟨1⟩` * `String.Pos.IsValid "abc" ⟨3⟩` * `¬ String.Pos.IsValid "abc" ⟨4⟩` * `String.Pos.IsValid "𝒫(A)" ⟨0⟩` * `¬ String.Pos.IsValid "𝒫(A)" ⟨1⟩` * `¬ String.Pos.IsValid "𝒫(A)" ⟨2⟩` * `¬ String.Pos.IsValid "𝒫(A)" ⟨3⟩` * `String.Pos.IsValid "𝒫(A)" ⟨4⟩` -/ structure Pos.Raw.IsValid (s : String) (off : String.Pos.Raw) : Prop where private mk :: le_rawEndPos : off ≤ s.rawEndPos isValidUTF8_extract_zero : (s.bytes.extract 0 off.byteIdx).IsValidUTF8 theorem Pos.Raw.IsValid.le_utf8ByteSize {s : String} {off : String.Pos.Raw} (h : off.IsValid s) : off.byteIdx ≤ s.utf8ByteSize := by simpa [Pos.Raw.le_iff] using h.le_rawEndPos theorem Pos.Raw.isValid_iff_isValidUTF8_extract_zero {s : String} {p : Pos.Raw} : p.IsValid s ↔ p ≤ s.rawEndPos ∧ (s.bytes.extract 0 p.byteIdx).IsValidUTF8 := ⟨fun ⟨h₁, h₂⟩ => ⟨h₁, h₂⟩, fun ⟨h₁, h₂⟩ => ⟨h₁, h₂⟩⟩ @[deprecated le_rawEndPos (since := "2025-10-20")] theorem Pos.Raw.IsValid.le_endPos {s : String} {off : String.Pos.Raw} (h : off.IsValid s) : off ≤ s.rawEndPos := h.le_rawEndPos @[simp] theorem Pos.Raw.isValid_zero {s : String} : (0 : Pos.Raw).IsValid s where le_rawEndPos := by simp [Pos.Raw.le_iff] isValidUTF8_extract_zero := by simp @[simp] theorem Pos.Raw.isValid_rawEndPos {s : String} : s.rawEndPos.IsValid s where le_rawEndPos := by simp isValidUTF8_extract_zero := by simp [← size_bytes, s.isValidUTF8] @[simp] theorem Pos.Raw.isValid_empty_iff {p : Pos.Raw} : p.IsValid "" ↔ p = 0 := by refine ⟨?_, ?_⟩ · rintro ⟨h₁, h₂⟩ simp only [le_iff, byteIdx_rawEndPos, utf8ByteSize_empty, Nat.le_zero_eq] at h₁ ext omega · rintro rfl simp /-- A `ValidPos s` is a byte offset in `s` together with a proof that this position is at a UTF-8 character boundary. -/ @[ext] structure ValidPos (s : String) where /-- The underlying byte offset of the `ValidPos`. -/ offset : Pos.Raw /-- The proof that `offset` is valid for the string `s`. -/ isValid : offset.IsValid s deriving @[expose] DecidableEq /-- The start position of `s`, as an `s.ValidPos`. -/ @[inline, expose] def startValidPos (s : String) : s.ValidPos where offset := 0 isValid := by simp @[simp] theorem offset_startValidPos {s : String} : s.startValidPos.offset = 0 := (rfl) instance {s : String} : Inhabited s.ValidPos where default := s.startValidPos /-- The past-the-end position of `s`, as an `s.ValidPos`. -/ @[inline, expose] def endValidPos (s : String) : s.ValidPos where offset := s.rawEndPos isValid := by simp @[simp] theorem offset_endValidPos {s : String} : s.endValidPos.offset = s.rawEndPos := (rfl) instance {s : String} : LE s.ValidPos where le l r := l.offset ≤ r.offset instance {s : String} : LT s.ValidPos where lt l r := l.offset < r.offset theorem ValidPos.le_iff {s : String} {l r : s.ValidPos} : l ≤ r ↔ l.offset ≤ r.offset := Iff.rfl theorem ValidPos.lt_iff {s : String} {l r : s.ValidPos} : l < r ↔ l.offset < r.offset := Iff.rfl instance {s : String} (l r : s.ValidPos) : Decidable (l ≤ r) := decidable_of_iff' _ ValidPos.le_iff instance {s : String} (l r : s.ValidPos) : Decidable (l < r) := decidable_of_iff' _ ValidPos.lt_iff /-- A region or slice of some underlying string. A slice consists of a string together with the start and end byte positions of a region of interest. Actually extracting a substring requires copying and memory allocation, while many slices of the same underlying string may exist with very little overhead. While this could be achieved by tracking the bounds by hand, the slice API is much more convenient. `String.Slice` bundles proofs to ensure that the start and end positions always delineate a valid string. For this reason, it should be preferred over `Substring`. -/ structure Slice where /-- The underlying strings. -/ str : String /-- The byte position of the start of the string slice. -/ startInclusive : str.ValidPos /-- The byte position of the end of the string slice. -/ endExclusive : str.ValidPos /-- The slice is not degenerate (but it may be empty). -/ startInclusive_le_endExclusive : startInclusive ≤ endExclusive instance : Inhabited Slice where default := ⟨"", "".startValidPos, "".startValidPos, by simp [ValidPos.le_iff]⟩ /-- Returns a slice that contains the entire string. -/ @[inline, expose] -- expose for the defeq `s.toSlice.str = s`. def toSlice (s : String) : Slice where str := s startInclusive := s.startValidPos endExclusive := s.endValidPos startInclusive_le_endExclusive := by simp [ValidPos.le_iff, Pos.Raw.le_iff] /-- The number of bytes of the UTF-8 encoding of the string slice. -/ @[expose] def Slice.utf8ByteSize (s : Slice) : Nat := s.startInclusive.offset.byteDistance s.endExclusive.offset theorem Slice.utf8ByteSize_eq {s : Slice} : s.utf8ByteSize = s.endExclusive.offset.byteIdx - s.startInclusive.offset.byteIdx := (rfl) instance : HAdd Pos.Raw Slice Pos.Raw where hAdd p s := { byteIdx := p.byteIdx + s.utf8ByteSize } instance : HAdd Slice Pos.Raw Pos.Raw where hAdd s p := { byteIdx := s.utf8ByteSize + p.byteIdx } instance : HSub Pos.Raw Slice Pos.Raw where hSub p s := { byteIdx := p.byteIdx - s.utf8ByteSize } @[simp] theorem Pos.Raw.byteIdx_add_slide {p : Pos.Raw} {s : Slice} : (p + s).byteIdx = p.byteIdx + s.utf8ByteSize := rfl @[simp] theorem Pos.Raw.byteIdx_slice_add {s : Slice} {p : Pos.Raw} : (s + p).byteIdx = s.utf8ByteSize + p.byteIdx := rfl @[simp] theorem Pos.Raw.byteIdx_sub_slice {p : Pos.Raw} {s : Slice} : (p - s).byteIdx = p.byteIdx - s.utf8ByteSize := rfl /-- The end position of a slice, as a `Pos.Raw`. -/ @[expose] def Slice.rawEndPos (s : Slice) : Pos.Raw where byteIdx := s.utf8ByteSize @[simp] theorem Slice.byteIdx_rawEndPos {s : Slice} : s.rawEndPos.byteIdx = s.utf8ByteSize := (rfl) /-- Criterion for validity of positions in string slices. -/ structure Pos.Raw.IsValidForSlice (s : Slice) (p : Pos.Raw) : Prop where le_rawEndPos : p ≤ s.rawEndPos isValid_offsetBy : (p.offsetBy s.startInclusive.offset).IsValid s.str theorem Pos.Raw.IsValidForSlice.le_utf8ByteSize {s : Slice} {p : Pos.Raw} (h : p.IsValidForSlice s) : p.byteIdx ≤ s.utf8ByteSize := by simpa [Pos.Raw.le_iff] using h.le_rawEndPos /-- Accesses the indicated byte in the UTF-8 encoding of a string slice. At runtime, this function is implemented by efficient, constant-time code. -/ @[inline, expose] def Slice.getUTF8Byte (s : Slice) (p : Pos.Raw) (h : p < s.rawEndPos) : UInt8 := s.str.getUTF8Byte (p.offsetBy s.startInclusive.offset) (by have := s.endExclusive.isValid.le_rawEndPos simp only [Pos.Raw.lt_iff, byteIdx_rawEndPos, utf8ByteSize_eq, Pos.Raw.le_iff, byteIdx_rawEndPos, Pos.Raw.byteIdx_offsetBy] at * omega) /-- Accesses the indicated byte in the UTF-8 encoding of the string slice, or panics if the position is out-of-bounds. -/ @[expose] def Slice.getUTF8Byte! (s : Slice) (p : String.Pos.Raw) : UInt8 := if h : p < s.rawEndPos then s.getUTF8Byte p h else panic! "String slice access is out of bounds." /-- A `Slice.Pos s` is a byte offset in `s` together with a proof that this position is at a UTF-8 character boundary. -/ @[ext] structure Slice.Pos (s : Slice) where /-- The underlying byte offset of the `Slice.Pos`. -/ offset : String.Pos.Raw /-- The proof that `offset` is valid for the string slice `s`. -/ isValidForSlice : offset.IsValidForSlice s deriving @[expose] DecidableEq /-- The start position of `s`, as an `s.Pos`. -/ @[inline, expose] def Slice.startPos (s : Slice) : s.Pos where offset := 0 isValidForSlice := ⟨by simp [Pos.Raw.le_iff], by simpa using s.startInclusive.isValid⟩ @[simp] theorem Slice.offset_startPos {s : Slice} : s.startPos.offset = 0 := (rfl) instance {s : Slice} : Inhabited s.Pos where default := s.startPos @[simp] theorem Slice.offset_startInclusive_add_self {s : Slice} : s.startInclusive.offset + s = s.endExclusive.offset := by have := s.startInclusive_le_endExclusive simp_all [String.Pos.Raw.ext_iff, ValidPos.le_iff, Pos.Raw.le_iff, utf8ByteSize_eq] @[simp] theorem Pos.Raw.offsetBy_rawEndPos_left {p : Pos.Raw} {s : String} : s.rawEndPos.offsetBy p = p + s := by simp [Pos.Raw.ext_iff] @[simp] theorem Pos.Raw.offsetBy_rawEndPos_right {p : Pos.Raw} {s : String} : p.offsetBy s.rawEndPos = s + p := by simp [Pos.Raw.ext_iff] @[simp] theorem Pos.Raw.offsetBy_sliceRawEndPos_left {p : Pos.Raw} {s : Slice} : s.rawEndPos.offsetBy p = p + s := by simp [Pos.Raw.ext_iff] @[simp] theorem Pos.Raw.offsetBy_sliceRawEndPos_right {p : Pos.Raw} {s : Slice} : p.offsetBy s.rawEndPos = s + p := by simp [Pos.Raw.ext_iff] /-- The past-the-end position of `s`, as an `s.Pos`. -/ @[inline, expose] def Slice.endPos (s : Slice) : s.Pos where offset := s.rawEndPos isValidForSlice := ⟨by simp, by simpa using s.endExclusive.isValid⟩ @[simp] theorem Slice.offset_endPos {s : Slice} : s.endPos.offset = s.rawEndPos := (rfl) instance {s : Slice} : LE s.Pos where le l r := l.offset ≤ r.offset instance {s : Slice} : LT s.Pos where lt l r := l.offset < r.offset theorem Slice.Pos.le_iff {s : Slice} {l r : s.Pos} : l ≤ r ↔ l.offset ≤ r.offset := Iff.rfl theorem Slice.Pos.lt_iff {s : Slice} {l r : s.Pos} : l < r ↔ l.offset < r.offset := Iff.rfl instance {s : Slice} (l r : s.Pos) : Decidable (l ≤ r) := decidable_of_iff' _ Slice.Pos.le_iff instance {s : Slice} (l r : s.Pos) : Decidable (l < r) := decidable_of_iff' _ Slice.Pos.lt_iff /-- Returns the byte at a position in a slice that is not the end position. -/ @[inline, expose] def Slice.Pos.byte {s : Slice} (pos : s.Pos) (h : pos ≠ s.endPos) : UInt8 := s.getUTF8Byte pos.offset (by have := pos.isValidForSlice.le_rawEndPos simp_all [Pos.ext_iff, String.Pos.Raw.ext_iff, Pos.Raw.le_iff, Pos.Raw.lt_iff] omega) @[simp] theorem default_eq : default = "" := rfl @[simp] theorem mk_eq_asString (s : List Char) : String.mk s = List.asString s := rfl theorem push_eq_append (c : Char) : String.push s c = s ++ singleton c := by simp end String