diff --git a/src/Init/Core.lean b/src/Init/Core.lean index abeca11449..2d9fbff8ee 100644 --- a/src/Init/Core.lean +++ b/src/Init/Core.lean @@ -1580,6 +1580,7 @@ instance {p q : Prop} [d : Decidable (p ↔ q)] : Decidable (p = q) := gen_injective_theorems% Array gen_injective_theorems% BitVec +gen_injective_theorems% ByteArray gen_injective_theorems% Char gen_injective_theorems% DoResultBC gen_injective_theorems% DoResultPR diff --git a/src/Init/Data/BitVec/Lemmas.lean b/src/Init/Data/BitVec/Lemmas.lean index 11898eea0a..eaeecfa675 100644 --- a/src/Init/Data/BitVec/Lemmas.lean +++ b/src/Init/Data/BitVec/Lemmas.lean @@ -1100,6 +1100,10 @@ theorem toInt_setWidth' {m n : Nat} (p : m ≤ n) {x : BitVec m} : rw [setWidth'_eq, toFin_setWidth, Fin.val_ofNat, Fin.coe_castLE, val_toFin, Nat.mod_eq_of_lt (by apply BitVec.toNat_lt_twoPow_of_le p)] +theorem toNat_setWidth_of_le {w w' : Nat} {b : BitVec w} (h : w ≤ w') : (b.setWidth w').toNat = b.toNat := by + rw [BitVec.toNat_setWidth, Nat.mod_eq_of_lt] + exact BitVec.toNat_lt_twoPow_of_le h + /-! ## extractLsb -/ @[simp, grind =] @@ -1287,6 +1291,17 @@ theorem extractLsb'_eq_zero {x : BitVec w} {start : Nat} : ext i hi omega +theorem extractLsb'_setWidth_of_le {b : BitVec w} {start len w' : Nat} (h : start + len ≤ w') : + (b.setWidth w').extractLsb' start len = b.extractLsb' start len := by + ext i h_i + simp + omega + +theorem setWidth_extractLsb'_of_le {c : BitVec w} (h : len₁ ≤ len₂) : + (c.extractLsb' start len₂).setWidth len₁ = c.extractLsb' start len₁ := by + ext i hi + simp [show i < len₂ by omega] + /-! ### allOnes -/ @[simp, grind =] theorem toNat_allOnes : (allOnes v).toNat = 2^v - 1 := by @@ -1530,6 +1545,12 @@ theorem extractLsb_and {x : BitVec w} {hi lo : Nat} : @[simp, grind =] theorem ofNat_and {x y : Nat} : BitVec.ofNat w (x &&& y) = BitVec.ofNat w x &&& BitVec.ofNat w y := eq_of_toNat_eq (by simp [Nat.and_mod_two_pow]) +theorem and_or_distrib_left {x y z : BitVec w} : x &&& (y ||| z) = (x &&& y) ||| (x &&& z) := + BitVec.eq_of_getElem_eq (by simp [Bool.and_or_distrib_left]) + +theorem and_or_distrib_right {x y z : BitVec w} : (x ||| y) &&& z = (x &&& z) ||| (y &&& z) := + BitVec.eq_of_getElem_eq (by simp [Bool.and_or_distrib_right]) + /-! ### xor -/ @[simp, grind =] theorem toNat_xor (x y : BitVec v) : @@ -2180,6 +2201,10 @@ theorem msb_ushiftRight {x : BitVec w} {n : Nat} : have := lt_of_getLsbD ha omega +theorem setWidth_ushiftRight_eq_extractLsb {b : BitVec w} : (b >>> w').setWidth w'' = b.extractLsb' w' w'' := by + ext i hi + simp + /-! ### ushiftRight reductions from BitVec to Nat -/ @[simp, grind =] @@ -2970,10 +2995,9 @@ theorem shiftLeft_eq_concat_of_lt {x : BitVec w} {n : Nat} (hn : n < w) : /-- Combine adjacent `extractLsb'` operations into a single `extractLsb'`. -/ theorem extractLsb'_append_extractLsb'_eq_extractLsb' {x : BitVec w} (h : start₂ = start₁ + len₁) : ((x.extractLsb' start₂ len₂) ++ (x.extractLsb' start₁ len₁)) = - (x.extractLsb' start₁ (len₁ + len₂)).cast (by omega) := by + x.extractLsb' start₁ (len₂ + len₁) := by ext i h - simp only [getElem_append, getElem_extractLsb', dite_eq_ite, getElem_cast, ite_eq_left_iff, - Nat.not_lt] + simp only [getElem_append, getElem_extractLsb', dite_eq_ite, ite_eq_left_iff, Nat.not_lt] intro hi congr 1 omega @@ -3085,6 +3109,51 @@ theorem extractLsb'_append_eq_of_le {v w} {xhi : BitVec v} {xlo : BitVec w} extractLsb' start len (xhi ++ xlo) = extractLsb' (start - w) len xhi := by simp [extractLsb'_append_eq_ite, show ¬ start < w by omega] +theorem extractLsb'_append_eq_left {a : BitVec w} {b : BitVec w'} : (a ++ b).extractLsb' w' w = a := by + simp [BitVec.extractLsb'_append_eq_of_le] + +theorem extractLsb'_append_eq_right {a : BitVec w} {b : BitVec w'} : (a ++ b).extractLsb' 0 w' = b := by + simp [BitVec.extractLsb'_append_eq_of_add_le] + +theorem setWidth_append_eq_right {a : BitVec w} {b : BitVec w'} : (a ++ b).setWidth w' = b := by + ext i hi + simp [getLsbD_append, hi] + +theorem append_left_inj {s₁ s₂ : BitVec w} (t : BitVec w') : s₁ ++ t = s₂ ++ t ↔ s₁ = s₂ := by + refine ⟨fun h => ?_, fun h => h ▸ rfl⟩ + ext i hi + simpa [getElem_append, dif_neg] using congrArg (·[i + w']'(by omega)) h + +theorem append_right_inj (s : BitVec w) {t₁ t₂ : BitVec w'} : s ++ t₁ = s ++ t₂ ↔ t₁ = t₂ := by + refine ⟨fun h => ?_, fun h => h ▸ rfl⟩ + ext i hi + simpa [getElem_append, hi] using congrArg (·[i]) h + +theorem setWidth_append_eq_shiftLeft_setWidth_or {b : BitVec w} {b' : BitVec w'} : + (b ++ b').setWidth w'' = (b.setWidth w'' <<< w') ||| b'.setWidth w'' := by + ext i hi + simp only [getElem_setWidth, getElem_or, getElem_shiftLeft] + rw [getLsbD_append] + split <;> simp_all + +theorem setWidth_append_append_eq_shiftLeft_setWidth_or {b : BitVec w} {b' : BitVec w'} {b'' : BitVec w''} : + (b ++ b' ++ b'').setWidth w''' = (b.setWidth w''' <<< (w' + w'')) ||| (b'.setWidth w''' <<< w'') ||| b''.setWidth w''' := by + rw [BitVec.setWidth_append_eq_shiftLeft_setWidth_or, + BitVec.setWidth_append_eq_shiftLeft_setWidth_or, + BitVec.shiftLeft_or_distrib, BitVec.shiftLeft_add] + +theorem setWidth_append_append_append_eq_shiftLeft_setWidth_or {b : BitVec w} {b' : BitVec w'} {b'' : BitVec w''} {b''' : BitVec w'''} : + (b ++ b' ++ b'' ++ b''').setWidth w'''' = (b.setWidth w'''' <<< (w' + w'' + w''')) ||| (b'.setWidth w'''' <<< (w'' + w''')) ||| + (b''.setWidth w'''' <<< w''') ||| b'''.setWidth w'''' := by + simp only [BitVec.setWidth_append_eq_shiftLeft_setWidth_or, BitVec.shiftLeft_or_distrib, BitVec.shiftLeft_add] + +theorem and_setWidth_allOnes (w' w : Nat) (b : BitVec (w' + w)) : + b &&& (BitVec.allOnes w).setWidth (w' + w) = 0#w' ++ b.setWidth w := by + ext i hi + simp only [getElem_and, getElem_setWidth, getLsbD_allOnes] + rw [BitVec.getElem_append] + split <;> simp_all + /-! ### rev -/ @[grind =] @@ -4041,6 +4110,9 @@ instance instLawfulOrderLT : LawfulOrderLT (BitVec n) := by apply LawfulOrderLT.of_le simpa using fun _ _ => BitVec.lt_asymm +theorem length_pos_of_lt {b b' : BitVec w} (h : b < b') : 0 < w := + length_pos_of_ne (BitVec.ne_of_lt h) + protected theorem umod_lt (x : BitVec n) {y : BitVec n} : 0 < y → x % y < y := by simp only [ofNat_eq_ofNat, lt_def, toNat_ofNat, Nat.zero_mod] apply Nat.mod_lt @@ -4112,6 +4184,14 @@ theorem lt_of_msb_false_of_msb_true {x y : BitVec w} (hx : x.msb = false) (hy : simp omega +theorem lt_add_one {b : BitVec w} (h : b ≠ allOnes w) : b < b + 1 := by + simp only [ne_eq, ← toNat_inj, toNat_allOnes] at h + simp only [BitVec.lt_def, ofNat_eq_ofNat, toNat_add, toNat_ofNat, Nat.add_mod_mod] + rw [Nat.mod_eq_of_lt] + · exact Nat.lt_add_one _ + · have := b.toNat_lt_twoPow_of_le (Nat.le_refl _) + omega + /-! ### udiv -/ theorem udiv_def {x y : BitVec n} : x / y = BitVec.ofNat n (x.toNat / y.toNat) := by @@ -5257,7 +5337,7 @@ theorem replicate_succ' {x : BitVec w} : (replicate n x ++ x).cast (by rw [Nat.mul_succ]) := by simp [replicate_append_self] -theorem BitVec.setWidth_add_eq_mod {x y : BitVec w} : BitVec.setWidth i (x + y) = (BitVec.setWidth i x + BitVec.setWidth i y) % (BitVec.twoPow i w) := by +theorem setWidth_add_eq_mod {x y : BitVec w} : BitVec.setWidth i (x + y) = (BitVec.setWidth i x + BitVec.setWidth i y) % (BitVec.twoPow i w) := by apply BitVec.eq_of_toNat_eq rw [toNat_setWidth] simp only [toNat_setWidth, toNat_add, toNat_umod, Nat.add_mod_mod, Nat.mod_add_mod, toNat_twoPow] @@ -5266,6 +5346,14 @@ theorem BitVec.setWidth_add_eq_mod {x y : BitVec w} : BitVec.setWidth i (x + y) · have hk : 2 ^ w < 2 ^ i := Nat.pow_lt_pow_of_lt (by decide) (Nat.lt_of_not_le h) rw [Nat.mod_eq_of_lt hk, Nat.mod_mod_eq_mod_mod_of_dvd (Nat.pow_dvd_pow _ (Nat.le_of_not_le h))] +theorem setWidth_setWidth_eq_self {a : BitVec w} {w' : Nat} (h : a < BitVec.twoPow w w') : (a.setWidth w').setWidth w = a := by + by_cases hw : w' < w + · simp only [toNat_eq, toNat_setWidth] + rw [Nat.mod_mod_of_dvd' (Nat.pow_dvd_pow _ (Nat.le_of_lt hw)), Nat.mod_eq_of_lt] + rwa [BitVec.lt_def, BitVec.toNat_twoPow_of_lt hw] at h + · rw [BitVec.lt_def, BitVec.toNat_twoPow_of_le (by omega)] at h + simp at h + /-! ### intMin -/ @[grind =] @@ -5928,6 +6016,12 @@ theorem getElem_eq_true_of_lt_of_le {x : BitVec w} (hk' : k < w) (hlt: x.toNat < omega · simp [show w ≤ k + k' by omega] at hk' +theorem not_lt_iff {b : BitVec w} : ~~~b < b ↔ 0 < w ∧ b.msb = true := by + refine ⟨fun h => ?_, fun ⟨hw, hb⟩ => ?_⟩ + · have := length_pos_of_lt h + exact ⟨this, by rwa [← ult_iff_lt, ult_eq_msb_of_msb_neq (by simp_all)] at h⟩ + · rwa [← ult_iff_lt, ult_eq_msb_of_msb_neq (by simp_all)] + /-! ### Count leading zeros -/ theorem clzAuxRec_zero (x : BitVec w) : diff --git a/src/Init/Data/ByteArray.lean b/src/Init/Data/ByteArray.lean index 4f1cf8249b..a9a32612cd 100644 --- a/src/Init/Data/ByteArray.lean +++ b/src/Init/Data/ByteArray.lean @@ -7,6 +7,8 @@ module prelude public import Init.Data.ByteArray.Basic +public import Init.Data.ByteArray.Bootstrap public import Init.Data.ByteArray.Extra +public import Init.Data.ByteArray.Lemmas public section diff --git a/src/Init/Data/ByteArray/Basic.lean b/src/Init/Data/ByteArray/Basic.lean index 41654438b8..0faadabd3e 100644 --- a/src/Init/Data/ByteArray/Basic.lean +++ b/src/Init/Data/ByteArray/Basic.lean @@ -11,16 +11,11 @@ public import Init.Data.UInt.Basic public import Init.Data.UInt.BasicAux import all Init.Data.UInt.BasicAux public import Init.Data.Option.Basic +public import Init.Data.Array.Extract @[expose] public section universe u -structure ByteArray where - data : Array UInt8 - -attribute [extern "lean_byte_array_mk"] ByteArray.mk -attribute [extern "lean_byte_array_data"] ByteArray.data - namespace ByteArray deriving instance BEq for ByteArray @@ -30,29 +25,15 @@ attribute [ext] ByteArray instance : DecidableEq ByteArray := fun _ _ => decidable_of_decidable_of_iff ByteArray.ext_iff.symm -@[extern "lean_mk_empty_byte_array"] -def emptyWithCapacity (c : @& Nat) : ByteArray := - { data := #[] } - @[deprecated emptyWithCapacity (since := "2025-03-12")] abbrev mkEmpty := emptyWithCapacity -def empty : ByteArray := emptyWithCapacity 0 - instance : Inhabited ByteArray where default := empty instance : EmptyCollection ByteArray where emptyCollection := ByteArray.empty -@[extern "lean_byte_array_push"] -def push : ByteArray → UInt8 → ByteArray - | ⟨bs⟩, b => ⟨bs.push b⟩ - -@[extern "lean_byte_array_size"] -def size : (@& ByteArray) → Nat - | ⟨bs⟩ => bs.size - @[extern "lean_sarray_size", simp] def usize (a : @& ByteArray) : USize := a.size.toUSize @@ -106,11 +87,31 @@ def copySlice (src : @& ByteArray) (srcOff : Nat) (dest : ByteArray) (destOff le def extract (a : ByteArray) (b e : Nat) : ByteArray := a.copySlice b empty 0 (e - b) -protected def append (a : ByteArray) (b : ByteArray) : ByteArray := +protected def fastAppend (a : ByteArray) (b : ByteArray) : ByteArray := -- we assume that `append`s may be repeated, so use asymptotic growing; use `copySlice` directly to customize b.copySlice 0 a a.size b.size false -instance : Append ByteArray := ⟨ByteArray.append⟩ +@[simp] +theorem size_data {a : ByteArray} : + a.data.size = a.size := rfl + +@[csimp] +theorem append_eq_fastAppend : @ByteArray.append = @ByteArray.fastAppend := by + funext a b + ext1 + apply Array.ext' + simp [ByteArray.fastAppend, copySlice, ← size_data, - Array.append_assoc] + +-- Needs to come after the `csimp` lemma +instance : Append ByteArray where + append := ByteArray.append + +@[simp] +theorem append_eq {a b : ByteArray} : a.append b = a ++ b := rfl + +@[simp] +theorem fastAppend_eq {a b : ByteArray} : a.fastAppend b = a ++ b := by + simp [← append_eq_fastAppend] def toList (bs : ByteArray) : List UInt8 := let rec loop (i : Nat) (r : List UInt8) := @@ -350,13 +351,4 @@ def prevn : Iterator → Nat → Iterator end Iterator end ByteArray -/-- -Converts a list of bytes into a `ByteArray`. --/ -def List.toByteArray (bs : List UInt8) : ByteArray := - let rec loop - | [], r => r - | b::bs, r => loop bs (r.push b) - loop bs ByteArray.empty - instance : ToString ByteArray := ⟨fun bs => bs.toList.toString⟩ diff --git a/src/Init/Data/ByteArray/Bootstrap.lean b/src/Init/Data/ByteArray/Bootstrap.lean new file mode 100644 index 0000000000..4e9a5df2ea --- /dev/null +++ b/src/Init/Data/ByteArray/Bootstrap.lean @@ -0,0 +1,53 @@ +/- +Copyright (c) 2025 Lean FRO, LLC. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Author: Markus Himmel +-/ +module + +prelude +public import Init.Prelude +public import Init.Data.List.Basic + +public section + +namespace ByteArray + +@[simp] +theorem data_push {a : ByteArray} {b : UInt8} : (a.push b).data = a.data.push b := rfl + +@[expose] +protected def append (a b : ByteArray) : ByteArray := + ⟨⟨a.data.toList ++ b.data.toList⟩⟩ + +@[simp] +theorem toList_data_append' {a b : ByteArray} : + (a.append b).data.toList = a.data.toList ++ b.data.toList := by + have ⟨⟨a⟩⟩ := a + have ⟨⟨b⟩⟩ := b + rfl + +theorem ext : {x y : ByteArray} → x.data = y.data → x = y + | ⟨_⟩, ⟨_⟩, rfl => rfl + +end ByteArray + +@[simp] +theorem List.toList_data_toByteArray {l : List UInt8} : + l.toByteArray.data.toList = l := by + rw [List.toByteArray] + suffices ∀ a b, (List.toByteArray.loop a b).data.toList = b.data.toList ++ a by + simpa using this l ByteArray.empty + intro a b + fun_induction List.toByteArray.loop a b with simp_all [toList_push] +where + toList_push {xs : Array UInt8} {x : UInt8} : (xs.push x).toList = xs.toList ++ [x] := by + have ⟨xs⟩ := xs + simp [Array.push, List.concat_eq_append] + +theorem List.toByteArray_append' {l l' : List UInt8} : + (l ++ l').toByteArray = l.toByteArray.append l'.toByteArray := + ByteArray.ext (ext (by simp)) +where + ext : {x y : Array UInt8} → x.toList = y.toList → x = y + | ⟨_⟩, ⟨_⟩, rfl => rfl diff --git a/src/Init/Data/ByteArray/Lemmas.lean b/src/Init/Data/ByteArray/Lemmas.lean new file mode 100644 index 0000000000..4026d99b58 --- /dev/null +++ b/src/Init/Data/ByteArray/Lemmas.lean @@ -0,0 +1,252 @@ +/- +Copyright (c) 2025 Lean FRO, LLC. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Author: Markus Himmel +-/ +module + +prelude +public import Init.Data.ByteArray.Basic +public import Init.Data.Array.Extract + +public section + +@[simp] +theorem ByteArray.data_empty : ByteArray.empty.data = #[] := rfl + +@[simp] +theorem ByteArray.data_extract {a : ByteArray} {b e : Nat} : + (a.extract b e).data = a.data.extract b e := by + simp [extract, copySlice] + by_cases b ≤ e + · rw [(by omega : b + (e - b) = e)] + · rw [Array.extract_eq_empty_of_le (by omega), Array.extract_eq_empty_of_le (by omega)] + +@[simp] +theorem ByteArray.extract_zero_size {b : ByteArray} : b.extract 0 b.size = b := by + ext1 + simp + +@[simp] +theorem ByteArray.extract_same {b : ByteArray} {i : Nat} : b.extract i i = ByteArray.empty := by + ext1 + simp [Nat.min_le_left] + +theorem ByteArray.fastAppend_eq_copySlice {a b : ByteArray} : + a.fastAppend b = b.copySlice 0 a a.size b.size false := rfl + +@[simp] +theorem List.toByteArray_append {l l' : List UInt8} : + (l ++ l').toByteArray = l.toByteArray ++ l'.toByteArray := by + simp [List.toByteArray_append'] + +@[simp] +theorem ByteArray.toList_data_append {l l' : ByteArray} : + (l ++ l').data.toList = l.data.toList ++ l'.data.toList := by + simp [← append_eq] + +@[simp] +theorem ByteArray.data_append {l l' : ByteArray} : + (l ++ l').data = l.data ++ l'.data := by + simp [← Array.toList_inj] + +@[simp] +theorem ByteArray.size_empty : ByteArray.empty.size = 0 := by + simp [← ByteArray.size_data] + +@[simp] +theorem List.data_toByteArray {l : List UInt8} : + l.toByteArray.data = l.toArray := by + rw [List.toByteArray] + suffices ∀ a b, (List.toByteArray.loop a b).data = b.data ++ a.toArray by + simpa using this l ByteArray.empty + intro a b + fun_induction List.toByteArray.loop a b with simp_all + +@[simp] +theorem List.size_toByteArray {l : List UInt8} : + l.toByteArray.size = l.length := by + simp [← ByteArray.size_data] + +@[simp] +theorem List.toByteArray_nil : List.toByteArray [] = ByteArray.empty := rfl + +@[simp] +theorem ByteArray.empty_append {b : ByteArray} : ByteArray.empty ++ b = b := by + ext1 + simp + +@[simp] +theorem ByteArray.append_empty {b : ByteArray} : b ++ ByteArray.empty = b := by + ext1 + simp + +@[simp, grind] +theorem ByteArray.size_append {a b : ByteArray} : (a ++ b).size = a.size + b.size := by + simp [← size_data] + +@[simp] +theorem ByteArray.size_eq_zero_iff {a : ByteArray} : a.size = 0 ↔ a = ByteArray.empty := by + refine ⟨fun h => ?_, fun h => h ▸ ByteArray.size_empty⟩ + ext1 + simp [← Array.size_eq_zero_iff, h] + +theorem ByteArray.getElem_eq_getElem_data {a : ByteArray} {i : Nat} {h : i < a.size} : + a[i] = a.data[i]'(by simpa [← size_data]) := rfl + +@[simp] +theorem ByteArray.getElem_append_left {i : Nat} {a b : ByteArray} {h : i < (a ++ b).size} + (hlt : i < a.size) : (a ++ b)[i] = a[i] := by + simp only [getElem_eq_getElem_data, data_append] + rw [Array.getElem_append_left (by simpa)] + +theorem ByteArray.getElem_append_right {i : Nat} {a b : ByteArray} {h : i < (a ++ b).size} + (hle : a.size ≤ i) : (a ++ b)[i] = b[i - a.size]'(by simp_all; omega) := by + simp only [getElem_eq_getElem_data, data_append] + rw [Array.getElem_append_right (by simpa)] + simp + +@[simp] +theorem List.getElem_toByteArray {l : List UInt8} {i : Nat} {h : i < l.toByteArray.size} : + l.toByteArray[i]'h = l[i]'(by simp_all) := by + simp [ByteArray.getElem_eq_getElem_data] + +theorem List.getElem_eq_getElem_toByteArray {l : List UInt8} {i : Nat} {h : i < l.length} : + l[i]'h = l.toByteArray[i]'(by simp_all) := by + simp + +@[simp] +theorem ByteArray.size_extract {a : ByteArray} {b e : Nat} : + (a.extract b e).size = min e a.size - b := by + simp [← size_data] + +@[simp] +theorem ByteArray.extract_eq_empty_iff {b : ByteArray} {i j : Nat} : b.extract i j = ByteArray.empty ↔ min j b.size ≤ i := by + rw [← size_eq_zero_iff, size_extract] + omega + +@[simp] +theorem ByteArray.extract_add_left {b : ByteArray} {i j : Nat} : b.extract (i + j) i = ByteArray.empty := by + simp only [extract_eq_empty_iff] + exact Nat.le_trans (Nat.min_le_left _ _) (by simp) + +@[simp] +theorem ByteArray.append_eq_empty_iff {a b : ByteArray} : + a ++ b = ByteArray.empty ↔ a = ByteArray.empty ∧ b = ByteArray.empty := by + simp [← size_eq_zero_iff, size_append] + +@[simp] +theorem List.toByteArray_eq_empty {l : List UInt8} : + l.toByteArray = ByteArray.empty ↔ l = [] := by + simp [← ByteArray.size_eq_zero_iff] + +theorem ByteArray.append_right_inj {ys₁ ys₂ : ByteArray} (xs : ByteArray) : + xs ++ ys₁ = xs ++ ys₂ ↔ ys₁ = ys₂ := by + simp [ByteArray.ext_iff, Array.append_right_inj] + +@[simp] +theorem ByteArray.extract_append_extract {a : ByteArray} {i j k : Nat} : + a.extract i j ++ a.extract j k = a.extract (min i j) (max j k) := by + ext1 + simp + +theorem ByteArray.extract_eq_extract_append_extract {a : ByteArray} {i k : Nat} (j : Nat) + (hi : i ≤ j) (hk : j ≤ k) : + a.extract i k = a.extract i j ++ a.extract j k := by + simp + rw [Nat.min_eq_left hi, Nat.max_eq_right hk] + +theorem ByteArray.append_inj_left {xs₁ xs₂ ys₁ ys₂ : ByteArray} (h : xs₁ ++ ys₁ = xs₂ ++ ys₂) (hl : xs₁.size = xs₂.size) : xs₁ = xs₂ := by + simp only [ByteArray.ext_iff, ← ByteArray.size_data, ByteArray.data_append] at * + exact Array.append_inj_left h hl + +theorem ByteArray.extract_append_eq_right {a b : ByteArray} {i : Nat} (hi : i = a.size) : + (a ++ b).extract i (a ++ b).size = b := by + subst hi + ext1 + simp [← size_data] + +theorem ByteArray.extract_append_eq_left {a b : ByteArray} {i : Nat} (hi : i = a.size) : + (a ++ b).extract 0 i = a := by + subst hi + ext1 + simp + +theorem ByteArray.extract_append_size_left {a b : ByteArray} {i : Nat} : + (a ++ b).extract i a.size = a.extract i a.size := by + ext1 + simp + +theorem ByteArray.extract_append_size_add {a b : ByteArray} {i j : Nat} : + (a ++ b).extract (a.size + i) (a.size + j) = b.extract i j := by + ext1 + simp + +theorem ByteArray.extract_append {as bs : ByteArray} {i j : Nat} : + (as ++ bs).extract i j = as.extract i j ++ bs.extract (i - as.size) (j - as.size) := by + ext1 + simp + +theorem ByteArray.extract_append_size_add' {a b : ByteArray} {i j k : Nat} (h : k = a.size) : + (a ++ b).extract (k + i) (k + j) = b.extract i j := by + cases h + rw [extract_append_size_add] + +theorem ByteArray.extract_extract {a : ByteArray} {i j k l : Nat} : + (a.extract i j).extract k l = a.extract (i + k) (min (i + l) j) := by + ext1 + simp + +theorem ByteArray.getElem_extract_aux {xs : ByteArray} {start stop : Nat} (h : i < (xs.extract start stop).size) : + start + i < xs.size := by + rw [size_extract] at h; apply Nat.add_lt_of_lt_sub'; apply Nat.lt_of_lt_of_le h + apply Nat.sub_le_sub_right; apply Nat.min_le_right + +theorem ByteArray.getElem_extract {i : Nat} {b : ByteArray} {start stop : Nat} + (h) : (b.extract start stop)[i]'h = b[start + i]'(getElem_extract_aux h) := by + simp [getElem_eq_getElem_data] + +theorem ByteArray.extract_eq_extract_left {a : ByteArray} {i i' j : Nat} : + a.extract i j = a.extract i' j ↔ min j a.size - i = min j a.size - i' := by + simp [ByteArray.ext_iff, Array.extract_eq_extract_left] + +theorem ByteArray.extract_add_one {a : ByteArray} {i : Nat} (ha : i + 1 ≤ a.size) : + a.extract i (i + 1) = [a[i]].toByteArray := by + ext + · simp + omega + · rename_i j hj hj' + obtain rfl : j = 0 := by simpa using hj' + simp [ByteArray.getElem_eq_getElem_data] + +theorem ByteArray.extract_add_two {a : ByteArray} {i : Nat} (ha : i + 2 ≤ a.size) : + a.extract i (i + 2) = [a[i], a[i + 1]].toByteArray := by + rw [extract_eq_extract_append_extract (i + 1) (by simp) (by omega), + extract_add_one (by omega), extract_add_one (by omega)] + simp [← List.toByteArray_append] + +theorem ByteArray.extract_add_three {a : ByteArray} {i : Nat} (ha : i + 3 ≤ a.size) : + a.extract i (i + 3) = [a[i], a[i + 1], a[i + 2]].toByteArray := by + rw [extract_eq_extract_append_extract (i + 1) (by simp) (by omega), + extract_add_one (by omega), extract_add_two (by omega)] + simp [← List.toByteArray_append] + +theorem ByteArray.extract_add_four {a : ByteArray} {i : Nat} (ha : i + 4 ≤ a.size) : + a.extract i (i + 4) = [a[i], a[i + 1], a[i + 2], a[i + 3]].toByteArray := by + rw [extract_eq_extract_append_extract (i + 1) (by simp) (by omega), + extract_add_one (by omega), extract_add_three (by omega)] + simp [← List.toByteArray_append] + +theorem ByteArray.append_assoc {a b c : ByteArray} : a ++ b ++ c = a ++ (b ++ c) := by + ext1 + simp + +@[simp] +theorem ByteArray.toList_empty : ByteArray.empty.toList = [] := by + simp [ByteArray.toList, ByteArray.toList.loop] + +theorem ByteArray.copySlice_eq_append {src : ByteArray} {srcOff : Nat} {dest : ByteArray} {destOff len : Nat} {exact : Bool} : + ByteArray.copySlice src srcOff dest destOff len exact = + dest.extract 0 destOff ++ src.extract srcOff (srcOff +len) ++ dest.extract (destOff + min len (src.data.size - srcOff)) dest.data.size := by + ext1 + simp [copySlice] diff --git a/src/Init/Data/List/Basic.lean b/src/Init/Data/List/Basic.lean index a8631f9bee..f26a258425 100644 --- a/src/Init/Data/List/Basic.lean +++ b/src/Init/Data/List/Basic.lean @@ -475,21 +475,6 @@ We define the basic functional programming operations on `List`: /-! ### map -/ -/-- -Applies a function to each element of the list, returning the resulting list of values. - -`O(|l|)`. - -Examples: -* `[a, b, c].map f = [f a, f b, f c]` -* `[].map Nat.succ = []` -* `["one", "two", "three"].map (·.length) = [3, 3, 5]` -* `["one", "two", "three"].map (·.reverse) = ["eno", "owt", "eerht"]` --/ -@[specialize] def map (f : α → β) : (l : List α) → List β - | [] => [] - | a::as => f a :: map f as - @[simp, grind =] theorem map_nil {f : α → β} : map f [] = [] := rfl @[simp, grind =] theorem map_cons {f : α → β} {a : α} {l : List α} : map f (a :: l) = f a :: map f l := rfl @@ -606,20 +591,6 @@ Appends two lists. Normally used via the `++` operator. Appending lists takes time proportional to the length of the first list: `O(|xs|)`. -Examples: - * `[1, 2, 3] ++ [4, 5] = [1, 2, 3, 4, 5]`. - * `[] ++ [4, 5] = [4, 5]`. - * `[1, 2, 3] ++ [] = [1, 2, 3]`. --/ -protected def append : (xs ys : List α) → List α - | [], bs => bs - | a::as, bs => a :: List.append as bs - -/-- -Appends two lists. Normally used via the `++` operator. - -Appending lists takes time proportional to the length of the first list: `O(|xs|)`. - This is a tail-recursive version of `List.append`. Examples: @@ -691,18 +662,6 @@ theorem reverseAux_eq_append {as bs : List α} : reverseAux as bs = reverseAux a /-! ### flatten -/ -/-- -Concatenates a list of lists into a single list, preserving the order of the elements. - -`O(|flatten L|)`. - -Examples: -* `[["a"], ["b", "c"]].flatten = ["a", "b", "c"]` -* `[["a"], [], ["b", "c"], ["d", "e", "f"]].flatten = ["a", "b", "c", "d", "e", "f"]` --/ -def flatten : List (List α) → List α - | [] => [] - | l :: L => l ++ flatten L @[simp, grind =] theorem flatten_nil : List.flatten ([] : List (List α)) = [] := rfl @[simp, grind =] theorem flatten_cons : (l :: L).flatten = l ++ L.flatten := rfl @@ -721,20 +680,14 @@ Examples: /-! ### flatMap -/ -/-- -Applies a function that returns a list to each element of a list, and concatenates the resulting -lists. - -Examples: -* `[2, 3, 2].flatMap List.range = [0, 1, 0, 1, 2, 0, 1]` -* `["red", "blue"].flatMap String.toList = ['r', 'e', 'd', 'b', 'l', 'u', 'e']` --/ -@[inline] def flatMap {α : Type u} {β : Type v} (b : α → List β) (as : List α) : List β := flatten (map b as) - @[simp, grind =] theorem flatMap_nil {f : α → List β} : List.flatMap f [] = [] := by simp [List.flatMap] @[simp, grind =] theorem flatMap_cons {x : α} {xs : List α} {f : α → List β} : List.flatMap f (x :: xs) = f x ++ List.flatMap f xs := by simp [List.flatMap] +@[simp, grind _=_] theorem flatMap_append {xs ys : List α} {f : α → List β} : + (xs ++ ys).flatMap f = xs.flatMap f ++ ys.flatMap f := by + induction xs; {rfl}; simp_all [flatMap_cons, append_assoc] + /-! ### replicate -/ /-- diff --git a/src/Init/Data/List/Lemmas.lean b/src/Init/Data/List/Lemmas.lean index 81ccbff94d..14c37995ec 100644 --- a/src/Init/Data/List/Lemmas.lean +++ b/src/Init/Data/List/Lemmas.lean @@ -348,6 +348,18 @@ theorem ext_getElem {l₁ l₂ : List α} (hl : length l₁ = length l₂) theorem getElem?_concat_length {l : List α} {a : α} : (l ++ [a])[l.length]? = some a := by simp +theorem eq_getElem_of_length_eq_one : (l : List α) → (hl : l.length = 1) → l = [l[0]'(hl ▸ by decide)] + | [_], _ => rfl + +theorem eq_getElem_of_length_eq_two : (l : List α) → (hl : l.length = 2) → l = [l[0]'(hl ▸ by decide), l[1]'(hl ▸ by decide)] + | [_, _], _ => rfl + +theorem eq_getElem_of_length_eq_three : (l : List α) → (hl : l.length = 3) → l = [l[0]'(hl ▸ by decide), l[1]'(hl ▸ by decide), l[2]'(hl ▸ by decide)] + | [_, _, _], _ => rfl + +theorem eq_getElem_of_length_eq_four : (l : List α) → (hl : l.length = 4) → l = [l[0]'(hl ▸ by decide), l[1]'(hl ▸ by decide), l[2]'(hl ▸ by decide), l[3]'(hl ▸ by decide)] + | [_, _, _, _], _ => rfl + /-! ### getD We simplify away `getD`, replacing `getD l n a` with `(l[n]?).getD a`. @@ -2136,10 +2148,6 @@ theorem flatMap_singleton (f : α → List β) (x : α) : [x].flatMap f = f x := simp only [findSome?_cons] split <;> simp_all -@[simp, grind _=_] theorem flatMap_append {xs ys : List α} {f : α → List β} : - (xs ++ ys).flatMap f = xs.flatMap f ++ ys.flatMap f := by - induction xs; {rfl}; simp_all [flatMap_cons, append_assoc] - theorem flatMap_assoc {l : List α} {f : α → List β} {g : β → List γ} : (l.flatMap f).flatMap g = l.flatMap fun x => (f x).flatMap g := by induction l <;> simp [*] diff --git a/src/Init/Data/List/Pairwise.lean b/src/Init/Data/List/Pairwise.lean index 6c9b13b6a8..0bc2fb17c7 100644 --- a/src/Init/Data/List/Pairwise.lean +++ b/src/Init/Data/List/Pairwise.lean @@ -171,7 +171,7 @@ theorem pairwise_append_comm {R : α → α → Prop} (s : ∀ {x y}, R x y → induction L with | nil => simp | cons l L IH => - simp only [flatten, pairwise_append, IH, mem_flatten, exists_imp, and_imp, forall_mem_cons, + simp only [flatten_cons, pairwise_append, IH, mem_flatten, exists_imp, and_imp, forall_mem_cons, pairwise_cons, and_assoc, and_congr_right_iff] rw [and_comm, and_congr_left_iff] intros; exact ⟨fun h l' b c d e => h c d e l' b, fun h c d e l' b => h l' b c d e⟩ diff --git a/src/Init/Data/String.lean b/src/Init/Data/String.lean index 0dba31ee4f..c4c104ce89 100644 --- a/src/Init/Data/String.lean +++ b/src/Init/Data/String.lean @@ -8,6 +8,7 @@ module prelude public import Init.Data.String.Basic public import Init.Data.String.Bootstrap +public import Init.Data.String.Decode public import Init.Data.String.Extra public import Init.Data.String.Lemmas public import Init.Data.String.Repr diff --git a/src/Init/Data/String/Basic.lean b/src/Init/Data/String/Basic.lean index 39c1e76c1b..6bd2c13aec 100644 --- a/src/Init/Data/String/Basic.lean +++ b/src/Init/Data/String/Basic.lean @@ -9,11 +9,370 @@ prelude public import Init.Data.List.Basic public import Init.Data.Char.Basic public import Init.Data.String.Bootstrap +public import Init.Data.ByteArray.Basic +public import Init.Data.String.Decode +import Init.Data.ByteArray.Lemmas public section universe u +section + +@[simp] +theorem List.utf8Encode_nil : List.utf8Encode [] = ByteArray.empty := by simp [utf8Encode] + +theorem List.utf8Encode_singleton {c : Char} : [c].utf8Encode = (String.utf8EncodeChar c).toByteArray := by + simp [utf8Encode] + +@[simp] +theorem List.utf8Encode_append {l l' : List Char} : + (l ++ l').utf8Encode = l.utf8Encode ++ l'.utf8Encode := by + simp [utf8Encode] + +theorem List.utf8Encode_cons {c : Char} {l : List Char} : (c :: l).utf8Encode = [c].utf8Encode ++ l.utf8Encode := by + rw [← singleton_append, List.utf8Encode_append] + +@[simp] +theorem String.utf8EncodeChar_ne_nil {c : Char} : String.utf8EncodeChar c ≠ [] := by + fun_cases String.utf8EncodeChar with simp + +@[simp] +theorem List.utf8Encode_eq_empty {l : List Char} : l.utf8Encode = ByteArray.empty ↔ l = [] := by + simp [utf8Encode, ← List.eq_nil_iff_forall_not_mem] + +theorem ByteArray.isValidUtf8_utf8Encode {l : List Char} : IsValidUtf8 l.utf8Encode := + .intro l rfl + +@[simp] +theorem ByteArray.isValidUtf8_empty : IsValidUtf8 ByteArray.empty := + .intro [] (by simp) + +theorem Char.isValidUtf8_toByteArray_utf8EncodeChar {c : Char} : + ByteArray.IsValidUtf8 (String.utf8EncodeChar c).toByteArray := + .intro [c] (by simp [List.utf8Encode_singleton]) + +theorem ByteArray.IsValidUtf8.append {b b' : ByteArray} (h : IsValidUtf8 b) (h' : IsValidUtf8 b') : + IsValidUtf8 (b ++ b') := by + rcases h with ⟨m, rfl⟩ + rcases h' with ⟨m', rfl⟩ + exact .intro (m ++ m') (by simp) + +theorem ByteArray.isValidUtf8_utf8Encode_singleton_append_iff {b : ByteArray} {c : Char} : + IsValidUtf8 ([c].utf8Encode ++ b) ↔ IsValidUtf8 b := by + refine ⟨?_, fun h => IsValidUtf8.append isValidUtf8_utf8Encode h⟩ + rintro ⟨l, hl⟩ + match l with + | [] => simp at hl + | d::l => + obtain rfl : c = d := by + replace hl := congrArg (fun l => utf8DecodeChar? l 0) hl + simpa [List.utf8DecodeChar?_utf8Encode_singleton_append, + List.utf8DecodeChar?_utf8Encode_cons] using hl + rw [← List.singleton_append (l := l), List.utf8Encode_append, + ByteArray.append_right_inj] at hl + exact hl ▸ isValidUtf8_utf8Encode + +@[expose] +def ByteArray.utf8Decode? (b : ByteArray) : Option (Array Char) := + go (b.size + 1) 0 #[] (by simp) (by simp) +where + go (fuel : Nat) (i : Nat) (acc : Array Char) (hi : i ≤ b.size) (hf : b.size - i < fuel) : Option (Array Char) := + match fuel, hf with + | fuel + 1, _ => + if i = b.size then + some acc + else + match h : utf8DecodeChar? b i with + | none => none + | some c => go fuel (i + c.utf8Size) (acc.push c) + (le_size_of_utf8DecodeChar?_eq_some h) + (have := c.utf8Size_pos; have := le_size_of_utf8DecodeChar?_eq_some h; by omega) + termination_by structural fuel + +theorem ByteArray.utf8Decode?.go.congr {b b' : ByteArray} {fuel fuel' i i' : Nat} {acc acc' : Array Char} {hi hi' hf hf'} + (hbb' : b = b') (hii' : i = i') (hacc : acc = acc') : + ByteArray.utf8Decode?.go b fuel i acc hi hf = ByteArray.utf8Decode?.go b' fuel' i' acc' hi' hf' := by + subst hbb' hii' hacc + fun_induction ByteArray.utf8Decode?.go b fuel i acc hi hf generalizing fuel' with + | case1 => + rw [go.eq_def] + split + simp + | case2 => + rw [go.eq_def] + split <;> split + · simp_all + · split <;> simp_all + | case3 => + conv => rhs; rw [go.eq_def] + split <;> split + · simp_all + · split + · simp_all + · rename_i c₁ hc₁ ih _ _ _ _ _ c₂ hc₂ + obtain rfl : c₁ = c₂ := by rw [← Option.some_inj, ← hc₁, ← hc₂] + apply ih + +@[simp] +theorem ByteArray.utf8Decode?_empty : ByteArray.empty.utf8Decode? = some #[] := by + simp [utf8Decode?, utf8Decode?.go] + +private theorem ByteArray.isSome_utf8Decode?go_iff {b : ByteArray} {fuel i : Nat} {hi : i ≤ b.size} {hf} {acc : Array Char} : + (ByteArray.utf8Decode?.go b fuel i acc hi hf).isSome ↔ IsValidUtf8 (b.extract i b.size) := by + fun_induction ByteArray.utf8Decode?.go with + | case1 => simp + | case2 fuel i hi hf acc h₁ h₂ => + simp only [Option.isSome_none, Bool.false_eq_true, false_iff] + rintro ⟨l, hl⟩ + have : l ≠ [] := by + rintro rfl + simp at hl + omega + rw [← l.cons_head_tail this] at hl + rw [utf8DecodeChar?_eq_utf8DecodeChar?_extract, hl, List.utf8DecodeChar?_utf8Encode_cons] at h₂ + simp at h₂ + | case3 i acc hi fuel hf h₁ c h₂ ih => + rw [ih] + have h₂' := h₂ + rw [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h₂' + obtain ⟨l, hl⟩ := exists_of_utf8DecodeChar?_eq_some h₂' + rw [ByteArray.extract_eq_extract_append_extract (i := i) (i + c.utf8Size) (by omega) + (le_size_of_utf8DecodeChar?_eq_some h₂)] at hl ⊢ + rw [ByteArray.append_inj_left hl (by have := le_size_of_utf8DecodeChar?_eq_some h₂; simp; omega), + ← List.utf8Encode_singleton, isValidUtf8_utf8Encode_singleton_append_iff] + +theorem ByteArray.isSome_utf8Decode?_iff {b : ByteArray} : + b.utf8Decode?.isSome ↔ IsValidUtf8 b := by + rw [utf8Decode?, isSome_utf8Decode?go_iff, extract_zero_size] + +@[simp] +theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl) + +/-- +Appends two strings. Usually accessed via the `++` operator. + +The internal implementation will perform destructive updates if the string is not shared. + +Examples: + * `"abc".append "def" = "abcdef"` + * `"abc" ++ "def" = "abcdef"` + * `"" ++ "" = ""` +-/ +@[extern "lean_string_append", expose] +def String.append (s t : String) : String where + bytes := s.bytes ++ t.bytes + isValidUtf8 := s.isValidUtf8.append t.isValidUtf8 + +instance : Append String where + append s t := s.append t + +@[simp] +theorem String.bytes_append {s t : String} : (s ++ t).bytes = s.bytes ++ t.bytes := (rfl) + +theorem String.bytes_inj {s t : String} : s.bytes = t.bytes ↔ s = t := by + refine ⟨fun h => ?_, (· ▸ rfl)⟩ + rcases s with ⟨s⟩ + rcases t with ⟨t⟩ + subst h + rfl + +@[simp] +theorem String.empty_append {s : String} : "" ++ s = s := by + simp [← String.bytes_inj] + +@[simp] +theorem String.append_empty {s : String} : s ++ "" = s := by + simp [← String.bytes_inj] + +@[simp] theorem List.bytes_asString {l : List Char} : l.asString.bytes = l.utf8Encode := by + simp [List.asString, String.mk] + +@[simp] +theorem List.asString_nil : List.asString [] = "" := by + simp [← String.bytes_inj] + +@[simp] +theorem List.asString_append {l₁ l₂ : List Char} : (l₁ ++ l₂).asString = l₁.asString ++ l₂.asString := by + simp [← String.bytes_inj] + +@[expose] +def String.Internal.toArray (b : String) : Array Char := + b.bytes.utf8Decode?.get (b.bytes.isSome_utf8Decode?_iff.2 b.isValidUtf8) + +@[simp] +theorem String.Internal.toArray_empty : String.Internal.toArray "" = #[] := by + simp [toArray] + +@[extern "lean_string_data", expose] +def String.data (b : String) : List Char := + (String.Internal.toArray b).toList + +@[simp] +theorem String.data_empty : "".data = [] := by + simp [data] + +/-- +Returns the length of a string in Unicode code points. + +Examples: +* `"".length = 0` +* `"abc".length = 3` +* `"L∃∀N".length = 4` +-/ +@[extern "lean_string_length"] +def String.length (b : String) : Nat := + b.data.length + +@[simp] +theorem String.Internal.size_toArray {b : String} : (String.Internal.toArray b).size = b.length := + (rfl) + +@[simp] +theorem String.length_data {b : String} : b.data.length = b.length := (rfl) + +theorem String.exists_eq_asString (s : String) : + ∃ l : List Char, s = l.asString := by + rcases s with ⟨_, ⟨l, rfl⟩⟩ + refine ⟨l, by simp [← String.bytes_inj]⟩ + +private theorem ByteArray.utf8Decode?go_eq_utf8Decode?go_extract {b : ByteArray} {fuel i : Nat} {hi : i ≤ b.size} {hf} {acc : Array Char} : + utf8Decode?.go b fuel i acc hi hf = (utf8Decode?.go (b.extract i b.size) fuel 0 #[] (by simp) (by simp [hf])).map (acc ++ ·) := by + fun_cases utf8Decode?.go b fuel i acc hi hf with + | case1 => + rw [utf8Decode?.go] + simp only [size_extract, Nat.le_refl, Nat.min_eq_left, Nat.zero_add, List.push_toArray, + List.nil_append] + rw [if_pos (by omega)] + simp + | case2 fuel hf₁ h₁ h₂ hf₂ => + rw [utf8Decode?.go] + simp only [size_extract, Nat.le_refl, Nat.min_eq_left, Nat.zero_add, List.push_toArray, + List.nil_append] + rw [if_neg (by omega)] + rw [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h₂ + split <;> simp_all + | case3 fuel hf₁ h₁ c h₂ hf₂ => + conv => rhs; rw [utf8Decode?.go] + simp only [size_extract, Nat.le_refl, Nat.min_eq_left, Nat.zero_add, List.push_toArray, + List.nil_append] + rw [if_neg (by omega)] + rw [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h₂ + split + · simp_all + · rename_i c' hc' + obtain rfl : c = c' := by + rw [← Option.some_inj, ← h₂, hc'] + have := c.utf8Size_pos + conv => lhs; rw [ByteArray.utf8Decode?go_eq_utf8Decode?go_extract] + conv => rhs; rw [ByteArray.utf8Decode?go_eq_utf8Decode?go_extract] + simp only [size_extract, Nat.le_refl, Nat.min_eq_left, Option.map_map, ByteArray.extract_extract] + have : (fun x => acc ++ x) ∘ (fun x => #[c] ++ x) = fun x => acc.push c ++ x := by funext; simp + simp [(by omega : i + (b.size - i) = b.size), this] + +theorem ByteArray.utf8Decode?_utf8Encode_singleton_append {l : ByteArray} {c : Char} : + ([c].utf8Encode ++ l).utf8Decode? = l.utf8Decode?.map (#[c] ++ ·) := by + rw [utf8Decode?, utf8Decode?.go, + if_neg (by simp [List.utf8Encode_singleton]; have := c.utf8Size_pos; omega)] + split + · simp_all [List.utf8DecodeChar?_utf8Encode_singleton_append] + · rename_i d h + obtain rfl : c = d := by simpa [List.utf8DecodeChar?_utf8Encode_singleton_append] using h + rw [utf8Decode?go_eq_utf8Decode?go_extract, utf8Decode?] + simp only [List.push_toArray, List.nil_append, Nat.zero_add] + congr 1 + apply ByteArray.utf8Decode?.go.congr _ rfl rfl + apply extract_append_eq_right + simp [List.utf8Encode_singleton] + +@[simp] +theorem List.utf8Decode?_utf8Encode {l : List Char} : + l.utf8Encode.utf8Decode? = some l.toArray := by + induction l with + | nil => simp + | cons c l ih => + rw [← List.singleton_append, List.utf8Encode_append] + simp only [ByteArray.utf8Decode?_utf8Encode_singleton_append, cons_append, nil_append, + Option.map_eq_some_iff, Array.append_eq_toArray_iff, cons.injEq, true_and] + refine ⟨l.toArray, ih, by simp⟩ + +@[simp] +theorem ByteArray.utf8Encode_get_utf8Decode? {b : ByteArray} {h} : + (b.utf8Decode?.get h).toList.utf8Encode = b := by + obtain ⟨l, rfl⟩ := isSome_utf8Decode?_iff.1 h + simp + +@[simp] +theorem List.data_asString {l : List Char} : l.asString.data = l := by + simp [String.data, String.Internal.toArray] + +@[simp] +theorem String.asString_data {b : String} : b.data.asString = b := by + obtain ⟨l, rfl⟩ := String.exists_eq_asString b + rw [List.data_asString] + +theorem List.asString_injective {l₁ l₂ : List Char} (h : l₁.asString = l₂.asString) : l₁ = l₂ := by + simpa using congrArg String.data h + +theorem List.asString_inj {l₁ l₂ : List Char} : l₁.asString = l₂.asString ↔ l₁ = l₂ := + ⟨asString_injective, (· ▸ rfl)⟩ + +theorem String.data_injective {s₁ s₂ : String} (h : s₁.data = s₂.data) : s₁ = s₂ := by + simpa using congrArg List.asString h + +theorem String.data_inj {s₁ s₂ : String} : s₁.data = s₂.data ↔ s₁ = s₂ := + ⟨data_injective, (· ▸ rfl)⟩ + +@[simp] +theorem String.data_append {l₁ l₂ : String} : (l₁ ++ l₂).data = l₁.data ++ l₂.data := by + apply List.asString_injective + simp + +@[simp] +theorem String.utf8encode_data {b : String} : b.data.utf8Encode = b.bytes := by + have := congrArg String.bytes (String.asString_data (b := b)) + rwa [← List.bytes_asString] + +@[simp] +theorem String.utf8ByteSize_empty : "".utf8ByteSize = 0 := (rfl) + +@[simp] +theorem String.utf8ByteSize_append {s t : String} : + (s ++ t).utf8ByteSize = s.utf8ByteSize + t.utf8ByteSize := by + simp [utf8ByteSize] + +@[simp] +theorem String.size_bytes {s : String} : s.bytes.size = s.utf8ByteSize := rfl + +@[simp] +theorem String.bytes_push {s : String} {c : Char} : (s.push c).bytes = s.bytes ++ [c].utf8Encode := by + simp [push] + +-- This is just to keep the proof of `set_next_add` below from breaking; if that lemma goes away +-- or the proof is rewritten, it can be removed. +private noncomputable def String.utf8ByteSize' : String → Nat + | s => go s.data +where + go : List Char → Nat + | [] => 0 + | c::cs => go cs + c.utf8Size + +private theorem String.utf8ByteSize'_eq (s : String) : s.utf8ByteSize' = s.utf8ByteSize := by + suffices ∀ l, utf8ByteSize'.go l = l.asString.utf8ByteSize by + obtain ⟨m, rfl⟩ := s.exists_eq_asString + rw [utf8ByteSize', this, asString_data] + intro l + induction l with + | nil => simp [utf8ByteSize'.go] + | cons c cs ih => + rw [utf8ByteSize'.go, ih, ← List.singleton_append, List.asString_append, + utf8ByteSize_append, Nat.add_comm] + congr + rw [← size_bytes, List.bytes_asString, List.utf8Encode_singleton, + List.size_toByteArray, length_utf8EncodeChar] + +end + namespace String instance : HAdd String.Pos String.Pos String.Pos where @@ -54,8 +413,6 @@ instance : LT String := instance decidableLT (s₁ s₂ : @& String) : Decidable (s₁ < s₂) := List.decidableLT s₁.data s₂.data - - /-- Non-strict inequality on strings, typically used via the `≤` operator. @@ -69,32 +426,6 @@ instance : LE String := instance decLE (s₁ s₂ : String) : Decidable (s₁ ≤ s₂) := inferInstanceAs (Decidable (Not _)) -/-- -Returns the length of a string in Unicode code points. - -Examples: -* `"".length = 0` -* `"abc".length = 3` -* `"L∃∀N".length = 4` --/ -@[extern "lean_string_length", expose] -def length : (@& String) → Nat - | ⟨s⟩ => s.length - -/-- -Appends two strings. Usually accessed via the `++` operator. - -The internal implementation will perform destructive updates if the string is not shared. - -Examples: - * `"abc".append "def" = "abcdef"` - * `"abc" ++ "def" = "abcdef"` - * `"" ++ "" = ""` --/ -@[extern "lean_string_append", expose] -def append : String → (@& String) → String - | ⟨a⟩, ⟨b⟩ => ⟨a ++ b⟩ - /-- Converts a string to a list of characters. @@ -153,8 +484,7 @@ Examples: -/ @[extern "lean_string_utf8_get", expose] def get (s : @& String) (p : @& Pos) : Char := - match s with - | ⟨s⟩ => utf8GetAux s 0 p + utf8GetAux s.data 0 p def utf8GetAux? : List Char → Pos → Pos → Option Char | [], _, _ => none @@ -175,7 +505,7 @@ Examples: -/ @[extern "lean_string_utf8_get_opt"] def get? : (@& String) → (@& Pos) → Option Char - | ⟨s⟩, p => utf8GetAux? s 0 p + | s, p => utf8GetAux? s.data 0 p /-- Returns the character at position `p` of a string. Panics if `p` is not a valid position. @@ -191,7 +521,7 @@ Examples @[extern "lean_string_utf8_get_bang", expose] def get! (s : @& String) (p : @& Pos) : Char := match s with - | ⟨s⟩ => utf8GetAux s 0 p + | s => utf8GetAux s.data 0 p def utf8SetAux (c' : Char) : List Char → Pos → Pos → List Char | [], _, _ => [] @@ -214,7 +544,7 @@ Examples: -/ @[extern "lean_string_utf8_set"] def set : String → (@& Pos) → Char → String - | ⟨s⟩, i, c => ⟨utf8SetAux c s 0 i⟩ + | s, i, c => (utf8SetAux c s.data 0 i).asString /-- Replaces the character at position `p` in the string `s` with the result of applying `f` to that @@ -270,7 +600,7 @@ Examples: -/ @[extern "lean_string_utf8_prev", expose] def prev : (@& String) → (@& Pos) → Pos - | ⟨s⟩, p => utf8PrevAux s 0 p + | s, p => utf8PrevAux s.data 0 p /-- Returns the first character in `s`. If `s = ""`, returns `(default : Char)`. @@ -336,7 +666,7 @@ Examples: @[extern "lean_string_utf8_get_fast", expose] def get' (s : @& String) (p : @& Pos) (h : ¬ s.atEnd p) : Char := match s with - | ⟨s⟩ => utf8GetAux s 0 p + | s => utf8GetAux s.data 0 p /-- Returns the next position in a string after position `p`. The result is unspecified if `p` is not a @@ -360,16 +690,6 @@ def next' (s : @& String) (p : @& Pos) (h : ¬ s.atEnd p) : Pos := let c := get s p p + c -theorem _root_.Char.utf8Size_pos (c : Char) : 0 < c.utf8Size := by - repeat first | apply iteInduction (motive := (0 < ·)) <;> intros | decide - -theorem _root_.Char.utf8Size_le_four (c : Char) : c.utf8Size ≤ 4 := by - repeat first | apply iteInduction (motive := (· ≤ 4)) <;> intros | decide - -theorem _root_.Char.utf8Size_eq (c : Char) : c.utf8Size = 1 ∨ c.utf8Size = 2 ∨ c.utf8Size = 3 ∨ c.utf8Size = 4 := by - match c.utf8Size, c.utf8Size_pos, c.utf8Size_le_four with - | 1, _, _ | 2, _, _ | 3, _, _ | 4, _, _ => simp - @[deprecated Char.utf8Size_pos (since := "2026-06-04")] abbrev one_le_csize := Char.utf8Size_pos @[simp] theorem pos_lt_eq (p₁ p₂ : Pos) : (p₁ < p₂) = (p₁.1 < p₂.1) := rfl @@ -534,7 +854,7 @@ Examples: -/ @[extern "lean_string_utf8_extract", expose] def extract : (@& String) → (@& Pos) → (@& Pos) → String - | ⟨s⟩, b, e => if b.byteIdx ≥ e.byteIdx then "" else ⟨go₁ s 0 b e⟩ + | s, b, e => if b.byteIdx ≥ e.byteIdx then "" else (go₁ s.data 0 b e).asString where go₁ : List Char → Pos → Pos → Pos → List Char | [], _, _, _ => [] @@ -1030,37 +1350,31 @@ theorem utf8SetAux_of_gt (c' : Char) : ∀ (cs : List Char) {i p : Pos}, i > p theorem set_next_add (s : String) (i : Pos) (c : Char) (b₁ b₂) (h : (s.next i).1 + b₁ = s.endPos.1 + b₂) : ((s.set i c).next i).1 + b₁ = (s.set i c).endPos.1 + b₂ := by - simp [next, get, set, endPos, utf8ByteSize] at h ⊢ + simp [next, get, set, endPos, ← utf8ByteSize'_eq, utf8ByteSize'] at h ⊢ rw [Nat.add_comm i.1, Nat.add_assoc] at h ⊢ let rec foo : ∀ cs a b₁ b₂, - (utf8GetAux cs a i).utf8Size + b₁ = utf8ByteSize.go cs + b₂ → - (utf8GetAux (utf8SetAux c cs a i) a i).utf8Size + b₁ = utf8ByteSize.go (utf8SetAux c cs a i) + b₂ + (utf8GetAux cs a i).utf8Size + b₁ = utf8ByteSize'.go cs + b₂ → + (utf8GetAux (utf8SetAux c cs a i) a i).utf8Size + b₁ = utf8ByteSize'.go (utf8SetAux c cs a i) + b₂ | [], _, _, _, h => h | c'::cs, a, b₁, b₂, h => by unfold utf8SetAux - apply iteInduction (motive := fun p => (utf8GetAux p a i).utf8Size + b₁ = utf8ByteSize.go p + b₂) <;> - intro h' <;> simp [utf8GetAux, h', utf8ByteSize.go] at h ⊢ + apply iteInduction (motive := fun p => (utf8GetAux p a i).utf8Size + b₁ = utf8ByteSize'.go p + b₂) <;> + intro h' <;> simp [utf8GetAux, h', utf8ByteSize'.go] at h ⊢ next => rw [Nat.add_assoc, Nat.add_left_comm] at h ⊢; rw [Nat.add_left_cancel h] next => rw [Nat.add_assoc] at h ⊢ refine foo cs (a + c') b₁ (c'.utf8Size + b₂) h - exact foo s.1 0 _ _ h + exact foo s.data 0 _ _ h theorem mapAux_lemma (s : String) (i : Pos) (c : Char) (h : ¬s.atEnd i) : - (s.set i c).endPos.1 - ((s.set i c).next i).1 < s.endPos.1 - i.1 := + (s.set i c).endPos.1 - ((s.set i c).next i).1 < s.endPos.1 - i.1 := by suffices (s.set i c).endPos.1 - ((s.set i c).next i).1 = s.endPos.1 - (s.next i).1 by rw [this] apply Nat.sub_lt_sub_left (Nat.gt_of_not_le (mt decide_eq_true h)) (lt_next ..) - Nat.sub.elim (motive := (_ = ·)) _ _ - (fun _ k e => - have := set_next_add _ _ _ k 0 e.symm - Nat.sub_eq_of_eq_add <| this.symm.trans <| Nat.add_comm ..) - (fun h => by - have ⟨k, e⟩ := Nat.le.dest h - rw [Nat.succ_add] at e - have : ((s.set i c).next i).1 = _ := set_next_add _ _ c 0 k.succ e.symm - exact Nat.sub_eq_zero_of_le (this ▸ Nat.le_add_right ..)) + have := set_next_add s i c (s.endPos.byteIdx - (s.next i).byteIdx) 0 + have := set_next_add s i c 0 ((s.next i).byteIdx - s.endPos.byteIdx) + omega @[specialize] def mapAux (f : Char → Char) (i : Pos) (s : String) : String := if h : s.atEnd i then s @@ -2044,40 +2358,51 @@ def stripSuffix (s : String) (suff : String) : String := end String -namespace Char - -@[simp] theorem length_toString (c : Char) : c.toString.length = 1 := rfl - -end Char - namespace String +@[ext] theorem ext {s₁ s₂ : String} (h : s₁.data = s₂.data) : s₁ = s₂ := - show ⟨s₁.data⟩ = (⟨s₂.data⟩ : String) from h ▸ rfl - -theorem ext_iff {s₁ s₂ : String} : s₁ = s₂ ↔ s₁.data = s₂.data := ⟨fun h => h ▸ rfl, ext⟩ + data_injective h @[simp] theorem default_eq : default = "" := rfl -@[simp] theorem length_mk (s : List Char) : (String.mk s).length = s.length := rfl +@[simp] +theorem String.mk_eq_asString (s : List Char) : String.mk s = List.asString s := rfl -@[simp] theorem length_empty : "".length = 0 := rfl +@[simp] +theorem _root_.List.length_asString (s : List Char) : (List.asString s).length = s.length := by + rw [← length_data, List.data_asString] -@[simp] theorem length_singleton (c : Char) : (String.singleton c).length = 1 := rfl +@[simp] theorem length_empty : "".length = 0 := by simp [← length_data, data_empty] + +@[simp] +theorem bytes_singleton {c : Char} : (String.singleton c).bytes = [c].utf8Encode := by + simp [singleton] + +theorem singleton_eq {c : Char} : String.singleton c = [c].asString := by + simp [← bytes_inj] + +@[simp] theorem data_singleton (c : Char) : (String.singleton c).data = [c] := by + simp [singleton_eq] + +@[simp] +theorem length_singleton {c : Char} : (String.singleton c).length = 1 := by + simp [← length_data] + +theorem push_eq_append (c : Char) : String.push s c = s ++ singleton c := by + simp [← bytes_inj] + +@[simp] theorem data_push (c : Char) : (String.push s c).data = s.data ++ [c] := by + simp [push_eq_append] @[simp] theorem length_push (c : Char) : (String.push s c).length = s.length + 1 := by - rw [push, length_mk, List.length_append, List.length_singleton, Nat.succ.injEq] - rfl + simp [← length_data] @[simp] theorem length_pushn (c : Char) (n : Nat) : (pushn s c n).length = s.length + n := by unfold pushn; induction n <;> simp [Nat.repeat, Nat.add_assoc, *] @[simp] theorem length_append (s t : String) : (s ++ t).length = s.length + t.length := by - simp only [length, append, List.length_append] - -@[simp] theorem data_push (s : String) (c : Char) : (s.push c).data = s.data ++ [c] := rfl - -@[simp] theorem data_append (s t : String) : (s ++ t).data = s.data ++ t.data := rfl + simp [← length_data] attribute [simp] toList -- prefer `String.data` over `String.toList` in lemmas @@ -2161,10 +2486,8 @@ end Pos theorem lt_next' (s : String) (p : Pos) : p < next s p := lt_next .. @[simp] theorem prev_zero (s : String) : prev s 0 = 0 := by - cases s with | mk cs - cases cs - next => rfl - next => simp [prev, utf8PrevAux, Pos.le_iff] + rw [prev] + cases s.data <;> simp [utf8PrevAux, Pos.le_iff] @[simp] theorem get'_eq (s : String) (p : Pos) (h) : get' s p h = get s p := rfl @@ -2174,19 +2497,20 @@ theorem lt_next' (s : String) (p : Pos) : p < next s p := lt_next .. -- so for proving can be unfolded. attribute [simp] toSubstring' -theorem singleton_eq (c : Char) : singleton c = ⟨[c]⟩ := rfl - -@[simp] theorem data_singleton (c : Char) : (singleton c).data = [c] := rfl - -@[simp] theorem append_empty (s : String) : s ++ "" = s := ext (List.append_nil _) - -@[simp] theorem empty_append (s : String) : "" ++ s = s := rfl - theorem append_assoc (s₁ s₂ s₃ : String) : (s₁ ++ s₂) ++ s₃ = s₁ ++ (s₂ ++ s₃) := - ext (List.append_assoc ..) + ext (by simp [data_append]) end String +namespace Char + +theorem toString_eq_singleton {c : Char} : c.toString = String.singleton c := rfl + +@[simp] theorem length_toString (c : Char) : c.toString.length = 1 := by + simp [toString_eq_singleton] + +end Char + open String namespace Substring diff --git a/src/Init/Data/String/Bootstrap.lean b/src/Init/Data/String/Bootstrap.lean index eae8eb2faf..52e09c5fd2 100644 --- a/src/Init/Data/String/Bootstrap.lean +++ b/src/Init/Data/String/Bootstrap.lean @@ -8,6 +8,7 @@ module prelude public import Init.Data.List.Basic public import Init.Data.Char.Basic +public import Init.Data.ByteArray.Bootstrap public section @@ -31,7 +32,11 @@ Examples: -/ @[extern "lean_string_push", expose] def push : String → Char → String - | ⟨s⟩, c => ⟨s ++ [c]⟩ + | ⟨b, h⟩, c => ⟨b.append (List.utf8Encode [c]), ?pf⟩ +where finally + have ⟨m, hm⟩ := h + cases hm + exact .intro (m ++ [c]) (by simp [List.utf8Encode, List.toByteArray_append']) /-- Returns a new string that contains only the character `c`. @@ -124,8 +129,21 @@ Examples: * `[].asString = ""` * `['a', 'a', 'a'].asString = "aaa"` -/ +@[extern "lean_string_mk", expose] +def String.mk (data : List Char) : String := + ⟨List.utf8Encode data,.intro data rfl⟩ + +/-- +Creates a string that contains the characters in a list, in order. + +Examples: + * `['L', '∃', '∀', 'N'].asString = "L∃∀N"` + * `[].asString = ""` + * `['a', 'a', 'a'].asString = "aaa"` +-/ +@[expose] def List.asString (s : List Char) : String := - ⟨s⟩ + String.mk s namespace Substring.Internal diff --git a/src/Init/Data/String/Decode.lean b/src/Init/Data/String/Decode.lean new file mode 100644 index 0000000000..7793abe91b --- /dev/null +++ b/src/Init/Data/String/Decode.lean @@ -0,0 +1,1238 @@ +/- +Copyright (c) 2025 Lean FRO, LLC. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Author: Markus Himmel +-/ +module + +prelude +public import Init.Data.List.Lemmas +import Init.Data.List.Lemmas +public import Init.Data.UInt.Bitwise +import Init.Data.Char.Lemmas +public import Init.Data.ByteArray.Basic +import Init.Data.ByteArray.Lemmas + +/-! # `Char.utf8Size` -/ + +public theorem Char.utf8Size_eq_one_iff {c : Char} : c.utf8Size = 1 ↔ c.val ≤ 127 := by + fun_cases utf8Size + all_goals simp_all; assumption + +public theorem Char.utf8Size_eq_two_iff {c : Char} : c.utf8Size = 2 ↔ 127 < c.val ∧ c.val ≤ 0x7ff := by + fun_cases utf8Size with + | case1 v _ => subst v; simp_all [← UInt32.not_le] + | case2 v _ _ => subst v; simp_all + | case3 v _ _ _ => subst v; simp_all [← UInt32.not_le] + | case4 v _ _ _ => subst v; simp_all [← UInt32.not_le] + +public theorem Char.utf8Size_eq_three_iff {c : Char} : c.utf8Size = 3 ↔ 0x7ff < c.val ∧ c.val ≤ 0xffff := by + fun_cases utf8Size with + | case1 v _ => subst v; simp_all [UInt32.lt_iff_toNat_lt, UInt32.le_iff_toNat_le]; omega + | case2 v _ _ => subst v; simp_all [UInt32.lt_iff_toNat_lt, UInt32.le_iff_toNat_le]; omega + | case3 v _ _ _ => subst v; simp_all [← UInt32.not_le] + | case4 v _ _ _ => subst v; simp_all [← UInt32.not_le] + +public theorem Char.utf8Size_eq_four_iff {c : Char} : c.utf8Size = 4 ↔ 0xffff < c.val := by + fun_cases utf8Size with + | case1 v _ => subst v; simp_all [UInt32.lt_iff_toNat_lt, UInt32.le_iff_toNat_le]; omega + | case2 v _ _ => subst v; simp_all [UInt32.lt_iff_toNat_lt, UInt32.le_iff_toNat_le]; omega + | case3 v _ _ _ => subst v; simp_all [← UInt32.not_le] + | case4 v _ _ _ => subst v; simp_all [← UInt32.not_le] + +public theorem Char.utf8Size_pos (c : Char) : 0 < c.utf8Size := by + repeat first | apply iteInduction (motive := (0 < ·)) <;> intros | decide + +public theorem Char.utf8Size_le_four (c : Char) : c.utf8Size ≤ 4 := by + repeat first | apply iteInduction (motive := (· ≤ 4)) <;> intros | decide + +public theorem Char.utf8Size_eq (c : Char) : c.utf8Size = 1 ∨ c.utf8Size = 2 ∨ c.utf8Size = 3 ∨ c.utf8Size = 4 := by + match c.utf8Size, c.utf8Size_pos, c.utf8Size_le_four with + | 1, _, _ | 2, _, _ | 3, _, _ | 4, _, _ => simp + +theorem Char.toNat_val_le {c : Char} : c.val.toNat ≤ 0x10ffff := by + have := c.valid + simp [UInt32.isValidChar, Nat.isValidChar] at this + omega + +/-! # `utf8EncodeChar` -/ + +/-! ## `utf8EncodeChar` low-level API -/ + +/-- +Returns the sequence of bytes in a character's UTF-8 encoding. +-/ +@[expose] +public def String.utf8EncodeCharFast (c : Char) : List UInt8 := + let v := c.val + if v ≤ 0x7f then + [v.toUInt8] + else if v ≤ 0x7ff then + [(v >>> 6).toUInt8 &&& 0x1f ||| 0xc0, + v.toUInt8 &&& 0x3f ||| 0x80] + else if v ≤ 0xffff then + [(v >>> 12).toUInt8 &&& 0x0f ||| 0xe0, + (v >>> 6).toUInt8 &&& 0x3f ||| 0x80, + v.toUInt8 &&& 0x3f ||| 0x80] + else + [(v >>> 18).toUInt8 &&& 0x07 ||| 0xf0, + (v >>> 12).toUInt8 &&& 0x3f ||| 0x80, + (v >>> 6).toUInt8 &&& 0x3f ||| 0x80, + v.toUInt8 &&& 0x3f ||| 0x80] + +private theorem Nat.add_two_pow_eq_or_of_lt {b : Nat} (i : Nat) (b_lt : b < 2 ^ i) (a : Nat) : + b + 2 ^ i * a = b ||| 2 ^ i * a := by + rw [Nat.add_comm, Nat.or_comm, Nat.two_pow_add_eq_or_of_lt b_lt] + +@[csimp] +public theorem String.utf8EncodeChar_eq_utf8EncodeCharFast : @utf8EncodeChar = @utf8EncodeCharFast := by + funext c + simp only [utf8EncodeChar, utf8EncodeCharFast, UInt8.ofNat_uInt32ToNat, UInt8.ofNat_add, + UInt8.reduceOfNat, UInt32.le_iff_toNat_le, UInt32.reduceToNat] + split + · rfl + · split + · simp only [List.cons.injEq, ← UInt8.toNat_inj, UInt8.toNat_add, UInt8.toNat_ofNat', + Nat.reducePow, UInt8.reduceToNat, Nat.mod_add_mod, UInt8.toNat_or, UInt8.toNat_and, + UInt32.toNat_toUInt8, UInt32.toNat_shiftRight, UInt32.reduceToNat, Nat.reduceMod, and_true] + refine ⟨?_, ?_⟩ + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 5 (by omega) 6, + Nat.and_two_pow_sub_one_eq_mod _ 5, Nat.shiftRight_eq_div_pow, + Nat.mod_eq_of_lt (b := 256) (by omega)] + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, + Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.mod_mod_of_dvd _ (by decide)] + · split + · simp only [List.cons.injEq, ← UInt8.toNat_inj, UInt8.toNat_add, UInt8.toNat_ofNat', + Nat.reducePow, UInt8.reduceToNat, Nat.mod_add_mod, UInt8.toNat_or, UInt8.toNat_and, + UInt32.toNat_toUInt8, UInt32.toNat_shiftRight, UInt32.reduceToNat, Nat.reduceMod, and_true] + refine ⟨?_, ?_, ?_⟩ + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 4 (by omega) 14, + Nat.and_two_pow_sub_one_eq_mod _ 4, Nat.shiftRight_eq_div_pow, + Nat.mod_eq_of_lt (b := 256) (by omega)] + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, + Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.shiftRight_eq_div_pow, + Nat.mod_mod_of_dvd (c.val.toNat / 2 ^ 6) (by decide)] + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, + Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.mod_mod_of_dvd c.val.toNat (by decide)] + · simp only [List.cons.injEq, ← UInt8.toNat_inj, UInt8.toNat_add, UInt8.toNat_ofNat', + Nat.reducePow, UInt8.reduceToNat, Nat.mod_add_mod, UInt8.toNat_or, UInt8.toNat_and, + UInt32.toNat_toUInt8, UInt32.toNat_shiftRight, UInt32.reduceToNat, Nat.reduceMod, and_true] + refine ⟨?_, ?_, ?_, ?_⟩ + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 3 (by omega) 30, + Nat.and_two_pow_sub_one_eq_mod _ 3, Nat.shiftRight_eq_div_pow, + Nat.mod_mod_of_dvd _ (by decide)] + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, + Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.shiftRight_eq_div_pow, + Nat.mod_mod_of_dvd (c.val.toNat / 2 ^ 12) (by decide)] + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, + Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.shiftRight_eq_div_pow, + Nat.mod_mod_of_dvd (c.val.toNat / 2 ^ 6) (by decide)] + · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, + Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.mod_mod_of_dvd c.val.toNat (by decide)] + +@[simp] public theorem String.length_utf8EncodeChar (c : Char) : (utf8EncodeChar c).length = c.utf8Size := by + simp [Char.utf8Size, utf8EncodeChar_eq_utf8EncodeCharFast, utf8EncodeCharFast] + cases Decidable.em (c.val ≤ 0x7f) <;> simp [*] + cases Decidable.em (c.val ≤ 0x7ff) <;> simp [*] + cases Decidable.em (c.val ≤ 0xffff) <;> simp [*] + +theorem String.utf8EncodeChar_eq_singleton {c : Char} : c.utf8Size = 1 → + String.utf8EncodeChar c = [c.val.toUInt8] := by + rw [← length_utf8EncodeChar] + fun_cases utf8EncodeChar + all_goals simp_all; try rfl + +theorem String.utf8EncodeChar_eq_cons_cons {c : Char} : c.utf8Size = 2 → + String.utf8EncodeChar c = [(c.val >>> 6).toUInt8 &&& 0x1f ||| 0xc0, c.val.toUInt8 &&& 0x3f ||| 0x80] := by + rw [← length_utf8EncodeChar, utf8EncodeChar_eq_utf8EncodeCharFast] + fun_cases utf8EncodeCharFast + all_goals simp_all <;> (repeat' apply And.intro) <;> rfl + +theorem String.utf8EncodeChar_eq_cons_cons_cons {c : Char} : c.utf8Size = 3 → + String.utf8EncodeChar c = + [(c.val >>> 12).toUInt8 &&& 0x0f ||| 0xe0, + (c.val >>> 6).toUInt8 &&& 0x3f ||| 0x80, + c.val.toUInt8 &&& 0x3f ||| 0x80] := by + rw [← length_utf8EncodeChar, utf8EncodeChar_eq_utf8EncodeCharFast] + fun_cases utf8EncodeCharFast + all_goals simp_all <;> (repeat' apply And.intro) <;> rfl + +theorem String.utf8EncodeChar_eq_cons_cons_cons_cons {c : Char} : c.utf8Size = 4 → + String.utf8EncodeChar c = + [(c.val >>> 18).toUInt8 &&& 0x07 ||| 0xf0, + (c.val >>> 12).toUInt8 &&& 0x3f ||| 0x80, + (c.val >>> 6).toUInt8 &&& 0x3f ||| 0x80, + c.val.toUInt8 &&& 0x3f ||| 0x80] := by + rw [← length_utf8EncodeChar, utf8EncodeChar_eq_utf8EncodeCharFast] + fun_cases utf8EncodeCharFast + all_goals simp_all <;> (repeat' apply And.intro) <;> rfl + +/-! ## `utf8EncodeChar` BitVec API -/ + +theorem helper₄ (s : Nat) (c : BitVec w₀) (v : BitVec w') (w : Nat) : + (c >>> s).setWidth (w' + w) &&& (BitVec.allOnes w).setWidth (w' + w) ||| v ++ 0#w = v ++ c.extractLsb' s w := by + rw [BitVec.and_setWidth_allOnes, BitVec.or_append, BitVec.zero_or, BitVec.or_zero, + BitVec.setWidth_ushiftRight_eq_extractLsb, BitVec.setWidth_extractLsb'_of_le (by simp)] + +/-! ### Size one -/ + +-- TODO: possibly it makes sense to factor out this proof +theorem String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_one {c : Char} (h : c.utf8Size = 1) : + ((String.utf8EncodeChar c)[0]'(by simp [h])).toBitVec = 0#1 ++ c.val.toBitVec.extractLsb' 0 7 := by + have h₀ : c.val.toNat < 128 := by + suffices c.val.toNat ≤ 127 by omega + simpa [Char.utf8Size_eq_one_iff, UInt32.le_iff_toNat_le] using h + have h₁ : c.val.toNat < 256 := by omega + rw [← BitVec.toNat_inj, BitVec.toNat_append] + simp [utf8EncodeChar_eq_singleton h, Nat.mod_eq_of_lt h₀, Nat.mod_eq_of_lt h₁] + +/-! ### Size two -/ + +theorem String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_two {c : Char} (h : c.utf8Size = 2) : + ((String.utf8EncodeChar c)[0]'(by simp [h])).toBitVec = 0b110#3 ++ c.val.toBitVec.extractLsb' 6 5 := by + simpa [String.utf8EncodeChar_eq_cons_cons h] using helper₄ 6 c.val.toBitVec 6#3 5 + +theorem String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_two {c : Char} (h : c.utf8Size = 2) : + ((String.utf8EncodeChar c)[1]'(by simp [h])).toBitVec = 0b10#2 ++ c.val.toBitVec.extractLsb' 0 6 := by + simpa [String.utf8EncodeChar_eq_cons_cons h] using helper₄ 0 c.val.toBitVec 2#2 6 + +/-! ### Size three -/ + +theorem String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_three {c : Char} (h : c.utf8Size = 3) : + ((String.utf8EncodeChar c)[0]'(by simp [h])).toBitVec = 0b1110#4 ++ c.val.toBitVec.extractLsb' 12 4 := by + simpa [String.utf8EncodeChar_eq_cons_cons_cons h] using helper₄ 12 c.val.toBitVec 0b1110#4 4 + +theorem String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_three {c : Char} (h : c.utf8Size = 3) : + ((String.utf8EncodeChar c)[1]'(by simp [h])).toBitVec = 0b10#2 ++ c.val.toBitVec.extractLsb' 6 6 := by + simpa [String.utf8EncodeChar_eq_cons_cons_cons h] using helper₄ 6 c.val.toBitVec 0b10#2 6 + +theorem String.toBitVec_getElem_utf8EncodeChar_two_of_utf8Size_eq_three {c : Char} (h : c.utf8Size = 3) : + ((String.utf8EncodeChar c)[2]'(by simp [h])).toBitVec = 0b10#2 ++ c.val.toBitVec.extractLsb' 0 6 := by + simpa [String.utf8EncodeChar_eq_cons_cons_cons h] using helper₄ 0 c.val.toBitVec 0b10#2 6 + +/-! ### Size four -/ + +theorem String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_four {c : Char} (h : c.utf8Size = 4) : + ((String.utf8EncodeChar c)[0]'(by simp [h])).toBitVec = 0b11110#5 ++ c.val.toBitVec.extractLsb' 18 3 := by + simpa [String.utf8EncodeChar_eq_cons_cons_cons_cons h] using helper₄ 18 c.val.toBitVec 0b11110#5 3 + +theorem String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_four {c : Char} (h : c.utf8Size = 4) : + ((String.utf8EncodeChar c)[1]'(by simp [h])).toBitVec = 0b10#2 ++ c.val.toBitVec.extractLsb' 12 6 := by + simpa [String.utf8EncodeChar_eq_cons_cons_cons_cons h] using helper₄ 12 c.val.toBitVec 0b10#2 6 + +theorem String.toBitVec_getElem_utf8EncodeChar_two_of_utf8Size_eq_four {c : Char} (h : c.utf8Size = 4) : + ((String.utf8EncodeChar c)[2]'(by simp [h])).toBitVec = 0b10#2 ++ c.val.toBitVec.extractLsb' 6 6 := by + simpa [String.utf8EncodeChar_eq_cons_cons_cons_cons h] using helper₄ 6 c.val.toBitVec 0b10#2 6 + +theorem String.toBitVec_getElem_utf8EncodeChar_three_of_utf8Size_eq_four {c : Char} (h : c.utf8Size = 4) : + ((String.utf8EncodeChar c)[3]'(by simp [h])).toBitVec = 0b10#2 ++ c.val.toBitVec.extractLsb' 0 6 := by + simpa [String.utf8EncodeChar_eq_cons_cons_cons_cons h] using helper₄ 0 c.val.toBitVec 0b10#2 6 + +namespace ByteArray.utf8DecodeChar? + +/-! # `parseFirstByte` -/ + +/-! ## `parseFirstByte` definition -/ + +public inductive FirstByte where + | invalid : FirstByte + | done : FirstByte + | oneMore : FirstByte + | twoMore : FirstByte + | threeMore : FirstByte + +@[inline, expose] +public def parseFirstByte (b : UInt8) : FirstByte := + if b &&& 0x80 == 0 then + .done + else if b &&& 0xe0 == 0xc0 then + .oneMore + else if b &&& 0xf0 == 0xe0 then + .twoMore + else if b &&& 0xf8 == 0xf0 then + .threeMore + else .invalid + +/-! ## `parseFirstByte` low-level API -/ + +theorem parseFirstByte_eq_done_iff : parseFirstByte b = .done ↔ b &&& 0x80 = 0 := by + fun_cases parseFirstByte with simp_all + +theorem parseFirstByte_eq_oneMore_iff : parseFirstByte b = .oneMore ↔ b &&& 0xe0 = 0xc0 := by + suffices b &&& 0xe0 = 0xc0 → b &&& 0x80 ≠ 0 by fun_cases parseFirstByte with simp_all + intro h + calc b &&& 0x80 = b &&& (0xe0 &&& 0x80) := rfl + _ = 0xc0 &&& 0x80 := by rw [← UInt8.and_assoc, h] + _ ≠ 0 := by decide + +theorem parseFirstByte_eq_twoMore_iff : parseFirstByte b = .twoMore ↔ b &&& 0xf0 = 0xe0 := by + suffices b &&& 0xf0 = 0xe0 → b &&& 0xe0 ≠ 0xc0 ∧ b &&& 0x80 ≠ 0 by fun_cases parseFirstByte with simp_all + refine fun h => ⟨?_, ?_⟩ + · calc b &&& 0xe0 = b &&& (0xf0 &&& 0xe0) := rfl + _ = 0xe0 &&& 0xe0 := by rw [← UInt8.and_assoc, h] + _ ≠ 0xc0 := by decide + · calc b &&& 0x80 = b &&& (0xf0 &&& 0x80) := rfl + _ = 0xe0 &&& 0x80 := by rw [← UInt8.and_assoc, h] + _ ≠ 0 := by decide + +theorem parseFirstByte_eq_threeMore_iff : parseFirstByte b = .threeMore ↔ b &&& 0xf8 = 0xf0 := by + suffices b &&& 0xf8 = 0xf0 → b &&& 0xf0 ≠ 0xe0 ∧ b &&& 0xe0 ≠ 0xc0 ∧ b &&& 0x80 ≠ 0 by fun_cases parseFirstByte with simp_all + refine fun h => ⟨?_, ?_, ?_⟩ + · calc b &&& 0xf0 = b &&& (0xf8 &&& 0xf0) := rfl + _ = 0xf0 &&& 0xf0 := by rw [← UInt8.and_assoc, h] + _ ≠ 0xe0 := by decide + · calc b &&& 0xe0 = b &&& (0xf8 &&& 0xe0) := rfl + _ = 0xf0 &&& 0xe0 := by rw [← UInt8.and_assoc, h] + _ ≠ 0xc0 := by decide + · calc b &&& 0x80 = b &&& (0xf8 &&& 0x80) := rfl + _ = 0xf0 &&& 0x80 := by rw [← UInt8.and_assoc, h] + _ ≠ 0 := by decide + +/-! ## `parseFirstByte` BitVec API -/ + +theorem helper (w : Nat) (b : BitVec (w' + w)) (v : BitVec w') : b &&& (BitVec.allOnes w' ++ 0#w) = v ++ 0#w ↔ b.extractLsb' w w' = v := by + have h : b = b.extractLsb' w w' ++ b.extractLsb' 0 w := by + rw [BitVec.extractLsb'_append_extractLsb'_eq_extractLsb'] <;> simp + conv => lhs; rw [h] + rw [BitVec.and_append, BitVec.and_allOnes, BitVec.and_zero, BitVec.append_left_inj] + +theorem helper₂ (w : Nat) (b : BitVec (w' + w)) {v : BitVec w'} (h : b.extractLsb' w w' = v) : b = v ++ b.extractLsb' 0 w := by + rw [← h, BitVec.extractLsb'_append_extractLsb'_eq_extractLsb'] <;> simp + +/-! ### Size one -/ + +theorem parseFirstByte_eq_done_iff_toBitVec : parseFirstByte b = .done ↔ b.toBitVec.extractLsb' 7 1 = 0#1 := by + simp only [parseFirstByte_eq_done_iff, ← UInt8.toBitVec_inj, UInt8.toBitVec_and, + UInt8.toBitVec_ofNat] + exact helper 7 b.toBitVec 0#1 + +theorem toBitVec_eq_of_parseFirstByte_eq_done {b : UInt8} (h : parseFirstByte b = .done) : + b.toBitVec = 0#1 ++ b.toBitVec.setWidth 7 := by + exact helper₂ 7 b.toBitVec (parseFirstByte_eq_done_iff_toBitVec.1 h) + +/-! ### Size two -/ + +theorem parseFirstByte_eq_oneMore_iff_toBitVec : parseFirstByte b = .oneMore ↔ b.toBitVec.extractLsb' 5 3 = 0b110#3 := by + simp only [parseFirstByte_eq_oneMore_iff, ← UInt8.toBitVec_inj, UInt8.toBitVec_and, + UInt8.toBitVec_ofNat] + exact helper 5 b.toBitVec 0b110#3 + +theorem toBitVec_eq_of_parseFirstByte_eq_oneMore {b : UInt8} (h : parseFirstByte b = .oneMore) : + b.toBitVec = 0b110#3 ++ b.toBitVec.setWidth 5 := by + exact helper₂ 5 b.toBitVec (parseFirstByte_eq_oneMore_iff_toBitVec.1 h) + +/-! ### Size three -/ + +theorem parseFirstByte_eq_twoMore_iff_toBitVec : parseFirstByte b = .twoMore ↔ b.toBitVec.extractLsb' 4 4 = 0b1110#4 := by + simp only [parseFirstByte_eq_twoMore_iff, ← UInt8.toBitVec_inj, UInt8.toBitVec_and, + UInt8.toBitVec_ofNat] + exact helper 4 b.toBitVec 0b1110#4 + +theorem toBitVec_eq_of_parseFirstByte_eq_twoMore {b : UInt8} (h : parseFirstByte b = .twoMore) : + b.toBitVec = 0b1110#4 ++ b.toBitVec.setWidth 4 := by + exact helper₂ 4 b.toBitVec (parseFirstByte_eq_twoMore_iff_toBitVec.1 h) + +/-! ### Size four -/ + +theorem parseFirstByte_eq_threeMore_iff_toBitVec : parseFirstByte b = .threeMore ↔ b.toBitVec.extractLsb' 3 5 = 0b11110#5 := by + simp only [parseFirstByte_eq_threeMore_iff, ← UInt8.toBitVec_inj, UInt8.toBitVec_and, + UInt8.toBitVec_ofNat] + exact helper 3 b.toBitVec 0b11110#5 + +theorem toBitVec_eq_of_parseFirstByte_eq_threeMore {b : UInt8} (h : parseFirstByte b = .threeMore) : + b.toBitVec = 0b11110#5 ++ b.toBitVec.setWidth 3 := by + exact helper₂ 3 b.toBitVec (parseFirstByte_eq_threeMore_iff_toBitVec.1 h) + +/-! # `isInvalidContinuationByte` definiton & API -/ + +@[inline, expose] +public def isInvalidContinuationByte (b : UInt8) : Bool := + b &&& 0xc0 != 0x80 + +theorem isInvalidContinutationByte_eq_false_iff {b : UInt8} : + isInvalidContinuationByte b = false ↔ b &&& 0xc0 = 0x80 := by + simp [isInvalidContinuationByte] + +theorem isInvalidContinuationByte_eq_false_iff_toBitVec {b : UInt8} : + isInvalidContinuationByte b = false ↔ b.toBitVec.extractLsb' 6 2 = 0b10#2 := by + simp only [isInvalidContinuationByte, bne_eq_false_iff_eq, ← UInt8.toBitVec_inj, + UInt8.toBitVec_and, UInt8.toBitVec_ofNat] + exact helper 6 b.toBitVec 0b10#2 + +theorem toBitVec_eq_of_isInvalidContinuationByte_eq_false {b : UInt8} (hb : isInvalidContinuationByte b = false) : + b.toBitVec = 0b10#2 ++ b.toBitVec.setWidth 6 := by + exact helper₂ 6 b.toBitVec (isInvalidContinuationByte_eq_false_iff_toBitVec.1 hb) + +/-! # `parseFirstByte`, `isInvalidContinuationByte` and `utf8EncodeChar` -/ + +theorem parseFirstByte_utf8EncodeChar_eq_done {c : Char} (hc : c.utf8Size = 1) : + parseFirstByte ((String.utf8EncodeChar c)[0]'(by simp [Char.utf8Size_pos])) = .done := by + rw [parseFirstByte_eq_done_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_one hc, + BitVec.extractLsb'_append_eq_left] + +theorem parseFirstByte_utf8EncodeChar_eq_oneMore {c : Char} (hc : c.utf8Size = 2) : + parseFirstByte ((String.utf8EncodeChar c)[0]'(by simp [Char.utf8Size_pos])) = .oneMore := by + rw [parseFirstByte_eq_oneMore_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_two hc, + BitVec.extractLsb'_append_eq_left] + +theorem isInvalidContinuationByte_getElem_utf8EncodeChar_one_of_utf8Size_eq_two {c : Char} (hc : c.utf8Size = 2) : + isInvalidContinuationByte ((String.utf8EncodeChar c)[1]'(by simp [String.length_utf8EncodeChar, hc])) = false := by + rw [isInvalidContinuationByte_eq_false_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_two hc, + BitVec.extractLsb'_append_eq_left] + +theorem parseFirstByte_utf8EncodeChar_eq_twoMore {c : Char} (hc : c.utf8Size = 3) : + parseFirstByte ((String.utf8EncodeChar c)[0]'(by simp [Char.utf8Size_pos])) = .twoMore := by + rw [parseFirstByte_eq_twoMore_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_three hc, + BitVec.extractLsb'_append_eq_left] + +theorem isInvalidContinuationByte_getElem_utf8EncodeChar_one_of_utf8Size_eq_three {c : Char} (hc : c.utf8Size = 3) : + isInvalidContinuationByte ((String.utf8EncodeChar c)[1]'(by simp [String.length_utf8EncodeChar, hc])) = false := by + rw [isInvalidContinuationByte_eq_false_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_three hc, + BitVec.extractLsb'_append_eq_left] + +theorem isInvalidContinuationByte_getElem_utf8EncodeChar_two_of_utf8Size_eq_three {c : Char} (hc : c.utf8Size = 3) : + isInvalidContinuationByte ((String.utf8EncodeChar c)[2]'(by simp [String.length_utf8EncodeChar, hc])) = false := by + rw [isInvalidContinuationByte_eq_false_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_two_of_utf8Size_eq_three hc, + BitVec.extractLsb'_append_eq_left] + +theorem parseFirstByte_utf8EncodeChar_eq_threeMore {c : Char} (hc : c.utf8Size = 4) : + parseFirstByte ((String.utf8EncodeChar c)[0]'(by simp [Char.utf8Size_pos])) = .threeMore := by + rw [parseFirstByte_eq_threeMore_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_four hc, + BitVec.extractLsb'_append_eq_left] + +theorem isInvalidContinuationByte_getElem_utf8EncodeChar_one_of_utf8Size_eq_four {c : Char} (hc : c.utf8Size = 4) : + isInvalidContinuationByte ((String.utf8EncodeChar c)[1]'(by simp [String.length_utf8EncodeChar, hc])) = false := by + rw [isInvalidContinuationByte_eq_false_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_four hc, + BitVec.extractLsb'_append_eq_left] + +theorem isInvalidContinuationByte_getElem_utf8EncodeChar_two_of_utf8Size_eq_four {c : Char} (hc : c.utf8Size = 4) : + isInvalidContinuationByte ((String.utf8EncodeChar c)[2]'(by simp [String.length_utf8EncodeChar, hc])) = false := by + rw [isInvalidContinuationByte_eq_false_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_two_of_utf8Size_eq_four hc, + BitVec.extractLsb'_append_eq_left] + +theorem isInvalidContinuationByte_getElem_utf8EncodeChar_three_of_utf8Size_eq_four {c : Char} (hc : c.utf8Size = 4) : + isInvalidContinuationByte ((String.utf8EncodeChar c)[3]'(by simp [String.length_utf8EncodeChar, hc])) = false := by + rw [isInvalidContinuationByte_eq_false_iff_toBitVec, String.toBitVec_getElem_utf8EncodeChar_three_of_utf8Size_eq_four hc, + BitVec.extractLsb'_append_eq_left] + +/-! # `assemble₁` -/ + +theorem helper₅ {w : UInt8} (h : parseFirstByte w = .done) : w < 128 := by + simp only [UInt8.lt_iff_toBitVec_lt, UInt8.toBitVec_ofNat] + rw [toBitVec_eq_of_parseFirstByte_eq_done h] + simp only [Nat.reduceAdd, BitVec.lt_def, BitVec.toNat_ofNat, Nat.reducePow, Nat.reduceMod] + rw [BitVec.toNat_append] + simpa using Nat.mod_lt _ (by decide) + +@[inline, expose] +public def assemble₁ (w : UInt8) (h : parseFirstByte w = .done) : Option Char := + some ⟨w.toUInt32, ?done⟩ +where finally + case done => + have : w < 0x80 := helper₅ h + refine Or.inl ?_ + simp only [UInt8.lt_iff_toNat_lt, UInt8.reduceToNat] at this + simp only [UInt8.toNat_toUInt32] + omega + +theorem toBitVec_val_assemble₁ {w : UInt8} {h} {c : Char} : assemble₁ w h = some c → c.val.toBitVec = w.toBitVec.setWidth 32 := by + simp only [assemble₁, Option.some.injEq] + rintro rfl + rfl + +theorem val_assemble₁_le {w : UInt8} {h} {c : Char} (h' : assemble₁ w h = some c) : c.val ≤ 127 := by + have := toBitVec_val_assemble₁ h' + have hx := helper₅ h + simp only [UInt8.lt_iff_toNat_lt, UInt8.reduceToNat] at hx + rw [UInt32.le_iff_toBitVec_le, this, BitVec.le_def] + simp only [BitVec.toNat_setWidth, UInt8.toNat_toBitVec, Nat.reducePow, UInt8.toNat_mod_uInt32Size, + UInt32.toBitVec_ofNat, BitVec.toNat_ofNat, Nat.reduceMod] + omega + +theorem utf8Size_assemble₁ {w : UInt8} {c : Char} {h} (h : assemble₁ w h = some c) : c.utf8Size = 1 := + Char.utf8Size_eq_one_iff.2 (val_assemble₁_le h) + +theorem assemble₁_eq_some_iff_utf8EncodeChar_eq {w : UInt8} {c : Char} : + (∃ h : parseFirstByte w = .done, assemble₁ w h = some c) ↔ String.utf8EncodeChar c = [w] := by + refine ⟨fun ⟨h₁, h₂⟩ => ?_, ?_⟩ + · have hc := utf8Size_assemble₁ h₂ + simp only [String.utf8EncodeChar_eq_singleton hc, List.cons.injEq, and_true] + simp only [assemble₁, Option.some.injEq] at h₂ + simpa using congrArg (·.val.toUInt8) h₂ |>.symm + · intro h + have h₀ : (String.utf8EncodeChar c).length = 1 := congrArg List.length h + have hc : c.utf8Size = 1 := String.length_utf8EncodeChar _ ▸ h₀ + obtain ⟨rfl⟩ : w = (String.utf8EncodeChar c)[0] := by simp [h] + refine ⟨parseFirstByte_utf8EncodeChar_eq_done hc, ?_⟩ + have : c.val.toNat < 256 := by + simp only [Char.utf8Size_eq_one_iff, UInt32.le_iff_toNat_le, UInt32.reduceToNat] at hc + omega + simpa [String.utf8EncodeChar_eq_singleton hc, assemble₁, Char.ext_iff, ← UInt32.toNat_inj] + +/-! # `assemble₂` -/ + +@[inline, expose] +public def assemble₂Unchecked (w x : UInt8) : UInt32 := + let b₀ : UInt8 := w &&& 0x1f + let b₁ := x &&& 0x3f + (b₀.toUInt32 <<< 6) ||| b₁.toUInt32 + +@[inline, expose] +public def assemble₂ (w x : UInt8) : Option Char := + if isInvalidContinuationByte x then + none + else + let r := assemble₂Unchecked w x + if r < 0x80 then + none -- overlong encodinlg + else + some ⟨r, ?onemore⟩ +where finally + case onemore => + let b₀ : UInt8 := w &&& 0x1f + let b₁ := x &&& 0x3f + have hr : r = (b₀.toUInt32 <<< 6) ||| b₁.toUInt32 := rfl + have hb₀ : b₀ < 0x20 := UInt8.and_lt_add_one (by decide) + have hb₁ : b₁ < 0x40 := UInt8.and_lt_add_one (by decide) + refine Or.inl ?_ + simp only [UInt8.lt_iff_toNat_lt, UInt8.reduceToNat] at hb₀ hb₁ + simp only [hr, UInt32.toNat_or, UInt32.toNat_shiftLeft, UInt8.toNat_toUInt32, UInt32.reduceToNat, + Nat.reduceMod, Nat.reducePow] + rw [Nat.shiftLeft_eq, Nat.mod_eq_of_lt (by omega), Nat.mul_comm, ← Nat.two_pow_add_eq_or_of_lt hb₁] + omega + +theorem helper₃ {x : UInt8} (n : Nat) (hn : n < 8) : + (x &&& UInt8.ofNat (2 ^ n - 1)).toUInt32.toBitVec = (x.toBitVec.setWidth n).setWidth 32 := by + apply BitVec.eq_of_toNat_eq + simp only [UInt8.toUInt32_and, UInt32.toBitVec_and, UInt8.toBitVec_toUInt32, BitVec.toNat_and, + BitVec.toNat_setWidth, UInt8.toNat_toBitVec, Nat.reducePow, UInt8.toNat_mod_uInt32Size, + UInt8.toNat_ofNat'] + have : 2 ^ n < 2 ^ 8 := Nat.pow_lt_pow_right (by decide) hn + rw [Nat.mod_mod_of_dvd' (by decide), Nat.mod_eq_of_lt (by omega), Nat.mod_mod_of_dvd'] + · exact Nat.and_two_pow_sub_one_eq_mod _ _ + · exact Nat.pow_dvd_pow (n := 32) 2 (by omega) + +theorem toBitVec_assemble₂Unchecked {w x : UInt8} : (assemble₂Unchecked w x).toBitVec = (w.toBitVec.setWidth 5 ++ x.toBitVec.setWidth 6).setWidth 32 := by + have hw : (w &&& 31).toUInt32.toBitVec = (w.toBitVec.setWidth 5).setWidth 32 := helper₃ 5 (by decide) + have hx : (x &&& 63).toUInt32.toBitVec = (x.toBitVec.setWidth 6).setWidth 32 := helper₃ 6 (by decide) + rw [assemble₂Unchecked, UInt32.toBitVec_or, UInt32.toBitVec_shiftLeft, hw, hx] + simpa using BitVec.setWidth_append_eq_shiftLeft_setWidth_or.symm + +theorem toBitVec_val_assemble₂ {w x : UInt8} (c : Char) : assemble₂ w x = some c → c.val.toBitVec = (w.toBitVec.setWidth 5 ++ x.toBitVec.setWidth 6).setWidth 32 := by + fun_cases assemble₂ with + | case1 => simp + | case2 => simp + | case3 hx r hr => + simp only [Option.some.injEq, Nat.reduceAdd] + rintro rfl + subst r + simp [toBitVec_assemble₂Unchecked] + +theorem le_val_assemble₂ {w x : UInt8} {c : Char} : assemble₂ w x = some c → 128 ≤ c.val := by + rcases c with ⟨v, hv⟩ + fun_cases assemble₂ with + | case1 => simp + | case2 => simp + | case3 hx r hr => + simp only [Option.some.injEq, Char.mk.injEq] + rintro rfl + exact UInt32.not_lt.1 hr + +theorem val_assemble₂_le {w x : UInt8} {c : Char} (h : assemble₂ w x = some c) : c.val ≤ 2047 := by + rw [UInt32.le_iff_toBitVec_le, toBitVec_val_assemble₂ _ h, BitVec.le_def, BitVec.toNat_setWidth_of_le (by simp)] + exact Nat.le_of_lt_succ (BitVec.toNat_lt_twoPow_of_le (Nat.le_refl _)) + +theorem utf8Size_assemble₂ {w x : UInt8} {c : Char} (h : assemble₂ w x = some c) : c.utf8Size = 2 := + Char.utf8Size_eq_two_iff.2 ⟨le_val_assemble₂ h, val_assemble₂_le h⟩ + +theorem helper₆ {c : Char} {o : Option Char} {b : BitVec 32} (hc : c.val.toBitVec = b) (h' : ∀ d, o = some d → d.val.toBitVec = b) + (hi : o.isSome) : o = some c := by + obtain rfl : c = o.get hi := by + refine Char.ext (UInt32.eq_of_toBitVec_eq ?_) + rw [hc, h' (o.get hi) (by simp)] + simp + +theorem assemble₂_eq_some_of_toBitVec {w x : UInt8} (hx : isInvalidContinuationByte x = false) {c : Char} (hc₀ : 128 ≤ c.val) + (hc : c.val.toBitVec = (w.toBitVec.setWidth 5 ++ x.toBitVec.setWidth 6).setWidth 32) : assemble₂ w x = some c := by + apply helper₆ hc toBitVec_val_assemble₂ + fun_cases assemble₂ with + | case1 => simp_all + | case2 hx r hr => + suffices 128 ≤ r.toNat ∧ r.toNat < 128 by omega + simp_all [← toBitVec_assemble₂Unchecked, r, UInt32.le_iff_toBitVec_le, UInt32.lt_iff_toBitVec_lt, BitVec.le_def, BitVec.lt_def] + | case3 hx r hr => simp + +theorem isInvalidContinuationByte_eq_false_of_assemble₂_eq_some {w x : UInt8} {c : Char} : assemble₂ w x = some c → isInvalidContinuationByte x = false := by + fun_cases assemble₂ with simp_all + +theorem assemble₂_eq_some_iff_utf8EncodeChar_eq {x y : UInt8} {c : Char} : + parseFirstByte x = .oneMore ∧ assemble₂ x y = some c ↔ String.utf8EncodeChar c = [x, y] := by + refine ⟨fun ⟨h₁, h₂⟩ => ?_, ?_⟩ + · have hc := utf8Size_assemble₂ h₂ + rw [(String.utf8EncodeChar c).eq_getElem_of_length_eq_two (by simp [hc])] + simp only [List.cons.injEq, and_true, UInt8.eq_iff_toBitVec_eq] + refine ⟨?_, ?_⟩ + · rw [String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_two hc, + toBitVec_val_assemble₂ _ h₂, BitVec.extractLsb'_setWidth_of_le (by decide), + BitVec.extractLsb'_append_eq_left] + conv => rhs; rw [toBitVec_eq_of_parseFirstByte_eq_oneMore h₁] + · rw [String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_two hc, + toBitVec_val_assemble₂ _ h₂] + rw [BitVec.extractLsb'_setWidth_of_le (by decide)] + conv => rhs; rw [toBitVec_eq_of_isInvalidContinuationByte_eq_false (isInvalidContinuationByte_eq_false_of_assemble₂_eq_some h₂)] + rw [BitVec.extractLsb'_append_eq_right] + · intro h + have hc : c.utf8Size = 2 := String.length_utf8EncodeChar _ ▸ congrArg List.length h + have h₀ : (String.utf8EncodeChar c).length = 2 := congrArg List.length h + obtain ⟨rfl, rfl⟩ : x = (String.utf8EncodeChar c)[0] ∧ y = (String.utf8EncodeChar c)[1] := by simp [h] + refine ⟨parseFirstByte_utf8EncodeChar_eq_oneMore hc, ?_⟩ + apply assemble₂_eq_some_of_toBitVec + · apply isInvalidContinuationByte_getElem_utf8EncodeChar_one_of_utf8Size_eq_two hc + · rw [Char.utf8Size_eq_two_iff] at hc + exact hc.1 + · rw [String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_two hc, + String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_two hc, + BitVec.setWidth_append_eq_right, BitVec.setWidth_append_eq_right, + BitVec.extractLsb'_append_extractLsb'_eq_extractLsb' (by simp), + ← BitVec.setWidth_eq_extractLsb' (by simp), + BitVec.setWidth_setWidth_eq_self] + simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_two_iff.1 hc).2 + +/-! # `assemble₃` -/ + +@[inline, expose] +public def assemble₃Unchecked (w x y : UInt8) : UInt32 := + let b₀ : UInt8 := w &&& 0x0f + let b₁ := x &&& 0x3f + let b₂ := y &&& 0x3f + (b₀.toUInt32 <<< 12) ||| (b₁.toUInt32 <<< 6) ||| b₂.toUInt32 + +@[inline, expose] +public def assemble₃ (w x y : UInt8) : Option Char := + if isInvalidContinuationByte x || isInvalidContinuationByte y then + none + else + let r := assemble₃Unchecked w x y + if r < 0x800 then + none -- overlong encoding + else if hr : 0xd800 ≤ r ∧ r ≤ 0xdfff then + none -- surrogate code point + else + some ⟨r, ?twomore⟩ +where finally + case twomore => + let b₀ : UInt8 := w &&& 0x0f + let b₁ := x &&& 0x3f + let b₂ := y &&& 0x3f + have hb₀ : b₀ < 0x10 := UInt8.and_lt_add_one (by decide) + have hb₁ : b₁ < 0x40 := UInt8.and_lt_add_one (by decide) + have hb₂ : b₂ < 0x40 := UInt8.and_lt_add_one (by decide) + have hr' : r = (b₀.toUInt32 <<< 12) ||| (b₁.toUInt32 <<< 6) ||| b₂.toUInt32 := rfl + simp only [UInt8.lt_iff_toNat_lt, UInt8.reduceToNat] at hb₀ hb₁ hb₂ + simp only [Decidable.not_and_iff_not_or_not, UInt32.not_le] at hr + rcases hr with (hr|hr) + · exact Or.inl hr + · refine Or.inr ⟨hr, ?_⟩ + subst r + simp only [hr', UInt32.toNat_or, UInt32.toNat_shiftLeft, UInt8.toNat_toUInt32, UInt32.reduceToNat, + Nat.reduceMod, Nat.reducePow] + rw [Nat.shiftLeft_eq, Nat.shiftLeft_eq, Nat.mod_eq_of_lt (by omega), Nat.mod_eq_of_lt (by omega), + Nat.mul_comm _ (2 ^ _), Nat.mul_comm _ (2 ^ _), Nat.or_assoc, ← Nat.two_pow_add_eq_or_of_lt (b := b₂.toNat) hb₂, + ← Nat.two_pow_add_eq_or_of_lt (by omega)] + omega + +theorem toBitVec_assemble₃Unchecked {w x y : UInt8} : (assemble₃Unchecked w x y).toBitVec = + (w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6).setWidth 32 := by + have hw : (w &&& 15).toUInt32.toBitVec = (w.toBitVec.setWidth 4).setWidth 32 := helper₃ 4 (by decide) + have hx : (x &&& 63).toUInt32.toBitVec = (x.toBitVec.setWidth 6).setWidth 32 := helper₃ 6 (by decide) + have hy : (y &&& 63).toUInt32.toBitVec = (y.toBitVec.setWidth 6).setWidth 32 := helper₃ 6 (by decide) + rw [assemble₃Unchecked, UInt32.toBitVec_or, UInt32.toBitVec_or, UInt32.toBitVec_shiftLeft, UInt32.toBitVec_shiftLeft, + hw, hx, hy, BitVec.setWidth_append_append_eq_shiftLeft_setWidth_or] + simp + +theorem toBitVec_val_assemble₃ {w x y : UInt8} (c : Char) : assemble₃ w x y = some c → + c.val.toBitVec = (w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6).setWidth 32 := by + fun_cases assemble₃ with + | case1 => simp + | case2 => simp + | case3 => simp + | case4 hx r hr => + simp only [Option.some.injEq, Nat.reduceAdd] + rintro rfl + subst r + simp [toBitVec_assemble₃Unchecked] + +theorem le_val_assemble₃ {w x y : UInt8} {c : Char} : assemble₃ w x y = some c → 0x800 ≤ c.val := by + rcases c with ⟨v, hv⟩ + fun_cases assemble₃ with + | case1 => simp + | case2 => simp + | case3 => simp + | case4 hx r hr => + simp only [Option.some.injEq, Char.mk.injEq] + rintro rfl + exact UInt32.not_lt.1 hr + +theorem val_assemble₃_le {w x y : UInt8} {c : Char} (h : assemble₃ w x y = some c) : c.val ≤ 0xffff := by + rw [UInt32.le_iff_toBitVec_le, toBitVec_val_assemble₃ _ h, BitVec.le_def, BitVec.toNat_setWidth_of_le (by simp)] + exact Nat.le_of_lt_succ (BitVec.toNat_lt_twoPow_of_le (Nat.le_refl _)) + +theorem utf8Size_assemble₃ {w x y : UInt8} {c : Char} (h : assemble₃ w x y = some c) : c.utf8Size = 3 := + Char.utf8Size_eq_three_iff.2 ⟨le_val_assemble₃ h, val_assemble₃_le h⟩ + +theorem assemble₃_eq_some_of_toBitVec {w x y : UInt8} (hx : isInvalidContinuationByte x = false) + (hy : isInvalidContinuationByte y = false) {c : Char} (hc₀ : 0x800 ≤ c.val) + (hc : c.val.toBitVec = (w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6).setWidth 32) : + assemble₃ w x y = some c := by + apply helper₆ hc toBitVec_val_assemble₃ + fun_cases assemble₃ with + | case1 => simp_all + | case2 hx r hr => + suffices 0x800 ≤ (assemble₃Unchecked w x y).toNat ∧ (assemble₃Unchecked w x y).toNat < 0x800 by omega + simp_all [← toBitVec_assemble₃Unchecked, r, UInt32.le_iff_toBitVec_le, UInt32.lt_iff_toBitVec_lt, BitVec.le_def, BitVec.lt_def] + | case3 hx r hr hr' => + have hcv := c.valid + suffices ((55296 ≤ r.toNat ∧ r.toNat ≤ 57343) ∧ (r.toNat < 55296 ∨ (57343 < r.toNat ∧ r.toNat < 1114112))) by omega + simp_all [UInt32.isValidChar, Nat.isValidChar, ← toBitVec_assemble₃Unchecked, r, UInt32.le_iff_toBitVec_le, UInt32.lt_iff_toBitVec_lt, + BitVec.le_def, BitVec.lt_def, ← BitVec.toNat_inj] + | case4 => simp + +theorem isInvalidContinuationByte_eq_false_of_assemble₃_eq_some_left {w x y : UInt8} {c : Char} : assemble₃ w x y = some c → isInvalidContinuationByte x = false := by + fun_cases assemble₃ with simp_all + +theorem isInvalidContinuationByte_eq_false_of_assemble₃_eq_some_right {w x y : UInt8} {c : Char} : assemble₃ w x y = some c → isInvalidContinuationByte y = false := by + fun_cases assemble₃ with simp_all + +theorem assemble₃_eq_some_iff_utf8EncodeChar_eq {w x y : UInt8} {c : Char} : + parseFirstByte w = .twoMore ∧ assemble₃ w x y = some c ↔ String.utf8EncodeChar c = [w, x, y] := by + refine ⟨fun ⟨h₁, h₂⟩ => ?_, ?_⟩ + · have hc := utf8Size_assemble₃ h₂ + rw [(String.utf8EncodeChar c).eq_getElem_of_length_eq_three (by simp [hc])] + simp only [List.cons.injEq, UInt8.eq_iff_toBitVec_eq, and_true] + refine ⟨?_, ?_, ?_⟩ + · rw [String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_three hc, + toBitVec_val_assemble₃ _ h₂, BitVec.extractLsb'_setWidth_of_le (by decide), + BitVec.append_assoc, BitVec.cast_eq, BitVec.extractLsb'_append_eq_left, + ← toBitVec_eq_of_parseFirstByte_eq_twoMore h₁] + · rw [String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_three hc, + toBitVec_val_assemble₃ _ h₂, BitVec.extractLsb'_setWidth_of_le (by decide), + BitVec.extractLsb'_append_eq_of_le (by decide), BitVec.extractLsb'_append_eq_right, + ← toBitVec_eq_of_isInvalidContinuationByte_eq_false (isInvalidContinuationByte_eq_false_of_assemble₃_eq_some_left h₂)] + · rw [String.toBitVec_getElem_utf8EncodeChar_two_of_utf8Size_eq_three hc, + toBitVec_val_assemble₃ _ h₂, BitVec.extractLsb'_setWidth_of_le (by decide), + BitVec.extractLsb'_append_eq_right, + ← toBitVec_eq_of_isInvalidContinuationByte_eq_false (isInvalidContinuationByte_eq_false_of_assemble₃_eq_some_right h₂)] + · intro h + have h₀ : (String.utf8EncodeChar c).length = 3 := congrArg List.length h + have hc : c.utf8Size = 3 := String.length_utf8EncodeChar _ ▸ h₀ + obtain ⟨rfl, rfl, rfl⟩ : w = (String.utf8EncodeChar c)[0] ∧ x = (String.utf8EncodeChar c)[1] ∧ + y = (String.utf8EncodeChar c)[2] := by simp [h] + refine ⟨parseFirstByte_utf8EncodeChar_eq_twoMore hc, ?_⟩ + apply assemble₃_eq_some_of_toBitVec + · apply isInvalidContinuationByte_getElem_utf8EncodeChar_one_of_utf8Size_eq_three hc + · apply isInvalidContinuationByte_getElem_utf8EncodeChar_two_of_utf8Size_eq_three hc + · simp [Char.utf8Size_eq_three_iff, UInt32.lt_iff_toNat_lt] at hc + exact UInt32.le_iff_toNat_le.2 (by simp; omega) + · rw [String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_three hc, + String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_three hc, + String.toBitVec_getElem_utf8EncodeChar_two_of_utf8Size_eq_three hc, + BitVec.setWidth_append_eq_right, BitVec.setWidth_append_eq_right, BitVec.setWidth_append_eq_right, + BitVec.extractLsb'_append_extractLsb'_eq_extractLsb' (by simp), + BitVec.extractLsb'_append_extractLsb'_eq_extractLsb' (by simp), + ← BitVec.setWidth_eq_extractLsb' (by simp), BitVec.setWidth_setWidth_eq_self] + simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_three_iff.1 hc).2 + +/-! # `assemble₄` -/ + +@[inline, expose] +public def assemble₄Unchecked (w x y z : UInt8) : UInt32 := + let b₀ : UInt8 := w &&& 0x07 + let b₁ := x &&& 0x3f + let b₂ := y &&& 0x3f + let b₃ := z &&& 0x3f + (b₀.toUInt32 <<< 18) ||| (b₁.toUInt32 <<< 12) ||| (b₂.toUInt32 <<< 6) ||| b₃.toUInt32 + +@[inline, expose] +public def assemble₄ (w x y z : UInt8) : Option Char := + if isInvalidContinuationByte x || isInvalidContinuationByte y || isInvalidContinuationByte z then + none + else + let r := assemble₄Unchecked w x y z + if h₁ : r < 0x10000 then + none -- overlong encoding + else if h₂ : 0x10ffff < r then + none -- out-of-range code point + else + some ⟨r, ?threemore⟩ +where finally + case threemore => + simp only [UInt32.not_lt, UInt32.le_iff_toNat_le, UInt32.reduceToNat] at h₁ h₂ + exact Or.inr ⟨by omega, by omega⟩ + +theorem toBitVec_assemble₄Unchecked {w x y z : UInt8} : (assemble₄Unchecked w x y z).toBitVec = + (w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6).setWidth 32 := by + have hw : (w &&& 7).toUInt32.toBitVec = (w.toBitVec.setWidth 3).setWidth 32 := helper₃ 3 (by decide) + have hx : (x &&& 63).toUInt32.toBitVec = (x.toBitVec.setWidth 6).setWidth 32 := helper₃ 6 (by decide) + have hy : (y &&& 63).toUInt32.toBitVec = (y.toBitVec.setWidth 6).setWidth 32 := helper₃ 6 (by decide) + have hz : (z &&& 63).toUInt32.toBitVec = (z.toBitVec.setWidth 6).setWidth 32 := helper₃ 6 (by decide) + rw [assemble₄Unchecked, UInt32.toBitVec_or, UInt32.toBitVec_or, UInt32.toBitVec_or, UInt32.toBitVec_shiftLeft, + UInt32.toBitVec_shiftLeft, UInt32.toBitVec_shiftLeft, hw, hx, hy, hz, BitVec.setWidth_append_append_append_eq_shiftLeft_setWidth_or] + simp + +theorem toBitVec_val_assemble₄ {w x y z : UInt8} (c : Char) : assemble₄ w x y z = some c → + c.val.toBitVec = (w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6).setWidth 32 := by + fun_cases assemble₄ with + | case1 => simp + | case2 => simp + | case3 => simp + | case4 hx r hr => + simp only [Option.some.injEq, Nat.reduceAdd] + rintro rfl + subst r + simp [toBitVec_assemble₄Unchecked] + +theorem le_val_assemble₄ {w x y z : UInt8} {c : Char} : assemble₄ w x y z = some c → 0xffff < c.val := by + rcases c with ⟨v, hv⟩ + fun_cases assemble₄ with + | case1 => simp + | case2 => simp + | case3 => simp + | case4 hx r hr => + simp only [Option.some.injEq, Char.mk.injEq] + rintro rfl + have := UInt32.not_lt.1 hr + simp only [UInt32.le_iff_toNat_le, UInt32.reduceToNat] at this + simp only [UInt32.lt_iff_toNat_lt, UInt32.reduceToNat, gt_iff_lt] + omega + +theorem utf8Size_assemble₄ {w x y z : UInt8} {c : Char} (h : assemble₄ w x y z = some c) : c.utf8Size = 4 := + Char.utf8Size_eq_four_iff.2 (le_val_assemble₄ h) + +theorem assemble₄_eq_some_of_toBitVec {w x y z : UInt8} (hx : isInvalidContinuationByte x = false) + (hy : isInvalidContinuationByte y = false) (hz : isInvalidContinuationByte z = false) {c : Char} (hc₀ : 0x10000 ≤ c.val) + (hc : c.val.toBitVec = (w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6).setWidth 32) : + assemble₄ w x y z = some c := by + apply helper₆ hc toBitVec_val_assemble₄ + fun_cases assemble₄ with + | case1 => simp_all + | case2 hx r hr => + suffices 0x10000 ≤ (assemble₄Unchecked w x y z).toNat ∧ (assemble₄Unchecked w x y z).toNat < 0x10000 by omega + simp_all [← toBitVec_assemble₄Unchecked, r, UInt32.le_iff_toBitVec_le, UInt32.lt_iff_toBitVec_lt, BitVec.le_def, BitVec.lt_def] + | case3 hx r hr hr' => + have hcv := c.valid + suffices (1114111 < r.toNat ∧ (r.toNat < 55296 ∨ (57343 < r.toNat ∧ r.toNat < 1114112))) by omega + simp_all [UInt32.isValidChar, Nat.isValidChar, ← toBitVec_assemble₄Unchecked, r, UInt32.le_iff_toBitVec_le, UInt32.lt_iff_toBitVec_lt, + BitVec.le_def, BitVec.lt_def, ← BitVec.toNat_inj] + | case4 => simp + +theorem isInvalidContinuationByte_eq_false_of_assemble₄_eq_some_left {w x y z : UInt8} {c : Char} : assemble₄ w x y z = some c → isInvalidContinuationByte x = false := by + fun_cases assemble₄ with simp_all + +theorem isInvalidContinuationByte_eq_false_of_assemble₄_eq_some_middle {w x y z : UInt8} {c : Char} : assemble₄ w x y z = some c → isInvalidContinuationByte y = false := by + fun_cases assemble₄ with simp_all + +theorem isInvalidContinuationByte_eq_false_of_assemble₄_eq_some_right {w x y z : UInt8} {c : Char} : assemble₄ w x y z = some c → isInvalidContinuationByte z = false := by + fun_cases assemble₄ with simp_all + +theorem assemble₄_eq_some_iff_utf8EncodeChar_eq {w x y z : UInt8} {c : Char} : + parseFirstByte w = .threeMore ∧ assemble₄ w x y z = some c ↔ String.utf8EncodeChar c = [w, x, y, z] := by + refine ⟨fun ⟨h₁, h₂⟩ => ?_, ?_⟩ + · have hc := utf8Size_assemble₄ h₂ + rw [(String.utf8EncodeChar c).eq_getElem_of_length_eq_four (by simp [hc])] + simp only [List.cons.injEq, UInt8.eq_iff_toBitVec_eq, and_true] + refine ⟨?_, ?_, ?_, ?_⟩ + · rw [String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_four hc, + toBitVec_val_assemble₄ _ h₂, BitVec.extractLsb'_setWidth_of_le (by decide), + BitVec.append_assoc, BitVec.cast_eq, BitVec.append_assoc, BitVec.cast_eq, + BitVec.extractLsb'_append_eq_left, + ← toBitVec_eq_of_parseFirstByte_eq_threeMore h₁] + · rw [String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_four hc, + toBitVec_val_assemble₄ _ h₂, BitVec.extractLsb'_setWidth_of_le (by decide), + BitVec.extractLsb'_append_eq_of_le (by decide), BitVec.extractLsb'_append_eq_of_le (by decide), + BitVec.extractLsb'_append_eq_right, + ← toBitVec_eq_of_isInvalidContinuationByte_eq_false (isInvalidContinuationByte_eq_false_of_assemble₄_eq_some_left h₂)] + · rw [String.toBitVec_getElem_utf8EncodeChar_two_of_utf8Size_eq_four hc, + toBitVec_val_assemble₄ _ h₂, BitVec.extractLsb'_setWidth_of_le (by decide), + BitVec.extractLsb'_append_eq_of_le (by decide), BitVec.extractLsb'_append_eq_right, + ← toBitVec_eq_of_isInvalidContinuationByte_eq_false (isInvalidContinuationByte_eq_false_of_assemble₄_eq_some_middle h₂)] + · rw [String.toBitVec_getElem_utf8EncodeChar_three_of_utf8Size_eq_four hc, + toBitVec_val_assemble₄ _ h₂, BitVec.extractLsb'_setWidth_of_le (by decide), + BitVec.extractLsb'_append_eq_right, + ← toBitVec_eq_of_isInvalidContinuationByte_eq_false (isInvalidContinuationByte_eq_false_of_assemble₄_eq_some_right h₂)] + · intro h + have h₀ : (String.utf8EncodeChar c).length = 4 := congrArg List.length h + have hc : c.utf8Size = 4 := String.length_utf8EncodeChar _ ▸ h₀ + obtain ⟨rfl, rfl, rfl, rfl⟩ : + w = (String.utf8EncodeChar c)[0] ∧ x = (String.utf8EncodeChar c)[1] ∧ y = (String.utf8EncodeChar c)[2] + ∧ z = (String.utf8EncodeChar c)[3] := by simp [h] + refine ⟨parseFirstByte_utf8EncodeChar_eq_threeMore hc, ?_⟩ + apply assemble₄_eq_some_of_toBitVec + · apply isInvalidContinuationByte_getElem_utf8EncodeChar_one_of_utf8Size_eq_four hc + · apply isInvalidContinuationByte_getElem_utf8EncodeChar_two_of_utf8Size_eq_four hc + · apply isInvalidContinuationByte_getElem_utf8EncodeChar_three_of_utf8Size_eq_four hc + · simp [Char.utf8Size_eq_four_iff, UInt32.lt_iff_toNat_lt] at hc + exact UInt32.le_iff_toNat_le.2 (by simp; omega) + · rw [String.toBitVec_getElem_utf8EncodeChar_zero_of_utf8Size_eq_four hc, + String.toBitVec_getElem_utf8EncodeChar_one_of_utf8Size_eq_four hc, + String.toBitVec_getElem_utf8EncodeChar_two_of_utf8Size_eq_four hc, + String.toBitVec_getElem_utf8EncodeChar_three_of_utf8Size_eq_four hc, + BitVec.setWidth_append_eq_right, BitVec.setWidth_append_eq_right, BitVec.setWidth_append_eq_right, + BitVec.setWidth_append_eq_right, + BitVec.extractLsb'_append_extractLsb'_eq_extractLsb' (by simp), + BitVec.extractLsb'_append_extractLsb'_eq_extractLsb' (by simp), + BitVec.extractLsb'_append_extractLsb'_eq_extractLsb' (by simp), + ← BitVec.setWidth_eq_extractLsb' (by simp), BitVec.setWidth_setWidth_eq_self] + have := c.toNat_val_le + simp only [Nat.reduceAdd, BitVec.lt_def, UInt32.toNat_toBitVec, BitVec.toNat_twoPow, + Nat.reducePow, Nat.reduceMod, gt_iff_lt] + omega + +end ByteArray.utf8DecodeChar? + +open ByteArray.utf8DecodeChar? + +/- # `utf8DecodeChar?` -/ + +@[inline, expose] +public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char := + if h₀ : i < bytes.size then + match h : parseFirstByte bytes[i] with + | .invalid => none -- invalid first byte + | .done => assemble₁ bytes[i] h + | .oneMore => + if h₁ : i + 1 < bytes.size then + assemble₂ bytes[i] bytes[i + 1] + else + none + | .twoMore => + if h₁ : i + 2 < bytes.size then + assemble₃ bytes[i] bytes[i + 1] bytes[i + 2] + else none + | .threeMore => + if h₁ : i + 3 < bytes.size then + assemble₄ bytes[i] bytes[i + 1] bytes[i + 2] bytes[i + 3] + else none + else none + +/-! # `utf8DecodeChar?` low-level API -/ + +theorem parseFirstByte_eq_done_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char} + (h : b.utf8DecodeChar? i = some c) (hc : c.utf8Size = 1) (h') : + parseFirstByte (b[i]'h') = .done := by + revert h + fun_cases ByteArray.utf8DecodeChar? with + | case1 => simp + | case2 => simp_all + | case3 => exact (by omega) ∘ utf8Size_assemble₂ + | case4 => simp + | case5 => exact (by omega) ∘ utf8Size_assemble₃ + | case6 => simp + | case7 => exact (by omega) ∘ utf8Size_assemble₄ + | case8 => simp + | case9 => simp + +theorem parseFirstByte_eq_oneMore_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char} + (h : b.utf8DecodeChar? i = some c) (hc : c.utf8Size = 2) (h') : + parseFirstByte (b[i]'h') = .oneMore := by + revert h + fun_cases ByteArray.utf8DecodeChar? with + | case1 => simp + | case2 => exact (by omega) ∘ utf8Size_assemble₁ + | case3 => simp_all + | case4 => simp + | case5 => exact (by omega) ∘ utf8Size_assemble₃ + | case6 => simp + | case7 => exact (by omega) ∘ utf8Size_assemble₄ + | case8 => simp + | case9 => simp + +theorem parseFirstByte_eq_twoMore_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char} + (h : b.utf8DecodeChar? i = some c) (hc : c.utf8Size = 3) (h') : + parseFirstByte (b[i]'h') = .twoMore := by + revert h + fun_cases ByteArray.utf8DecodeChar? with + | case1 => simp + | case2 => exact (by omega) ∘ utf8Size_assemble₁ + | case3 => exact (by omega) ∘ utf8Size_assemble₂ + | case4 => simp + | case5 => simp_all + | case6 => simp + | case7 => exact (by omega) ∘ utf8Size_assemble₄ + | case8 => simp + | case9 => simp + +theorem parseFirstByte_eq_threeMore_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char} + (h : b.utf8DecodeChar? i = some c) (hc : c.utf8Size = 4) (h') : + parseFirstByte (b[i]'h') = .threeMore := by + revert h + fun_cases ByteArray.utf8DecodeChar? with + | case1 => simp + | case2 => exact (by omega) ∘ utf8Size_assemble₁ + | case3 => exact (by omega) ∘ utf8Size_assemble₂ + | case4 => simp + | case5 => exact (by omega) ∘ utf8Size_assemble₃ + | case6 => simp + | case7 => simp_all + | case8 => simp + | case9 => simp + +theorem utf8Size_le_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} : + b.utf8DecodeChar? 0 = some c → c.utf8Size ≤ b.size := by + fun_cases ByteArray.utf8DecodeChar? with + | case1 => simp + | case2 => exact (by omega) ∘ utf8Size_assemble₁ + | case3 => exact (by omega) ∘ utf8Size_assemble₂ + | case4 => simp + | case5 => exact (by omega) ∘ utf8Size_assemble₃ + | case6 => simp + | case7 => exact (by omega) ∘ utf8Size_assemble₄ + | case8 => simp + | case9 => simp + +theorem utf8DecodeChar?_eq_assemble₁ {b : ByteArray} (hb : 1 ≤ b.size) (h : parseFirstByte b[0] = .done) : + b.utf8DecodeChar? 0 = assemble₁ b[0] h := by + fun_cases ByteArray.utf8DecodeChar? + all_goals try (simp_all; done) + all_goals omega + +theorem utf8DecodeChar?_eq_assemble₂ {b : ByteArray} (hb : 2 ≤ b.size) (h : parseFirstByte b[0] = .oneMore) : + b.utf8DecodeChar? 0 = assemble₂ b[0] b[1] := by + fun_cases ByteArray.utf8DecodeChar? + all_goals try (simp_all; done) + all_goals omega + +theorem utf8DecodeChar?_eq_assemble₃ {b : ByteArray} (hb : 3 ≤ b.size) (h : parseFirstByte b[0] = .twoMore) : + b.utf8DecodeChar? 0 = assemble₃ b[0] b[1] b[2] := by + fun_cases ByteArray.utf8DecodeChar? + all_goals try (simp_all; done) + all_goals omega + +theorem utf8DecodeChar?_eq_assemble₄ {b : ByteArray} (hb : 4 ≤ b.size) (h : parseFirstByte b[0] = .threeMore) : + b.utf8DecodeChar? 0 = assemble₄ b[0] b[1] b[2] b[3] := by + fun_cases ByteArray.utf8DecodeChar? + all_goals try (simp_all; done) + all_goals omega + +theorem utf8DecodeChar?_append_eq_assemble₁ {l : List UInt8} {b : ByteArray} (hl : l.length = 1) (h : parseFirstByte l[0] = .done) : + (l.toByteArray ++ b).utf8DecodeChar? 0 = assemble₁ l[0] h := by + have : (l.toByteArray ++ b)[0]'(by simp [hl]; omega) = l[0] := by + rw [ByteArray.getElem_append_left (by simp [hl]), List.getElem_toByteArray] + rw [utf8DecodeChar?_eq_assemble₁ (by simp [hl])] <;> simp [this, h] + +theorem utf8DecodeChar?_append_eq_assemble₂ {l : List UInt8} {b : ByteArray} (hl : l.length = 2) (h : parseFirstByte l[0] = .oneMore) : + (l.toByteArray ++ b).utf8DecodeChar? 0 = assemble₂ l[0] l[1] := by + rw [utf8DecodeChar?_eq_assemble₂ (by simp [hl])] + all_goals repeat rw [ByteArray.getElem_append_left (by simp [hl])] + all_goals repeat rw [List.getElem_toByteArray] + assumption + +theorem utf8DecodeChar?_append_eq_assemble₃ {l : List UInt8} {b : ByteArray} (hl : l.length = 3) (h : parseFirstByte l[0] = .twoMore) : + (l.toByteArray ++ b).utf8DecodeChar? 0 = assemble₃ l[0] l[1] l[2] := by + rw [utf8DecodeChar?_eq_assemble₃ (by simp [hl])] + all_goals repeat rw [ByteArray.getElem_append_left (by simp [hl])] + all_goals repeat rw [List.getElem_toByteArray] + assumption + +theorem utf8DecodeChar?_append_eq_assemble₄ {l : List UInt8} {b : ByteArray} (hl : l.length = 4) (h : parseFirstByte l[0] = .threeMore) : + (l.toByteArray ++ b).utf8DecodeChar? 0 = assemble₄ l[0] l[1] l[2] l[3] := by + rw [utf8DecodeChar?_eq_assemble₄ (by simp [hl])] + all_goals repeat rw [ByteArray.getElem_append_left (by simp [hl])] + all_goals repeat rw [List.getElem_toByteArray] + assumption + +/-! +# Main theorems + +`utf8DecodeChar?_utf8EncodeChar_append` and `toByteArray_of_utf8DecodeChar?_eq_some` are the two main results that together +imply that UTF-8 encoding and decoding are inverse. +-/ + +public theorem ByteArray.utf8DecodeChar?_utf8EncodeChar_append {b : ByteArray} {c : Char} : + utf8DecodeChar? ((String.utf8EncodeChar c).toByteArray ++ b) 0 = some c := by + match hc : c.utf8Size, c.utf8Size_pos, c.utf8Size_le_four with + | 1, _, _ => + have hc' : (String.utf8EncodeChar c).length = 1 := String.length_utf8EncodeChar _ ▸ hc + rw [utf8DecodeChar?_append_eq_assemble₁ hc' (parseFirstByte_utf8EncodeChar_eq_done hc)] + exact (assemble₁_eq_some_iff_utf8EncodeChar_eq.2 (List.eq_getElem_of_length_eq_one _ hc')).2 + | 2, _, _ => + have hc' : (String.utf8EncodeChar c).length = 2 := String.length_utf8EncodeChar _ ▸ hc + rw [utf8DecodeChar?_append_eq_assemble₂ hc' (parseFirstByte_utf8EncodeChar_eq_oneMore hc)] + exact (assemble₂_eq_some_iff_utf8EncodeChar_eq.2 (List.eq_getElem_of_length_eq_two _ hc')).2 + | 3, _, _ => + have hc' : (String.utf8EncodeChar c).length = 3 := String.length_utf8EncodeChar _ ▸ hc + rw [utf8DecodeChar?_append_eq_assemble₃ hc' (parseFirstByte_utf8EncodeChar_eq_twoMore hc)] + exact (assemble₃_eq_some_iff_utf8EncodeChar_eq.2 (List.eq_getElem_of_length_eq_three _ hc')).2 + | 4, _, _ => + have hc' : (String.utf8EncodeChar c).length = 4 := String.length_utf8EncodeChar _ ▸ hc + rw [utf8DecodeChar?_append_eq_assemble₄ hc' (parseFirstByte_utf8EncodeChar_eq_threeMore hc)] + exact (assemble₄_eq_some_iff_utf8EncodeChar_eq.2 (List.eq_getElem_of_length_eq_four _ hc')).2 + +public theorem String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : b.utf8DecodeChar? 0 = some c) : + (String.utf8EncodeChar c).toByteArray = b.extract 0 c.utf8Size := by + have := utf8Size_le_of_utf8DecodeChar?_eq_some h + match hc : c.utf8Size, c.utf8Size_pos, c.utf8Size_le_four with + | 1, _, _ => + have := parseFirstByte_eq_done_of_utf8DecodeChar?_eq_some h hc (by omega) + rw [utf8DecodeChar?_eq_assemble₁ (by omega) this] at h + rw [ByteArray.extract_add_one (by omega)] + congr + rw [← assemble₁_eq_some_iff_utf8EncodeChar_eq] + exact ⟨this, h⟩ + | 2, _, _ => + have := parseFirstByte_eq_oneMore_of_utf8DecodeChar?_eq_some h hc (by omega) + rw [utf8DecodeChar?_eq_assemble₂ (by omega) this] at h + rw [ByteArray.extract_add_two (by omega)] + congr + rw [← assemble₂_eq_some_iff_utf8EncodeChar_eq] + exact ⟨this, h⟩ + | 3, _, _ => + have := parseFirstByte_eq_twoMore_of_utf8DecodeChar?_eq_some h hc (by omega) + rw [utf8DecodeChar?_eq_assemble₃ (by omega) this] at h + rw [ByteArray.extract_add_three (by omega)] + congr + rw [← assemble₃_eq_some_iff_utf8EncodeChar_eq] + exact ⟨this, h⟩ + | 4, _, _ => + have := parseFirstByte_eq_threeMore_of_utf8DecodeChar?_eq_some h hc (by omega) + rw [utf8DecodeChar?_eq_assemble₄ (by omega) this] at h + rw [ByteArray.extract_add_four (by omega)] + congr + rw [← assemble₄_eq_some_iff_utf8EncodeChar_eq] + exact ⟨this, h⟩ + +/-! # Corollaries -/ + +public theorem ByteArray.eq_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : utf8DecodeChar? b 0 = some c) : + b = (String.utf8EncodeChar c).toByteArray ++ b.extract c.utf8Size b.size := by + rw [String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some h, + ByteArray.extract_append_extract, Nat.zero_min, Nat.max_eq_right (utf8Size_le_of_utf8DecodeChar?_eq_some h), + ByteArray.extract_zero_size] + +public theorem ByteArray.exists_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : utf8DecodeChar? b 0 = some c) : + ∃ l, b = (String.utf8EncodeChar c).toByteArray ++ l := + ⟨b.extract c.utf8Size b.size, eq_of_utf8DecodeChar?_eq_some h⟩ + +public theorem ByteArray.utf8DecodeChar?_eq_utf8DecodeChar?_extract {b : ByteArray} {i : Nat} : + utf8DecodeChar? b i = utf8DecodeChar? (b.extract i b.size) 0 := by + simp [utf8DecodeChar?] + have h₁ : i < b.size ↔ 0 < b.size - i := by omega + have h₂ : i + 1 < b.size ↔ 1 < b.size - i := by omega + have h₃ : i + 2 < b.size ↔ 2 < b.size - i := by omega + have h₄ : i + 3 < b.size ↔ 3 < b.size - i := by omega + have h₅ : ∀ h, b[i]'h = (b.extract i b.size)[0]'(by simp; omega) := by simp [ByteArray.getElem_extract] + have h₆ : ∀ h, b[i + 1]'h = (b.extract i b.size)[1]'(by simp; omega) := by simp [ByteArray.getElem_extract] + have h₇ : ∀ h, b[i + 2]'h = (b.extract i b.size)[2]'(by simp; omega) := by simp [ByteArray.getElem_extract] + have h₈ : ∀ h, b[i + 3]'h = (b.extract i b.size)[3]'(by simp; omega) := by simp [ByteArray.getElem_extract] + have h₉ : (b.extract i b.size).size = b.size - i := by simp + simp only [h₁] + split + · split + all_goals (rename_i h h'; simp only [h₅] at h') + · split <;> simp_all + · split <;> rename_i h'' + all_goals try (simp [h'] at h''; done) + simp [h₅] + · symm + split <;> rename_i h'' + all_goals try (simp [h'] at h''; done) + simp [h₂, h₅, h₆] + · symm + split <;> rename_i h'' + all_goals try (simp [h'] at h''; done) + simp [h₃, h₅, h₆, h₇] + · symm + split <;> rename_i h'' + all_goals try (simp [h'] at h''; done) + simp [h₄, h₅, h₆, h₇, h₈] + · rfl + +public theorem ByteArray.le_size_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char} + (h : utf8DecodeChar? b i = some c) : i + c.utf8Size ≤ b.size := by + rw [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h + obtain ⟨l, hl⟩ := exists_of_utf8DecodeChar?_eq_some h + replace hl := congrArg ByteArray.size hl + simp at hl + have hi : i < b.size := by + simp [utf8DecodeChar?] at h + obtain ⟨h, -⟩ := h + omega + omega + +public theorem ByteArray.lt_size_of_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} (h : (utf8DecodeChar? b i).isSome) : + i < b.size := by + obtain ⟨c, hc⟩ := Option.isSome_iff_exists.1 h + have := le_size_of_utf8DecodeChar?_eq_some hc + have := c.utf8Size_pos + omega + +public theorem ByteArray.utf8DecodeChar?_append_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : utf8DecodeChar? b i = some c) + (b' : ByteArray) : utf8DecodeChar? (b ++ b') i = some c := by + have := le_size_of_utf8DecodeChar?_eq_some h + rw [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at ⊢ h + rw [ByteArray.extract_eq_extract_append_extract b.size (by omega) (by simp), ByteArray.extract_append_size_left, + eq_of_utf8DecodeChar?_eq_some h, ByteArray.append_assoc, utf8DecodeChar?_utf8EncodeChar_append] + +public theorem ByteArray.isSome_utf8DecodeChar?_append {b : ByteArray} {i : Nat} (h : (utf8DecodeChar? b i).isSome) + (b' : ByteArray) : (utf8DecodeChar? (b ++ b') i).isSome := by + obtain ⟨c, hc⟩ := Option.isSome_iff_exists.1 h + rw [utf8DecodeChar?_append_eq_some hc, Option.isSome_some] + +public def ByteArray.utf8DecodeChar (bytes : ByteArray) (i : Nat) (h : (utf8DecodeChar? bytes i).isSome) : Char := + (utf8DecodeChar? bytes i).get h + +public theorem ByteArray.utf8DecodeChar_eq_utf8DecodeChar_extract {b : ByteArray} {i : Nat} {h} : + utf8DecodeChar b i h = + utf8DecodeChar (b.extract i b.size) 0 (by rwa [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h) := by + simp [utf8DecodeChar, ← utf8DecodeChar?_eq_utf8DecodeChar?_extract] + +theorem ByteArray.utf8DecodeChar_extract_congr_of_le {bytes : ByteArray} (i j j' : Nat) {h h'} (hj : j ≤ j') : + utf8DecodeChar (bytes.extract i j) 0 h = utf8DecodeChar (bytes.extract i j') 0 h' := by + obtain ⟨c, hc⟩ := Option.isSome_iff_exists.1 h + obtain ⟨c', hc'⟩ := Option.isSome_iff_exists.1 h' + have hij : i ≤ j := by + have := lt_size_of_isSome_utf8DecodeChar? h + simp only [ByteArray.size_extract] at this + omega + have h₀ := eq_of_utf8DecodeChar?_eq_some hc' + simp only [utf8DecodeChar, hc, Option.get_some, hc', ← Option.some_inj (a := c)] + have := utf8DecodeChar?_append_eq_some hc (bytes.extract j j') + rw [← ByteArray.extract_eq_extract_append_extract j hij hj] at this + rw [← this, hc'] + +public theorem ByteArray.utf8DecodeChar_extract_congr {bytes : ByteArray} (i j j' : Nat) {h h'} : + utf8DecodeChar (bytes.extract i j) 0 h = utf8DecodeChar (bytes.extract i j') 0 h' := by + obtain (hj|hj) := Nat.le_or_le j j' + · exact utf8DecodeChar_extract_congr_of_le _ _ _ hj + · exact (utf8DecodeChar_extract_congr_of_le _ _ _ hj).symm + +public theorem ByteArray.utf8EncodeChar_utf8DecodeChar {b : ByteArray} {i : Nat} {h} : + (String.utf8EncodeChar (utf8DecodeChar b i h)).toByteArray = b.extract i (i + (utf8DecodeChar b i h).utf8Size) := by + rw [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h + obtain ⟨c, hc⟩ := Option.isSome_iff_exists.1 h + rw [utf8DecodeChar_eq_utf8DecodeChar_extract, utf8DecodeChar, + String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some (b := b.extract i b.size) (by simp), + ByteArray.extract_extract, Nat.add_zero, Nat.min_eq_left] + exact le_size_of_utf8DecodeChar?_eq_some (by simp [← utf8DecodeChar?_eq_utf8DecodeChar?_extract]) + +@[simp] +public theorem List.utf8DecodeChar?_utf8Encode_singleton_append {b : ByteArray} {c : Char} : + ByteArray.utf8DecodeChar? ([c].utf8Encode ++ b) 0 = some c := by + rw [List.utf8Encode, List.flatMap_cons, List.toByteArray_append, + List.flatMap_nil, List.toByteArray_nil, ByteArray.append_empty, + ByteArray.utf8DecodeChar?_utf8EncodeChar_append] + +@[simp] +public theorem List.utf8DecodeChar?_utf8Encode_singleton {c : Char} : + [c].utf8Encode.utf8DecodeChar? 0 = some c := by + simpa using utf8DecodeChar?_utf8Encode_singleton_append (b := ByteArray.empty) + +@[simp] +public theorem List.utf8DecodeChar?_utf8Encode_cons {l : List Char} {c : Char} : + ByteArray.utf8DecodeChar? (c::l).utf8Encode 0 = some c := by + rw [List.utf8Encode, List.flatMap_cons, List.toByteArray_append, + ByteArray.utf8DecodeChar?_utf8EncodeChar_append] + +@[simp] +public theorem List.utf8DecodeChar_utf8Encode_cons {l : List Char} {c : Char} {h} : + ByteArray.utf8DecodeChar (c::l).utf8Encode 0 h = c := by + simp [ByteArray.utf8DecodeChar] diff --git a/src/Init/Data/String/Extra.lean b/src/Init/Data/String/Extra.lean index 15c06af6c0..b96b36fb23 100644 --- a/src/Init/Data/String/Extra.lean +++ b/src/Init/Data/String/Extra.lean @@ -133,92 +133,15 @@ the corresponding string, or panics if the array is not a valid UTF-8 encoding o @[inline] def fromUTF8! (a : ByteArray) : String := if h : validateUTF8 a then fromUTF8 a h else panic! "invalid UTF-8 string" -/-- -Returns the sequence of bytes in a character's UTF-8 encoding. --/ -def utf8EncodeCharFast (c : Char) : List UInt8 := - let v := c.val - if v ≤ 0x7f then - [v.toUInt8] - else if v ≤ 0x7ff then - [(v >>> 6).toUInt8 &&& 0x1f ||| 0xc0, - v.toUInt8 &&& 0x3f ||| 0x80] - else if v ≤ 0xffff then - [(v >>> 12).toUInt8 &&& 0x0f ||| 0xe0, - (v >>> 6).toUInt8 &&& 0x3f ||| 0x80, - v.toUInt8 &&& 0x3f ||| 0x80] - else - [(v >>> 18).toUInt8 &&& 0x07 ||| 0xf0, - (v >>> 12).toUInt8 &&& 0x3f ||| 0x80, - (v >>> 6).toUInt8 &&& 0x3f ||| 0x80, - v.toUInt8 &&& 0x3f ||| 0x80] - -private theorem Nat.add_two_pow_eq_or_of_lt {b : Nat} (i : Nat) (b_lt : b < 2 ^ i) (a : Nat) : - b + 2 ^ i * a = b ||| 2 ^ i * a := by - rw [Nat.add_comm, Nat.or_comm, Nat.two_pow_add_eq_or_of_lt b_lt] - -@[csimp] -theorem utf8EncodeChar_eq_utf8EncodeCharFast : @utf8EncodeChar = @utf8EncodeCharFast := by - funext c - simp only [utf8EncodeChar, utf8EncodeCharFast, UInt8.ofNat_uInt32ToNat, UInt8.ofNat_add, - UInt8.reduceOfNat, UInt32.le_iff_toNat_le, UInt32.reduceToNat] - split - · rfl - · split - · simp only [List.cons.injEq, ← UInt8.toNat_inj, UInt8.toNat_add, UInt8.toNat_ofNat', - Nat.reducePow, UInt8.reduceToNat, Nat.mod_add_mod, UInt8.toNat_or, UInt8.toNat_and, - UInt32.toNat_toUInt8, UInt32.toNat_shiftRight, UInt32.reduceToNat, Nat.reduceMod, and_true] - refine ⟨?_, ?_⟩ - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 5 (by omega) 6, - Nat.and_two_pow_sub_one_eq_mod _ 5, Nat.shiftRight_eq_div_pow, - Nat.mod_eq_of_lt (b := 256) (by omega)] - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, - Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.mod_mod_of_dvd _ (by decide)] - · split - · simp only [List.cons.injEq, ← UInt8.toNat_inj, UInt8.toNat_add, UInt8.toNat_ofNat', - Nat.reducePow, UInt8.reduceToNat, Nat.mod_add_mod, UInt8.toNat_or, UInt8.toNat_and, - UInt32.toNat_toUInt8, UInt32.toNat_shiftRight, UInt32.reduceToNat, Nat.reduceMod, and_true] - refine ⟨?_, ?_, ?_⟩ - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 4 (by omega) 14, - Nat.and_two_pow_sub_one_eq_mod _ 4, Nat.shiftRight_eq_div_pow, - Nat.mod_eq_of_lt (b := 256) (by omega)] - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, - Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.shiftRight_eq_div_pow, - Nat.mod_mod_of_dvd (c.val.toNat / 2 ^ 6) (by decide)] - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, - Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.mod_mod_of_dvd c.val.toNat (by decide)] - · simp only [List.cons.injEq, ← UInt8.toNat_inj, UInt8.toNat_add, UInt8.toNat_ofNat', - Nat.reducePow, UInt8.reduceToNat, Nat.mod_add_mod, UInt8.toNat_or, UInt8.toNat_and, - UInt32.toNat_toUInt8, UInt32.toNat_shiftRight, UInt32.reduceToNat, Nat.reduceMod, and_true] - refine ⟨?_, ?_, ?_, ?_⟩ - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 3 (by omega) 30, - Nat.and_two_pow_sub_one_eq_mod _ 3, Nat.shiftRight_eq_div_pow, - Nat.mod_mod_of_dvd _ (by decide)] - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, - Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.shiftRight_eq_div_pow, - Nat.mod_mod_of_dvd (c.val.toNat / 2 ^ 12) (by decide)] - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, - Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.shiftRight_eq_div_pow, - Nat.mod_mod_of_dvd (c.val.toNat / 2 ^ 6) (by decide)] - · rw [Nat.mod_eq_of_lt (by omega), Nat.add_two_pow_eq_or_of_lt 6 (by omega) 2, - Nat.and_two_pow_sub_one_eq_mod _ 6, Nat.mod_mod_of_dvd c.val.toNat (by decide)] - -@[simp] theorem length_utf8EncodeChar (c : Char) : (utf8EncodeChar c).length = c.utf8Size := by - simp [Char.utf8Size, utf8EncodeChar_eq_utf8EncodeCharFast, utf8EncodeCharFast] - cases Decidable.em (c.val ≤ 0x7f) <;> simp [*] - cases Decidable.em (c.val ≤ 0x7ff) <;> simp [*] - cases Decidable.em (c.val ≤ 0xffff) <;> simp [*] - /-- Encodes a string in UTF-8 as an array of bytes. -/ @[extern "lean_string_to_utf8"] def toUTF8 (a : @& String) : ByteArray := - ⟨⟨a.data.flatMap utf8EncodeChar⟩⟩ + a.bytes @[simp] theorem size_toUTF8 (s : String) : s.toUTF8.size = s.utf8ByteSize := by - simp [toUTF8, ByteArray.size, Array.size, utf8ByteSize, List.flatMap] - induction s.data <;> simp [List.map, utf8ByteSize.go, Nat.add_comm, *] + rfl /-- Accesses the indicated byte in the UTF-8 encoding of a string. diff --git a/src/Init/Data/UInt/Bitwise.lean b/src/Init/Data/UInt/Bitwise.lean index a1116fe27d..31853efe5f 100644 --- a/src/Init/Data/UInt/Bitwise.lean +++ b/src/Init/Data/UInt/Bitwise.lean @@ -1327,3 +1327,14 @@ theorem UInt64.right_le_or {a b : UInt64} : b ≤ a ||| b := by simpa [UInt64.le_iff_toNat_le] using Nat.right_le_or theorem USize.right_le_or {a b : USize} : b ≤ a ||| b := by simpa [USize.le_iff_toNat_le] using Nat.right_le_or + +theorem UInt8.and_lt_add_one {b c : UInt8} (h : c ≠ -1) : b &&& c < c + 1 := + UInt8.lt_of_le_of_lt UInt8.and_le_right (UInt8.lt_add_one h) +theorem UInt16.and_lt_add_one {b c : UInt16} (h : c ≠ -1) : b &&& c < c + 1 := + UInt16.lt_of_le_of_lt UInt16.and_le_right (UInt16.lt_add_one h) +theorem UInt32.and_lt_add_one {b c : UInt32} (h : c ≠ -1) : b &&& c < c + 1 := + UInt32.lt_of_le_of_lt UInt32.and_le_right (UInt32.lt_add_one h) +theorem UInt64.and_lt_add_one {b c : UInt64} (h : c ≠ -1) : b &&& c < c + 1 := + UInt64.lt_of_le_of_lt UInt64.and_le_right (UInt64.lt_add_one h) +theorem USize.and_lt_add_one {b c : USize} (h : c ≠ -1) : b &&& c < c + 1 := + USize.lt_of_le_of_lt USize.and_le_right (USize.lt_add_one h) diff --git a/src/Init/Data/UInt/Lemmas.lean b/src/Init/Data/UInt/Lemmas.lean index 7f58667f95..da178dc85d 100644 --- a/src/Init/Data/UInt/Lemmas.lean +++ b/src/Init/Data/UInt/Lemmas.lean @@ -3163,3 +3163,15 @@ protected theorem USize.sub_lt {a b : USize} (hb : 0 < b) (hab : b ≤ a) : a - rw [lt_iff_toNat_lt, USize.toNat_sub_of_le _ _ hab] refine Nat.sub_lt ?_ (USize.lt_iff_toNat_lt.1 hb) exact USize.lt_iff_toNat_lt.1 (USize.lt_of_lt_of_le hb hab) + +theorem UInt8.lt_add_one {c : UInt8} (h : c ≠ -1) : c < c + 1 := + UInt8.lt_iff_toBitVec_lt.2 (BitVec.lt_add_one (by simpa [← UInt8.toBitVec_inj] using h)) +theorem UInt16.lt_add_one {c : UInt16} (h : c ≠ -1) : c < c + 1 := + UInt16.lt_iff_toBitVec_lt.2 (BitVec.lt_add_one (by simpa [← UInt16.toBitVec_inj] using h)) +theorem UInt32.lt_add_one {c : UInt32} (h : c ≠ -1) : c < c + 1 := + UInt32.lt_iff_toBitVec_lt.2 (BitVec.lt_add_one (by simpa [← UInt32.toBitVec_inj] using h)) +theorem UInt64.lt_add_one {c : UInt64} (h : c ≠ -1) : c < c + 1 := + UInt64.lt_iff_toBitVec_lt.2 (BitVec.lt_add_one (by simpa [← UInt64.toBitVec_inj] using h)) +theorem USize.lt_add_one {c : USize} (h : c ≠ -1) : c < c + 1 := + USize.lt_iff_toBitVec_lt.2 (BitVec.lt_add_one + (by simpa [← USize.toBitVec_inj, BitVec.neg_one_eq_allOnes] using h)) diff --git a/src/Init/Prelude.lean b/src/Init/Prelude.lean index c0f4cf7ab3..f3e9bda653 100644 --- a/src/Init/Prelude.lean +++ b/src/Init/Prelude.lean @@ -3010,218 +3010,56 @@ def List.concat {α : Type u} : List α → α → List α | cons a as, b => cons a (concat as b) /-- -Returns the sequence of bytes in a character's UTF-8 encoding. +Appends two lists. Normally used via the `++` operator. + +Appending lists takes time proportional to the length of the first list: `O(|xs|)`. + +Examples: + * `[1, 2, 3] ++ [4, 5] = [1, 2, 3, 4, 5]`. + * `[] ++ [4, 5] = [4, 5]`. + * `[1, 2, 3] ++ [] = [1, 2, 3]`. -/ -def String.utf8EncodeChar (c : Char) : List UInt8 := - let v := c.val.toNat - ite (LE.le v 0x7f) - (List.cons (UInt8.ofNat v) List.nil) - (ite (LE.le v 0x7ff) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 64) 0x20) 0xc0)) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod v 0x40) 0x80)) - List.nil)) - (ite (LE.le v 0xffff) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 4096) 0x10) 0xe0)) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 64) 0x40) 0x80)) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod v 0x40) 0x80)) - List.nil))) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 262144) 0x08) 0xf0)) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 4096) 0x40) 0x80)) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 64) 0x40) 0x80)) - (List.cons - (UInt8.ofNat (HAdd.hAdd (HMod.hMod v 0x40) 0x80)) - List.nil)))))) +protected def List.append : (xs ys : List α) → List α + | nil, bs => bs + | cons a as, bs => cons a (List.append as bs) /-- -A string is a sequence of Unicode code points. +Concatenates a list of lists into a single list, preserving the order of the elements. -At runtime, strings are represented by [dynamic arrays](https://en.wikipedia.org/wiki/Dynamic_array) -of bytes using the UTF-8 encoding. Both the size in bytes (`String.utf8ByteSize`) and in characters -(`String.length`) are cached and take constant time. Many operations on strings perform in-place -modifications when the reference to the string is unique. +`O(|flatten L|)`. + +Examples: +* `[["a"], ["b", "c"]].flatten = ["a", "b", "c"]` +* `[["a"], [], ["b", "c"], ["d", "e", "f"]].flatten = ["a", "b", "c", "d", "e", "f"]` -/ -structure String where - /-- Pack a `List Char` into a `String`. This function is overridden by the - compiler and is O(n) in the length of the list. -/ - mk :: - /-- Unpack `String` into a `List Char`. This function is overridden by the - compiler and is O(n) in the length of the list. -/ - data : List Char - -attribute [extern "lean_string_mk"] String.mk -attribute [extern "lean_string_data"] String.data +def List.flatten : List (List α) → List α + | nil => nil + | cons l L => List.append l (flatten L) /-- -Decides whether two strings are equal. Normally used via the `DecidableEq String` instance and the -`=` operator. +Applies a function to each element of the list, returning the resulting list of values. -At runtime, this function is overridden with an efficient native implementation. +`O(|l|)`. + +Examples: +* `[a, b, c].map f = [f a, f b, f c]` +* `[].map Nat.succ = []` +* `["one", "two", "three"].map (·.length) = [3, 3, 5]` +* `["one", "two", "three"].map (·.reverse) = ["eno", "owt", "eerht"]` -/ -@[extern "lean_string_dec_eq"] -def String.decEq (s₁ s₂ : @& String) : Decidable (Eq s₁ s₂) := - match s₁, s₂ with - | ⟨s₁⟩, ⟨s₂⟩ => - dite (Eq s₁ s₂) (fun h => isTrue (congrArg _ h)) (fun h => isFalse (fun h' => String.noConfusion h' (fun h' => absurd h' h))) - -instance : DecidableEq String := String.decEq +@[specialize] def List.map (f : α → β) : (l : List α) → List β + | nil => nil + | cons a as => cons (f a) (map f as) /-- -A byte position in a `String`, according to its UTF-8 encoding. +Applies a function that returns a list to each element of a list, and concatenates the resulting +lists. -Character positions (counting the Unicode code points rather than bytes) are represented by plain -`Nat`s. Indexing a `String` by a `String.Pos` takes constant time, while character positions need to -be translated internally to byte positions, which takes linear time. - -A byte position `p` is *valid* for a string `s` if `0 ≤ p ≤ s.endPos` and `p` lies on a UTF-8 -character boundary. +Examples: +* `[2, 3, 2].flatMap List.range = [0, 1, 0, 1, 2, 0, 1]` +* `["red", "blue"].flatMap String.toList = ['r', 'e', 'd', 'b', 'l', 'u', 'e']` -/ -structure String.Pos where - /-- Get the underlying byte index of a `String.Pos` -/ - byteIdx : Nat := 0 - -instance : Inhabited String.Pos where - default := {} - -instance : DecidableEq String.Pos := - fun ⟨a⟩ ⟨b⟩ => match decEq a b with - | isTrue h => isTrue (h ▸ rfl) - | isFalse h => isFalse (fun he => String.Pos.noConfusion he fun he => absurd he h) - -/-- -A region or slice of some underlying string. - -A substring contains an string together with the start and end byte positions of a region of -interest. Actually extracting a substring requires copying and memory allocation, while many -substrings of the same underlying string may exist with very little overhead, and they are more -convenient than tracking the bounds by hand. - -Using its constructor explicitly, it is possible to construct a `Substring` in which one or both of -the positions is invalid for the string. Many operations will return unexpected or confusing results -if the start and stop positions are not valid. Instead, it's better to use API functions that ensure -the validity of the positions in a substring to create and manipulate them. --/ -structure Substring where - /-- The underlying string. -/ - str : String - /-- The byte position of the start of the string slice. -/ - startPos : String.Pos - /-- The byte position of the end of the string slice. -/ - stopPos : String.Pos - -instance : Inhabited Substring where - default := ⟨"", {}, {}⟩ - -/-- -The number of bytes used by the string's UTF-8 encoding. --/ -@[inline, expose] def Substring.bsize : Substring → Nat - | ⟨_, b, e⟩ => e.byteIdx.sub b.byteIdx - -/-- -The number of bytes used by the string's UTF-8 encoding. - -At runtime, this function takes constant time because the byte length of strings is cached. --/ -@[extern "lean_string_utf8_byte_size"] -def String.utf8ByteSize : (@& String) → Nat - | ⟨s⟩ => go s -where - go : List Char → Nat - | .nil => 0 - | .cons c cs => hAdd (go cs) c.utf8Size - -/-- -A UTF-8 byte position that points at the end of a string, just after the last character. - -* `"abc".endPos = ⟨3⟩` -* `"L∃∀N".endPos = ⟨8⟩` --/ -@[inline] def String.endPos (s : String) : String.Pos where - byteIdx := utf8ByteSize s - -/-- -Converts a `String` into a `Substring` that denotes the entire string. --/ -@[inline] def String.toSubstring (s : String) : Substring where - str := s - startPos := {} - stopPos := s.endPos - -/-- -Converts a `String` into a `Substring` that denotes the entire string. - -This is a version of `String.toSubstring` that doesn't have an `@[inline]` annotation. --/ -def String.toSubstring' (s : String) : Substring := - s.toSubstring - -/-- -This function will cast a value of type `α` to type `β`, and is a no-op in the -compiler. This function is **extremely dangerous** because there is no guarantee -that types `α` and `β` have the same data representation, and this can lead to -memory unsafety. It is also logically unsound, since you could just cast -`True` to `False`. For all those reasons this function is marked as `unsafe`. - -It is implemented by lifting both `α` and `β` into a common universe, and then -using `cast (lcProof : ULift (PLift α) = ULift (PLift β))` to actually perform -the cast. All these operations are no-ops in the compiler. - -Using this function correctly requires some knowledge of the data representation -of the source and target types. Some general classes of casts which are safe in -the current runtime: - -* `Array α` to `Array β` where `α` and `β` have compatible representations, - or more generally for other inductive types. -* `Quot α r` and `α`. -* `@Subtype α p` and `α`, or generally any structure containing only one - non-`Prop` field of type `α`. -* Casting `α` to/from `NonScalar` when `α` is a boxed generic type - (i.e. a function that accepts an arbitrary type `α` and is not specialized to - a scalar type like `UInt8`). --/ -unsafe def unsafeCast {α : Sort u} {β : Sort v} (a : α) : β := - PLift.down (ULift.down.{max u v} (cast lcProof (ULift.up.{max u v} (PLift.up a)))) - - -/-- Auxiliary definition for `panic`. -/ -/- -This is a workaround for `panic` occurring in monadic code. See issue #695. -The `panicCore` definition cannot be specialized since it is an extern. -When `panic` occurs in monadic code, the `Inhabited α` parameter depends on a -`[inst : Monad m]` instance. The `inst` parameter will not be eliminated during -specialization if it occurs inside of a binder (to avoid work duplication), and -will prevent the actual monad from being "copied" to the code being specialized. -When we reimplement the specializer, we may consider copying `inst` if it also -occurs outside binders or if it is an instance. --/ -@[never_extract, extern "lean_panic_fn"] -def panicCore {α : Sort u} [Inhabited α] (msg : String) : α := default - -/-- -`(panic "msg" : α)` has a built-in implementation which prints `msg` to -the error buffer. It *does not* terminate execution, and because it is a safe -function, it still has to return an element of `α`, so it takes `[Inhabited α]` -and returns `default`. It is primarily intended for debugging in pure contexts, -and assertion failures. - -Because this is a pure function with side effects, it is marked as -`@[never_extract]` so that the compiler will not perform common sub-expression -elimination and other optimizations that assume that the expression is pure. --/ -@[noinline, never_extract] -def panic {α : Sort u} [Inhabited α] (msg : String) : α := - panicCore msg - --- TODO: this be applied directly to `Inhabited`'s definition when we remove the above workaround -attribute [nospecialize] Inhabited +@[inline] def List.flatMap {α : Type u} {β : Type v} (b : α → List β) (as : List α) : List β := flatten (map b as) /-- `Array α` is the type of [dynamic arrays](https://en.wikipedia.org/wiki/Dynamic_array) with elements @@ -3452,6 +3290,276 @@ def Array.extract (as : Array α) (start : Nat := 0) (stop : Nat := as.size) : A let sz' := Nat.sub (min stop as.size) start loop sz' start (emptyWithCapacity sz') +/-- `ByteArray` is like `Array UInt8`, but with an efficient run-time representation as a packed +byte buffer. -/ +structure ByteArray where + /-- The data contained in the byte array. -/ + data : Array UInt8 + +attribute [extern "lean_byte_array_mk"] ByteArray.mk +attribute [extern "lean_byte_array_data"] ByteArray.data + +/-- +Constructs a new empty byte array with initial capacity `c`. +-/ +@[extern "lean_mk_empty_byte_array"] +def ByteArray.emptyWithCapacity (c : @& Nat) : ByteArray := + { data := Array.empty } + +/-- +Constructs a new empty byte array with initial capacity `0`. + +Use `ByteArray.emptyWithCapacity` to create an array with a greater initial capacity. +-/ +def ByteArray.empty : ByteArray := emptyWithCapacity 0 + +/-- +Adds an element to the end of an array. The resulting array's size is one greater than the input +array. If there are no other references to the array, then it is modified in-place. + +This takes amortized `O(1)` time because `Array α` is represented by a dynamic array. +-/ +@[extern "lean_byte_array_push"] +def ByteArray.push : ByteArray → UInt8 → ByteArray + | ⟨bs⟩, b => ⟨bs.push b⟩ + +/-- +Converts a list of bytes into a `ByteArray`. +-/ +def List.toByteArray (bs : List UInt8) : ByteArray := + let rec loop + | nil, r => r + | cons b bs, r => loop bs (r.push b) + loop bs ByteArray.empty + +/-- Returns the size of the byte array. -/ +@[extern "lean_byte_array_size"] +def ByteArray.size : (@& ByteArray) → Nat + | ⟨bs⟩ => bs.size + +/-- +Returns the sequence of bytes in a character's UTF-8 encoding. +-/ +def String.utf8EncodeChar (c : Char) : List UInt8 := + let v := c.val.toNat + ite (LE.le v 0x7f) + (List.cons (UInt8.ofNat v) List.nil) + (ite (LE.le v 0x7ff) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 64) 0x20) 0xc0)) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod v 0x40) 0x80)) + List.nil)) + (ite (LE.le v 0xffff) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 4096) 0x10) 0xe0)) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 64) 0x40) 0x80)) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod v 0x40) 0x80)) + List.nil))) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 262144) 0x08) 0xf0)) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 4096) 0x40) 0x80)) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod (HDiv.hDiv v 64) 0x40) 0x80)) + (List.cons + (UInt8.ofNat (HAdd.hAdd (HMod.hMod v 0x40) 0x80)) + List.nil)))))) + +/-- Encode a list of characters (Unicode scalar value) in UTF-8. This is an inefficient model +implementation. Use `List.asString` instead. -/ +def List.utf8Encode (l : List Char) : ByteArray := + l.flatMap String.utf8EncodeChar |>.toByteArray + +/-- A byte array is valid UTF-8 if it is of the form `List.Internal.utf8Encode m` for some `m`. + +Note that in order for this definition to be well-behaved it is necessary to know that this `m` +is unique. To show this, one defines UTF-8 decoding and shows that encoding and decoding are +mutually inverse. -/ +inductive ByteArray.IsValidUtf8 (b : ByteArray) : Prop + /-- Show that a byte -/ + | intro (m : List Char) (hm : Eq b (List.utf8Encode m)) + +/-- +A string is a sequence of Unicode scalar values. + +At runtime, strings are represented by [dynamic arrays](https://en.wikipedia.org/wiki/Dynamic_array) +of bytes using the UTF-8 encoding. Both the size in bytes (`String.utf8ByteSize`) and in characters +(`String.length`) are cached and take constant time. Many operations on strings perform in-place +modifications when the reference to the string is unique. +-/ +structure String where ofByteArray :: + /-- The bytes of the UTF-8 encoding of the string. -/ + bytes : ByteArray + /-- The bytes of the string form valid UTF-8. -/ + isValidUtf8 : ByteArray.IsValidUtf8 bytes + +attribute [extern "lean_string_to_utf8"] String.bytes + +/-- +Decides whether two strings are equal. Normally used via the `DecidableEq String` instance and the +`=` operator. + +At runtime, this function is overridden with an efficient native implementation. +-/ +@[extern "lean_string_dec_eq"] +def String.decEq (s₁ s₂ : @& String) : Decidable (Eq s₁ s₂) := + match s₁, s₂ with + | ⟨⟨⟨s₁⟩⟩, _⟩, ⟨⟨⟨s₂⟩⟩, _⟩ => + dite (Eq s₁ s₂) (fun h => match s₁, s₂, h with | _, _, Eq.refl _ => isTrue rfl) + (fun h => isFalse + (fun h' => h (congrArg (fun s => Array.toList (ByteArray.data (String.bytes s))) h'))) + +instance : DecidableEq String := String.decEq + +/-- +A byte position in a `String`, according to its UTF-8 encoding. + +Character positions (counting the Unicode code points rather than bytes) are represented by plain +`Nat`s. Indexing a `String` by a `String.Pos` takes constant time, while character positions need to +be translated internally to byte positions, which takes linear time. + +A byte position `p` is *valid* for a string `s` if `0 ≤ p ≤ s.endPos` and `p` lies on a UTF-8 +character boundary. +-/ +structure String.Pos where + /-- Get the underlying byte index of a `String.Pos` -/ + byteIdx : Nat := 0 + +instance : Inhabited String.Pos where + default := {} + +instance : DecidableEq String.Pos := + fun ⟨a⟩ ⟨b⟩ => match decEq a b with + | isTrue h => isTrue (h ▸ rfl) + | isFalse h => isFalse (fun he => String.Pos.noConfusion he fun he => absurd he h) + +/-- +A region or slice of some underlying string. + +A substring contains an string together with the start and end byte positions of a region of +interest. Actually extracting a substring requires copying and memory allocation, while many +substrings of the same underlying string may exist with very little overhead, and they are more +convenient than tracking the bounds by hand. + +Using its constructor explicitly, it is possible to construct a `Substring` in which one or both of +the positions is invalid for the string. Many operations will return unexpected or confusing results +if the start and stop positions are not valid. Instead, it's better to use API functions that ensure +the validity of the positions in a substring to create and manipulate them. +-/ +structure Substring where + /-- The underlying string. -/ + str : String + /-- The byte position of the start of the string slice. -/ + startPos : String.Pos + /-- The byte position of the end of the string slice. -/ + stopPos : String.Pos + +instance : Inhabited Substring where + default := ⟨"", {}, {}⟩ + +/-- +The number of bytes used by the string's UTF-8 encoding. +-/ +@[inline, expose] def Substring.bsize : Substring → Nat + | ⟨_, b, e⟩ => e.byteIdx.sub b.byteIdx + +/-- +The number of bytes used by the string's UTF-8 encoding. + +At runtime, this function takes constant time because the byte length of strings is cached. +-/ +@[extern "lean_string_utf8_byte_size"] +def String.utf8ByteSize (s : @& String) : Nat := + s.bytes.size + +/-- +A UTF-8 byte position that points at the end of a string, just after the last character. + +* `"abc".endPos = ⟨3⟩` +* `"L∃∀N".endPos = ⟨8⟩` +-/ +@[inline] def String.endPos (s : String) : String.Pos where + byteIdx := utf8ByteSize s + +/-- +Converts a `String` into a `Substring` that denotes the entire string. +-/ +@[inline] def String.toSubstring (s : String) : Substring where + str := s + startPos := {} + stopPos := s.endPos + +/-- +Converts a `String` into a `Substring` that denotes the entire string. + +This is a version of `String.toSubstring` that doesn't have an `@[inline]` annotation. +-/ +def String.toSubstring' (s : String) : Substring := + s.toSubstring + +/-- +This function will cast a value of type `α` to type `β`, and is a no-op in the +compiler. This function is **extremely dangerous** because there is no guarantee +that types `α` and `β` have the same data representation, and this can lead to +memory unsafety. It is also logically unsound, since you could just cast +`True` to `False`. For all those reasons this function is marked as `unsafe`. + +It is implemented by lifting both `α` and `β` into a common universe, and then +using `cast (lcProof : ULift (PLift α) = ULift (PLift β))` to actually perform +the cast. All these operations are no-ops in the compiler. + +Using this function correctly requires some knowledge of the data representation +of the source and target types. Some general classes of casts which are safe in +the current runtime: + +* `Array α` to `Array β` where `α` and `β` have compatible representations, + or more generally for other inductive types. +* `Quot α r` and `α`. +* `@Subtype α p` and `α`, or generally any structure containing only one + non-`Prop` field of type `α`. +* Casting `α` to/from `NonScalar` when `α` is a boxed generic type + (i.e. a function that accepts an arbitrary type `α` and is not specialized to + a scalar type like `UInt8`). +-/ +unsafe def unsafeCast {α : Sort u} {β : Sort v} (a : α) : β := + PLift.down (ULift.down.{max u v} (cast lcProof (ULift.up.{max u v} (PLift.up a)))) + + +/-- Auxiliary definition for `panic`. -/ +/- +This is a workaround for `panic` occurring in monadic code. See issue #695. +The `panicCore` definition cannot be specialized since it is an extern. +When `panic` occurs in monadic code, the `Inhabited α` parameter depends on a +`[inst : Monad m]` instance. The `inst` parameter will not be eliminated during +specialization if it occurs inside of a binder (to avoid work duplication), and +will prevent the actual monad from being "copied" to the code being specialized. +When we reimplement the specializer, we may consider copying `inst` if it also +occurs outside binders or if it is an instance. +-/ +@[never_extract, extern "lean_panic_fn"] +def panicCore {α : Sort u} [Inhabited α] (msg : String) : α := default + +/-- +`(panic "msg" : α)` has a built-in implementation which prints `msg` to +the error buffer. It *does not* terminate execution, and because it is a safe +function, it still has to return an element of `α`, so it takes `[Inhabited α]` +and returns `default`. It is primarily intended for debugging in pure contexts, +and assertion failures. + +Because this is a pure function with side effects, it is marked as +`@[never_extract]` so that the compiler will not perform common sub-expression +elimination and other optimizations that assume that the expression is pure. +-/ +@[noinline, never_extract] +def panic {α : Sort u} [Inhabited α] (msg : String) : α := + panicCore msg + +-- TODO: this be applied directly to `Inhabited`'s definition when we remove the above workaround +attribute [nospecialize] Inhabited + /-- The `>>=` operator is overloaded via instances of `bind`. diff --git a/src/Init/SizeOf.lean b/src/Init/SizeOf.lean index ee5c00e984..ec8446e12d 100644 --- a/src/Init/SizeOf.lean +++ b/src/Init/SizeOf.lean @@ -84,10 +84,11 @@ deriving instance SizeOf for USize deriving instance SizeOf for Char deriving instance SizeOf for Option deriving instance SizeOf for List +deriving instance SizeOf for Array +deriving instance SizeOf for ByteArray deriving instance SizeOf for String deriving instance SizeOf for String.Pos deriving instance SizeOf for Substring -deriving instance SizeOf for Array deriving instance SizeOf for Except deriving instance SizeOf for EStateM.Result diff --git a/src/Lean/DocString/Parser.lean b/src/Lean/DocString/Parser.lean index 2e79782977..2ae30749c1 100644 --- a/src/Lean/DocString/Parser.lean +++ b/src/Lean/DocString/Parser.lean @@ -611,7 +611,7 @@ mutual asStringFn (atomicFn (noSpaceBefore >> repFn count (satisfyFn (· == char) s!"'{tok count}'")))) where - tok (count : Nat) : String := ⟨List.replicate count char⟩ + tok (count : Nat) : String := (List.replicate count char).asString opener (ctxt : InlineCtxt) : ParserFn := match getter ctxt with | none => many1Fn (satisfyFn (· == char) s!"any number of {char}s") diff --git a/tests/lean/interactive/hover.lean.expected.out b/tests/lean/interactive/hover.lean.expected.out index c23b82bd6e..f4b9ca9fef 100644 --- a/tests/lean/interactive/hover.lean.expected.out +++ b/tests/lean/interactive/hover.lean.expected.out @@ -631,7 +631,7 @@ "end": {"line": 294, "character": 20}}, "contents": {"value": - "```lean\nList.map.{u, v} {α : Type u} {β : Type v} (f : α → β) (l : List α) : List β\n```\n***\nApplies a function to each element of the list, returning the resulting list of values.\n\n`O(|l|)`.\n\nExamples:\n* `[a, b, c].map f = [f a, f b, f c]`\n* `[].map Nat.succ = []`\n* `[\"one\", \"two\", \"three\"].map (·.length) = [3, 3, 5]`\n* `[\"one\", \"two\", \"three\"].map (·.reverse) = [\"eno\", \"owt\", \"eerht\"]`\n\n***\n*import Init.Data.List.Basic*", + "```lean\nList.map.{u_1, u_2} {α : Type u_1} {β : Type u_2} (f : α → β) (l : List α) : List β\n```\n***\nApplies a function to each element of the list, returning the resulting list of values.\n\n`O(|l|)`.\n\nExamples:\n* `[a, b, c].map f = [f a, f b, f c]`\n* `[].map Nat.succ = []`\n* `[\"one\", \"two\", \"three\"].map (·.length) = [3, 3, 5]`\n* `[\"one\", \"two\", \"three\"].map (·.reverse) = [\"eno\", \"owt\", \"eerht\"]`\n\n***\n*import Init.Prelude*", "kind": "markdown"}} {"textDocument": {"uri": "file:///hover.lean"}, "position": {"line": 297, "character": 26}} diff --git a/tests/lean/run/2846.lean b/tests/lean/run/2846.lean index ae9e389c23..9c345b6a67 100644 --- a/tests/lean/run/2846.lean +++ b/tests/lean/run/2846.lean @@ -5,8 +5,8 @@ /-! Defined without named arguments, prints without named arguments. -/ -/-- info: String.append : String → String → String -/ -#guard_msgs in #check String.append +/-- info: Nat.add : Nat → Nat → Nat -/ +#guard_msgs in #check Nat.add /-! The List argument is not named, it is not printed as a named argument. diff --git a/tests/lean/run/arthur1.lean b/tests/lean/run/arthur1.lean index 16e38b0a7a..5bf10b6134 100644 --- a/tests/lean/run/arthur1.lean +++ b/tests/lean/run/arthur1.lean @@ -85,7 +85,7 @@ def removeRightmostZeros (s : String) : String := if a != '0' then aux [] (a :: (buff ++ res)) as else aux (a :: buff) res as - ⟨aux [] [] s.data⟩ + (aux [] [] s.data).asString protected def Literal.toString : Literal → String | bool b => toString b diff --git a/tests/lean/run/arthur2.lean b/tests/lean/run/arthur2.lean index 3cb7873927..9797d1926e 100644 --- a/tests/lean/run/arthur2.lean +++ b/tests/lean/run/arthur2.lean @@ -85,7 +85,7 @@ def removeRightmostZeros (s : String) : String := if a != '0' then aux [] (a :: (buff ++ res)) as else aux (a :: buff) res as - ⟨aux [] [] s.data⟩ + (aux [] [] s.data).asString protected def Literal.toString : Literal → String | bool b => toString b diff --git a/tests/lean/run/fun_cases.lean b/tests/lean/run/fun_cases.lean index 1de0946b18..043875013c 100644 --- a/tests/lean/run/fun_cases.lean +++ b/tests/lean/run/fun_cases.lean @@ -11,8 +11,8 @@ example (x : Option Nat) (f : Nat → Nat) : (x.map f).isSome = x.isSome := by case case2 => simp /-- -info: List.map.fun_cases.{u} {α : Type u} (motive : List α → Prop) (case1 : motive []) - (case2 : ∀ (a : α) (as : List α), motive (a :: as)) (x✝ : List α) : motive x✝ +info: List.map.fun_cases.{u_1} {α : Type u_1} (motive : List α → Prop) (case1 : motive []) + (case2 : ∀ (head : α) (as : List α), motive (head :: as)) (x✝ : List α) : motive x✝ -/ #guard_msgs in #check List.map.fun_cases diff --git a/tests/lean/run/lex.lean b/tests/lean/run/lex.lean index b0d6cbf208..e036c79669 100644 --- a/tests/lean/run/lex.lean +++ b/tests/lean/run/lex.lean @@ -78,7 +78,7 @@ namespace NonMutual def lex [Monad m] [MonadExceptOf LexErr m] (current? : Option (List Char × Nat)) (it : String.Iterator) : m (List Token) := do let currTok := fun - | (cs, n) => { text := {data := cs.reverse}, tok := Tok.num n } + | (cs, n) => { text := cs.reverse.asString , tok := Tok.num n } if it.atEnd then return current?.toList.map currTok else diff --git a/tests/lean/run/printEqns.lean b/tests/lean/run/printEqns.lean index 6be213f783..68d2dd3a03 100644 --- a/tests/lean/run/printEqns.lean +++ b/tests/lean/run/printEqns.lean @@ -1,17 +1,17 @@ /-- info: equations: -@[defeq] theorem List.append.eq_1.{u} : ∀ {α : Type u} (x : List α), [].append x = x -@[defeq] theorem List.append.eq_2.{u} : ∀ {α : Type u} (x : List α) (a : α) (l : List α), - (a :: l).append x = a :: l.append x +@[defeq] theorem List.append.eq_1.{u_1} : ∀ {α : Type u_1} (x : List α), [].append x = x +@[defeq] theorem List.append.eq_2.{u_1} : ∀ {α : Type u_1} (x : List α) (a : α) (as : List α), + (a :: as).append x = a :: as.append x -/ #guard_msgs in #print eqns List.append /-- info: equations: -@[defeq] theorem List.append.eq_1.{u} : ∀ {α : Type u} (x : List α), [].append x = x -@[defeq] theorem List.append.eq_2.{u} : ∀ {α : Type u} (x : List α) (a : α) (l : List α), - (a :: l).append x = a :: l.append x +@[defeq] theorem List.append.eq_1.{u_1} : ∀ {α : Type u_1} (x : List α), [].append x = x +@[defeq] theorem List.append.eq_2.{u_1} : ∀ {α : Type u_1} (x : List α) (a : α) (as : List α), + (a :: as).append x = a :: as.append x -/ #guard_msgs in #print equations List.append diff --git a/tests/lean/setLit.lean.expected.out b/tests/lean/setLit.lean.expected.out index 78a9c7253c..64485f5db6 100644 --- a/tests/lean/setLit.lean.expected.out +++ b/tests/lean/setLit.lean.expected.out @@ -4,10 +4,11 @@ setLit.lean:22:19-22:21: error: overloaded, errors Hint: Additional diagnostic information may be available using the `set_option diagnostics true` command. - Fields missing: `data` + Fields missing: `bytes`, `isValidUtf8` Hint: Add missing fields: - ̲d̲a̲t̲a̲ ̲:̲=̲ ̲_̲ ̲ + ̲b̲y̲t̲e̲s̲ ̲:̲=̲ ̲_̲ + ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲ ̲i̲s̲V̲a̲l̲i̲d̲U̲t̲f̲8̲ ̲:̲=̲ ̲_̲ ̲ setLit.lean:24:31-24:38: error: overloaded, errors failed to synthesize Singleton Nat String diff --git a/tests/lean/sizeof.lean.expected.out b/tests/lean/sizeof.lean.expected.out index 59217b3878..e17ceafea8 100644 --- a/tests/lean/sizeof.lean.expected.out +++ b/tests/lean/sizeof.lean.expected.out @@ -4,7 +4,7 @@ 13 101 558 -311 -313 +310 +312 11 InfTree.node.sizeOf_spec.{u} {α : Type u} [SizeOf α] (children : Nat → InfTree α) : sizeOf (InfTree.node children) = 1 diff --git a/tests/lean/string_imp2.lean b/tests/lean/string_imp2.lean index 80dff0c48d..e72d2381f4 100644 --- a/tests/lean/string_imp2.lean +++ b/tests/lean/string_imp2.lean @@ -20,7 +20,7 @@ it₁.remainingToString ++ "-" ++ it₂.remainingToString #eval "αβγ".mkIterator.next.1 #eval "αβγ".mkIterator.next.next.1 #eval "αβγ".mkIterator.next.2 -#eval "αβ".1 +#eval "αβ".data #eval "αβ".push 'a' #eval g "α" #eval "".mkIterator.curr