From 5bfbe2a875be0ec7d32552ee5aae4da989d29a0a Mon Sep 17 00:00:00 2001 From: Markus Himmel Date: Wed, 1 Oct 2025 13:33:29 +0200 Subject: [PATCH] refactor: incorporate UTF8 material from String.Extra into String.Basic (#10634) This PR defines `ByteArray.validateUTF8`, uses it to show that `ByteArray.IsValidUtf8` is decidable and redefines `String.fromUTF8` and friends to use it. The functions `String.validateUTF8` and `String.utf8DecodeChar?` are deprecated in favor of the identically named functions in the `ByteArray` namespace. --- src/Init/Data/String/Basic.lean | 94 ++++++++++++++++++++++++++- src/Init/Data/String/Decode.lean | 105 ++++++++++++++++++++++++++++++- src/Init/Data/String/Extra.lean | 105 ++----------------------------- tests/lean/run/utf8英語.lean | 6 +- 4 files changed, 206 insertions(+), 104 deletions(-) diff --git a/src/Init/Data/String/Basic.lean b/src/Init/Data/String/Basic.lean index 22ba20e7e4..fbdd90297e 100644 --- a/src/Init/Data/String/Basic.lean +++ b/src/Init/Data/String/Basic.lean @@ -94,6 +94,55 @@ where (have := c.utf8Size_pos; have := le_size_of_utf8DecodeChar?_eq_some h; by omega) termination_by structural fuel +@[expose, extern "lean_string_validate_utf8"] +def ByteArray.validateUTF8 (b : ByteArray) : Bool := + go (b.size + 1) 0 (by simp) (by simp) +where + go (fuel : Nat) (i : Nat) (hi : i ≤ b.size) (hf : b.size - i < fuel) : Bool := + match fuel, hf with + | fuel + 1, _ => + if hi : i = b.size then + true + else + match h : validateUtf8At b i with + | false => false + | true => go fuel (i + (b[i].utf8ByteSize (isUtf8FirstByte_of_validateUtf8At h)).byteIdx) + ?_ ?_ + termination_by structural fuel +finally + all_goals rw [ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar?] at h + · rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)] + exact add_utf8Size_utf8DecodeChar_le_size + · rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)] + have := add_utf8Size_utf8DecodeChar_le_size (h := h) + have := (b.utf8DecodeChar i h).utf8Size_pos + omega + +theorem ByteArray.isSome_utf8Decode?Go_eq_validateUtf8Go {b : ByteArray} {fuel : Nat} + {i : Nat} {acc : Array Char} {hi : i ≤ b.size} {hf : b.size - i < fuel} : + (utf8Decode?.go b fuel i acc hi hf).isSome = validateUTF8.go b fuel i hi hf := by + fun_induction utf8Decode?.go with + | case1 => simp [validateUTF8.go] + | case2 i acc hi fuel hf h₁ h₂ => + simp only [Option.isSome_none, validateUTF8.go, h₁, ↓reduceDIte, Bool.false_eq] + split + · rfl + · rename_i heq + simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq + | case3 i acc hi fuel hf h₁ c h₂ ih => + simp [validateUTF8.go, h₁] + split + · rename_i heq + simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq + · rw [ih] + congr + rw [← ByteArray.utf8Size_utf8DecodeChar (h := by simp [h₂])] + simp [utf8DecodeChar, h₂] + +theorem ByteArray.isSome_utf8Decode?_eq_validateUtf8 {b : ByteArray} : + b.utf8Decode?.isSome = b.validateUTF8 := + b.isSome_utf8Decode?Go_eq_validateUtf8Go + theorem ByteArray.utf8Decode?.go.congr {b b' : ByteArray} {fuel fuel' i i' : Nat} {acc acc' : Array Char} {hi hi' hf hf'} (hbb' : b = b') (hii' : i = i') (hacc : acc = acc') : ByteArray.utf8Decode?.go b fuel i acc hi hf = ByteArray.utf8Decode?.go b' fuel' i' acc' hi' hf' := by @@ -151,7 +200,50 @@ theorem ByteArray.isSome_utf8Decode?_iff {b : ByteArray} : rw [utf8Decode?, isSome_utf8Decode?go_iff, extract_zero_size] @[simp] -theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl) +theorem ByteArray.validateUTF8_eq_true_iff {b : ByteArray} : + b.validateUTF8 = true ↔ IsValidUtf8 b := by + rw [← isSome_utf8Decode?_eq_validateUtf8, isSome_utf8Decode?_iff] + +@[simp] +theorem ByteArray.validateUTF8_eq_false_iff {b : ByteArray} : + b.validateUTF8 = false ↔ ¬ IsValidUtf8 b := by + simp [← Bool.not_eq_true] + +instance {b : ByteArray} : Decidable b.IsValidUtf8 := + decidable_of_iff (b.validateUTF8 = true) ByteArray.validateUTF8_eq_true_iff + +/-- +Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into +the corresponding string. +-/ +@[inline, expose] +def String.fromUTF8 (a : @& ByteArray) (h : a.IsValidUtf8) : String := + .ofByteArray a h + +/-- +Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into +the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string. +-/ +@[inline, expose] def String.fromUTF8? (a : ByteArray) : Option String := + if h : a.IsValidUtf8 then some (fromUTF8 a h) else none + +/-- +Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into +the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string. +-/ +@[inline, expose] def String.fromUTF8! (a : ByteArray) : String := + if h : a.IsValidUtf8 then fromUTF8 a h else panic! "invalid UTF-8 string" + +/-- +Encodes a string in UTF-8 as an array of bytes. +-/ +@[extern "lean_string_to_utf8"] +def String.toUTF8 (a : @& String) : ByteArray := + a.bytes + +@[simp] theorem String.toUTF8_eq_bytes {s : String} : s.toUTF8 = s.bytes := (rfl) + +@[simp] theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl) /-- Appends two strings. Usually accessed via the `++` operator. diff --git a/src/Init/Data/String/Decode.lean b/src/Init/Data/String/Decode.lean index 7b175786ee..4b58e9eee0 100644 --- a/src/Init/Data/String/Decode.lean +++ b/src/Init/Data/String/Decode.lean @@ -499,6 +499,14 @@ theorem assemble₁_eq_some_iff_utf8EncodeChar_eq {w : UInt8} {c : Char} : omega simpa [String.utf8EncodeChar_eq_singleton hc, assemble₁, Char.ext_iff, ← UInt32.toNat_inj] +@[inline, expose] +public def verify₁ (_w : UInt8) (_h : parseFirstByte w = .done) : Bool := + true + +theorem verify₁_eq_isSome_assemble₁ {w : UInt8} {h : parseFirstByte w = .done} : + verify₁ w h = (assemble₁ w h).isSome := by + simp [verify₁, assemble₁] + /-! # `assemble₂` -/ @[inline, expose] @@ -531,6 +539,14 @@ where finally rw [Nat.shiftLeft_eq, Nat.mod_eq_of_lt (by omega), Nat.mul_comm, ← Nat.two_pow_add_eq_or_of_lt hb₁] omega +@[inline, expose] +public def verify₂ (w x : UInt8) : Bool := + if isInvalidContinuationByte x then + false + else + let r := assemble₂Unchecked w x + 0x80 ≤ r + theorem helper₃ {x : UInt8} (n : Nat) (hn : n < 8) : (x &&& UInt8.ofNat (2 ^ n - 1)).toUInt32.toBitVec = (x.toBitVec.setWidth n).setWidth 32 := by apply BitVec.eq_of_toNat_eq @@ -628,6 +644,12 @@ theorem assemble₂_eq_some_iff_utf8EncodeChar_eq {x y : UInt8} {c : Char} : BitVec.setWidth_setWidth_eq_self] simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_two_iff.1 hc).2 +theorem verify₂_eq_isSome_assemble₂ {w x : UInt8} : verify₂ w x = (assemble₂ w x).isSome := by + simp only [verify₂, assemble₂] + split + · simp + · split <;> simp_all + /-! # `assemble₃` -/ @[inline, expose] @@ -671,6 +693,14 @@ where finally ← Nat.two_pow_add_eq_or_of_lt (by omega)] omega +@[inline, expose] +public def verify₃ (w x y : UInt8) : Bool := + if isInvalidContinuationByte x || isInvalidContinuationByte y then + false + else + let r := assemble₃Unchecked w x y + 0x800 ≤ r ∧ (r < 0xd800 ∨ 0xdfff < r) + theorem toBitVec_assemble₃Unchecked {w x y : UInt8} : (assemble₃Unchecked w x y).toBitVec = (w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6).setWidth 32 := by have hw : (w &&& 15).toUInt32.toBitVec = (w.toBitVec.setWidth 4).setWidth 32 := helper₃ 4 (by decide) @@ -772,6 +802,18 @@ theorem assemble₃_eq_some_iff_utf8EncodeChar_eq {w x y : UInt8} {c : Char} : ← BitVec.setWidth_eq_extractLsb' (by simp), BitVec.setWidth_setWidth_eq_self] simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_three_iff.1 hc).2 +theorem verify₃_eq_isSome_assemble₃ {w x y : UInt8} : + verify₃ w x y = (assemble₃ w x y).isSome := by + simp only [verify₃, assemble₃] + split + · simp + · split + · simp_all [← UInt32.not_le] + · split + · simp_all + · rename_i h h' + simpa [UInt32.not_lt.1 h] using Classical.not_and_iff_not_or_not.1 h' + /-! # `assemble₄` -/ @[inline, expose] @@ -799,6 +841,14 @@ where finally simp only [UInt32.not_lt, UInt32.le_iff_toNat_le, UInt32.reduceToNat] at h₁ h₂ exact Or.inr ⟨by omega, by omega⟩ +@[inline, expose] +public def verify₄ (w x y z : UInt8) : Bool := + if isInvalidContinuationByte x || isInvalidContinuationByte y || isInvalidContinuationByte z then + false + else + let r := assemble₄Unchecked w x y z + 0x10000 ≤ r ∧ r ≤ 0x10ffff + theorem toBitVec_assemble₄Unchecked {w x y z : UInt8} : (assemble₄Unchecked w x y z).toBitVec = (w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6).setWidth 32 := by have hw : (w &&& 7).toUInt32.toBitVec = (w.toBitVec.setWidth 3).setWidth 32 := helper₃ 3 (by decide) @@ -917,11 +967,22 @@ theorem assemble₄_eq_some_iff_utf8EncodeChar_eq {w x y z : UInt8} {c : Char} : Nat.reducePow, Nat.reduceMod, gt_iff_lt] omega +theorem verify₄_eq_isSome_assemble₄ {w x y z : UInt8} : + verify₄ w x y z = (assemble₄ w x y z).isSome := by + simp only [verify₄, assemble₄] + split + · simp + · split + · simp_all [← UInt32.not_lt] + · split + · simp_all + · simp_all [← UInt32.not_lt] + end ByteArray.utf8DecodeChar? open ByteArray.utf8DecodeChar? -/- # `utf8DecodeChar?` -/ +/- # `utf8DecodeChar?` and `validateUtf8At` -/ @[inline, expose] public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char := @@ -944,6 +1005,27 @@ public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char else none else none +@[inline, expose] +public def ByteArray.validateUtf8At (bytes : ByteArray) (i : Nat) : Bool := + if h₀ : i < bytes.size then + match h : parseFirstByte bytes[i] with + | .invalid => false + | .done => verify₁ bytes[i] h + | .oneMore => + if h₁ : i + 1 < bytes.size then + verify₂ bytes[i] bytes[i + 1] + else + false + | .twoMore => + if h₁ : i + 2 < bytes.size then + verify₃ bytes[i] bytes[i + 1] bytes[i + 2] + else false + | .threeMore => + if h₁ : i + 3 < bytes.size then + verify₄ bytes[i] bytes[i + 1] bytes[i + 2] bytes[i + 3] + else false + else false + /-! # `utf8DecodeChar?` low-level API -/ theorem parseFirstByte_eq_done_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char} @@ -1130,6 +1212,14 @@ public theorem String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some {b : rw [← assemble₄_eq_some_iff_utf8EncodeChar_eq] exact ⟨this, h⟩ +public theorem ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} : + b.validateUtf8At i = (b.utf8DecodeChar? i).isSome := by + simp only [validateUtf8At, utf8DecodeChar?] + split + · split <;> (try split) <;> simp [verify₁_eq_isSome_assemble₁, verify₂_eq_isSome_assemble₂, + verify₃_eq_isSome_assemble₃, verify₄_eq_isSome_assemble₄] + · simp + /-! # Corollaries -/ public theorem ByteArray.eq_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : utf8DecodeChar? b 0 = some c) : @@ -1195,6 +1285,10 @@ public theorem ByteArray.lt_size_of_isSome_utf8DecodeChar? {b : ByteArray} {i : have := c.utf8Size_pos omega +public theorem ByteArray.lt_size_of_validateUtf8At {b : ByteArray} {i : Nat} : + validateUtf8At b i = true → i < b.size := + validateUtf8At_eq_isSome_utf8DecodeChar? ▸ lt_size_of_isSome_utf8DecodeChar? + public theorem ByteArray.utf8DecodeChar?_append_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : utf8DecodeChar? b i = some c) (b' : ByteArray) : utf8DecodeChar? (b ++ b') i = some c := by have := le_size_of_utf8DecodeChar?_eq_some h @@ -1211,6 +1305,10 @@ public theorem ByteArray.isSome_utf8DecodeChar?_append {b : ByteArray} {i : Nat} public def ByteArray.utf8DecodeChar (bytes : ByteArray) (i : Nat) (h : (utf8DecodeChar? bytes i).isSome) : Char := (utf8DecodeChar? bytes i).get h +public theorem ByteArray.add_utf8Size_utf8DecodeChar_le_size {b : ByteArray} {i : Nat} {h} : + i + (b.utf8DecodeChar i h).utf8Size ≤ b.size := + le_size_of_utf8DecodeChar?_eq_some (by simp [utf8DecodeChar]) + public theorem ByteArray.utf8DecodeChar_eq_utf8DecodeChar_extract {b : ByteArray} {i : Nat} {h} : utf8DecodeChar b i h = utf8DecodeChar (b.extract i b.size) 0 (by rwa [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h) := by @@ -1363,6 +1461,11 @@ public theorem ByteArray.isUtf8FirstByte_of_isSome_utf8DecodeChar? {b : ByteArra conv => congr; congr; rw [eq_of_utf8DecodeChar?_eq_some hc] exact isUtf8FirstByte_getElem_zero_utf8EncodeChar_append +public theorem ByteArray.isUtf8FirstByte_of_validateUtf8At {b : ByteArray} {i : Nat} : + (h : validateUtf8At b i = true) → (b[i]'(lt_size_of_validateUtf8At h)).IsUtf8FirstByte := by + simp only [validateUtf8At_eq_isSome_utf8DecodeChar?] + exact isUtf8FirstByte_of_isSome_utf8DecodeChar? + theorem Char.byteIdx_utf8ByteSize_getElem_utf8EncodeChar {c : Char} : (((String.utf8EncodeChar c)[0]'(by simp [c.utf8Size_pos])).utf8ByteSize UInt8.isUtf8FirstByte_getElem_zero_utf8EncodeChar).byteIdx = c.utf8Size := by diff --git a/src/Init/Data/String/Extra.lean b/src/Init/Data/String/Extra.lean index 5bd74a13cc..f519ad802a 100644 --- a/src/Init/Data/String/Extra.lean +++ b/src/Init/Data/String/Extra.lean @@ -38,109 +38,16 @@ def toNat! (s : String) : Nat := else panic! "Nat expected" -/-- -Decodes the UTF-8 character sequence that starts at a given index in a byte array, or `none` if -index `i` is out of bounds or is not the start of a valid UTF-8 character. --/ -def utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char := do - let c ← a[i]? - if c &&& 0x80 == 0 then - some ⟨c.toUInt32, .inl (Nat.lt_trans c.toBitVec.isLt (by decide))⟩ - else if c &&& 0xe0 == 0xc0 then - let c1 ← a[i+1]? - guard (c1 &&& 0xc0 == 0x80) - let r := ((c &&& 0x1f).toUInt32 <<< 6) ||| (c1 &&& 0x3f).toUInt32 - guard (0x80 ≤ r) - -- TODO: Prove h from the definition of r once we have the necessary lemmas - if h : r < 0xd800 then some ⟨r, .inl ((UInt32.lt_ofNat_iff (by decide)).1 h)⟩ else none - else if c &&& 0xf0 == 0xe0 then - let c1 ← a[i+1]? - let c2 ← a[i+2]? - guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80) - let r := - ((c &&& 0x0f).toUInt32 <<< 12) ||| - ((c1 &&& 0x3f).toUInt32 <<< 6) ||| - (c2 &&& 0x3f).toUInt32 - guard (0x800 ≤ r) - -- TODO: Prove `r < 0x110000` from the definition of r once we have the necessary lemmas - if h : r < 0xd800 ∨ 0xdfff < r ∧ r < 0x110000 then - have := - match h with - | .inl h => Or.inl ((UInt32.lt_ofNat_iff (by decide)).1 h) - | .inr h => Or.inr ⟨(UInt32.ofNat_lt_iff (by decide)).1 h.left, (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩ - some ⟨r, this⟩ - else - none - else if c &&& 0xf8 == 0xf0 then - let c1 ← a[i+1]? - let c2 ← a[i+2]? - let c3 ← a[i+3]? - guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80 && c3 &&& 0xc0 == 0x80) - let r := - ((c &&& 0x07).toUInt32 <<< 18) ||| - ((c1 &&& 0x3f).toUInt32 <<< 12) ||| - ((c2 &&& 0x3f).toUInt32 <<< 6) ||| - (c3 &&& 0x3f).toUInt32 - if h : 0x10000 ≤ r ∧ r < 0x110000 then - some ⟨r, .inr ⟨Nat.lt_of_lt_of_le (by decide) ((UInt32.ofNat_le_iff (by decide)).1 h.left), (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩⟩ - else none - else - none +@[deprecated ByteArray.utf8DecodeChar? (since := "2025-10-01")] +abbrev utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char := + a.utf8DecodeChar? i /-- Checks whether an array of bytes is a valid UTF-8 encoding of a string. -/ -@[extern "lean_string_validate_utf8"] -def validateUTF8 (a : @& ByteArray) : Bool := - (loop 0).isSome -where - loop (i : Nat) : Option Unit := do - if i < a.size then - let c ← utf8DecodeChar? a i - loop (i + c.utf8Size) - else pure () - termination_by a.size - i - decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos) - -/-- -Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into -the corresponding string. --/ -@[extern "lean_string_from_utf8_unchecked"] -def fromUTF8 (a : @& ByteArray) (h : validateUTF8 a) : String := - loop 0 "" -where - loop (i : Nat) (acc : String) : String := - if i < a.size then - let c := (utf8DecodeChar? a i).getD default - loop (i + c.utf8Size) (acc.push c) - else acc - termination_by a.size - i - decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos) - -/-- -Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into -the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string. --/ -@[inline] def fromUTF8? (a : ByteArray) : Option String := - if h : validateUTF8 a then some (fromUTF8 a h) else none - -/-- -Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into -the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string. --/ -@[inline] def fromUTF8! (a : ByteArray) : String := - if h : validateUTF8 a then fromUTF8 a h else panic! "invalid UTF-8 string" - -/-- -Encodes a string in UTF-8 as an array of bytes. --/ -@[extern "lean_string_to_utf8"] -def toUTF8 (a : @& String) : ByteArray := - a.bytes - -@[simp] theorem size_toUTF8 (s : String) : s.toUTF8.size = s.utf8ByteSize := by - rfl +@[deprecated ByteArray.validateUTF8 (since := "2025-10-01")] +abbrev validateUTF8 (a : ByteArray) : Bool := + a.validateUTF8 theorem Iterator.sizeOf_next_lt_of_hasNext (i : String.Iterator) (h : i.hasNext) : sizeOf i.next < sizeOf i := by cases i; rename_i s pos; simp [Iterator.next, Iterator.sizeOf_eq]; simp [Iterator.hasNext] at h diff --git a/tests/lean/run/utf8英語.lean b/tests/lean/run/utf8英語.lean index 8fa6101618..be5ade6179 100644 --- a/tests/lean/run/utf8英語.lean +++ b/tests/lean/run/utf8英語.lean @@ -16,10 +16,10 @@ def checkGet (s : String) (arr : Array UInt8) := c == arr[i]! macro "validate" arr:term " => " "↯" : command => - `(test_extern' String.validateUTF8 $arr => false) + `(test_extern' ByteArray.validateUTF8 $arr => false) macro "validate" arr:term " => " str:term : command => - `(test_extern' String.validateUTF8 $arr => true - test_extern' String.fromUTF8 $arr (with_decl_name% _validate by native_decide) => $str + `(test_extern' ByteArray.validateUTF8 $arr => true + test_extern' String.ofByteArray $arr (with_decl_name% _validate by native_decide) => $str test_extern' String.toUTF8 $str => $arr #guard checkGet $str ($arr : ByteArray).data)