refactor: incorporate UTF8 material from String.Extra into String.Basic (#10634)

This PR defines `ByteArray.validateUTF8`, uses it to show that `ByteArray.IsValidUtf8` is decidable and redefines `String.fromUTF8` and friends to use it. The functions `String.validateUTF8` and `String.utf8DecodeChar?` are deprecated in favor of the identically named functions in the `ByteArray` namespace.
2025-10-01 13:33:29 +02:00 · 2025-10-01 13:33:29 +02:00 · 5bfbe2a875
commit 5bfbe2a875
parent 9dc1faf327
4 changed files with 206 additions and 104 deletions
--- a/src/Init/Data/String/Basic.lean
+++ b/src/Init/Data/String/Basic.lean
@ -94,6 +94,55 @@ where
            (have := c.utf8Size_pos; have := le_size_of_utf8DecodeChar?_eq_some h; by omega)
  termination_by structural fuel

+@[expose, extern "lean_string_validate_utf8"]
+def ByteArray.validateUTF8 (b : ByteArray) : Bool :=
+  go (b.size + 1) 0 (by simp) (by simp)
+where
+  go (fuel : Nat) (i : Nat) (hi : i ≤ b.size) (hf : b.size - i < fuel) : Bool :=
+    match fuel, hf with
+    | fuel + 1, _ =>
+      if hi : i = b.size then
+        true
+      else
+        match h : validateUtf8At b i with
+        | false => false
+        | true => go fuel (i + (b[i].utf8ByteSize (isUtf8FirstByte_of_validateUtf8At h)).byteIdx)
+            ?_ ?_
+  termination_by structural fuel
+finally
+  all_goals rw [ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar?] at h
+  · rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)]
+    exact add_utf8Size_utf8DecodeChar_le_size
+  · rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)]
+    have := add_utf8Size_utf8DecodeChar_le_size (h := h)
+    have := (b.utf8DecodeChar i h).utf8Size_pos
+    omega
+
+theorem ByteArray.isSome_utf8Decode?Go_eq_validateUtf8Go {b : ByteArray} {fuel : Nat}
+    {i : Nat} {acc : Array Char} {hi : i ≤ b.size} {hf : b.size - i < fuel} :
+    (utf8Decode?.go b fuel i acc hi hf).isSome = validateUTF8.go b fuel i hi hf := by
+  fun_induction utf8Decode?.go with
+  | case1 => simp [validateUTF8.go]
+  | case2 i acc hi fuel hf h₁ h₂ =>
+    simp only [Option.isSome_none, validateUTF8.go, h₁, ↓reduceDIte, Bool.false_eq]
+    split
+    · rfl
+    · rename_i heq
+      simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq
+  | case3 i acc hi fuel hf h₁ c h₂ ih =>
+    simp [validateUTF8.go, h₁]
+    split
+    · rename_i heq
+      simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq
+    · rw [ih]
+      congr
+      rw [← ByteArray.utf8Size_utf8DecodeChar (h := by simp [h₂])]
+      simp [utf8DecodeChar, h₂]
+
+theorem ByteArray.isSome_utf8Decode?_eq_validateUtf8 {b : ByteArray} :
+    b.utf8Decode?.isSome = b.validateUTF8 :=
+  b.isSome_utf8Decode?Go_eq_validateUtf8Go
+
 theorem ByteArray.utf8Decode?.go.congr {b b' : ByteArray} {fuel fuel' i i' : Nat} {acc acc' : Array Char} {hi hi' hf hf'}
    (hbb' : b = b') (hii' : i = i') (hacc : acc = acc') :
    ByteArray.utf8Decode?.go b fuel i acc hi hf = ByteArray.utf8Decode?.go b' fuel' i' acc' hi' hf' := by
@ -151,7 +200,50 @@ theorem ByteArray.isSome_utf8Decode?_iff {b : ByteArray} :
  rw [utf8Decode?, isSome_utf8Decode?go_iff, extract_zero_size]

@[simp]
-theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl)
+theorem ByteArray.validateUTF8_eq_true_iff {b : ByteArray} :
+    b.validateUTF8 = true ↔ IsValidUtf8 b := by
+  rw [← isSome_utf8Decode?_eq_validateUtf8, isSome_utf8Decode?_iff]
+
+@[simp]
+theorem ByteArray.validateUTF8_eq_false_iff {b : ByteArray} :
+    b.validateUTF8 = false ↔ ¬ IsValidUtf8 b := by
+  simp [← Bool.not_eq_true]
+
+instance {b : ByteArray} : Decidable b.IsValidUtf8 :=
+  decidable_of_iff (b.validateUTF8 = true) ByteArray.validateUTF8_eq_true_iff
+
+/--
+Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
+the corresponding string.
+-/
+@[inline, expose]
+def String.fromUTF8 (a : @& ByteArray) (h : a.IsValidUtf8) : String :=
+  .ofByteArray a h
+
+/--
+Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
+the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string.
+-/
+@[inline, expose] def String.fromUTF8? (a : ByteArray) : Option String :=
+  if h : a.IsValidUtf8 then some (fromUTF8 a h) else none
+
+/--
+Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
+the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string.
+-/
+@[inline, expose] def String.fromUTF8! (a : ByteArray) : String :=
+  if h : a.IsValidUtf8 then fromUTF8 a h else panic! "invalid UTF-8 string"
+
+/--
+Encodes a string in UTF-8 as an array of bytes.
+-/
+@[extern "lean_string_to_utf8"]
+def String.toUTF8 (a : @& String) : ByteArray :=
+  a.bytes
+
+@[simp] theorem String.toUTF8_eq_bytes {s : String} : s.toUTF8 = s.bytes := (rfl)
+
+@[simp] theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl)

 /--
 Appends two strings. Usually accessed via the `++` operator.
--- a/src/Init/Data/String/Decode.lean
+++ b/src/Init/Data/String/Decode.lean
@ -499,6 +499,14 @@ theorem assemble₁_eq_some_iff_utf8EncodeChar_eq {w : UInt8} {c : Char} :
      omega
    simpa [String.utf8EncodeChar_eq_singleton hc, assemble₁, Char.ext_iff, ← UInt32.toNat_inj]

+@[inline, expose]
+public def verify₁ (_w : UInt8) (_h : parseFirstByte w = .done) : Bool :=
+  true
+
+theorem verify₁_eq_isSome_assemble₁ {w : UInt8} {h : parseFirstByte w = .done} :
+    verify₁ w h = (assemble₁ w h).isSome := by
+  simp [verify₁, assemble₁]
+
 /-! # `assemble₂` -/

@[inline, expose]
@ -531,6 +539,14 @@ where finally
    rw [Nat.shiftLeft_eq, Nat.mod_eq_of_lt (by omega), Nat.mul_comm, ← Nat.two_pow_add_eq_or_of_lt hb₁]
    omega

+@[inline, expose]
+public def verify₂ (w x : UInt8) : Bool :=
+  if isInvalidContinuationByte x then
+    false
+  else
+    let r := assemble₂Unchecked w x
+    0x80 ≤ r
+
 theorem helper₃ {x : UInt8} (n : Nat) (hn : n < 8) :
    (x &&& UInt8.ofNat (2 ^ n - 1)).toUInt32.toBitVec = (x.toBitVec.setWidth n).setWidth 32 := by
  apply BitVec.eq_of_toNat_eq
@ -628,6 +644,12 @@ theorem assemble₂_eq_some_iff_utf8EncodeChar_eq {x y : UInt8} {c : Char} :
        BitVec.setWidth_setWidth_eq_self]
      simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_two_iff.1 hc).2

+theorem verify₂_eq_isSome_assemble₂ {w x : UInt8} : verify₂ w x = (assemble₂ w x).isSome := by
+  simp only [verify₂, assemble₂]
+  split
+  · simp
+  · split <;> simp_all
+
 /-! # `assemble₃` -/

@[inline, expose]
@ -671,6 +693,14 @@ where finally
          ← Nat.two_pow_add_eq_or_of_lt (by omega)]
      omega

+@[inline, expose]
+public def verify₃ (w x y : UInt8) : Bool :=
+  if isInvalidContinuationByte x || isInvalidContinuationByte y then
+    false
+  else
+    let r := assemble₃Unchecked w x y
+    0x800 ≤ r ∧ (r < 0xd800 ∨ 0xdfff < r)
+
 theorem toBitVec_assemble₃Unchecked {w x y : UInt8} : (assemble₃Unchecked w x y).toBitVec =
    (w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6).setWidth 32 := by
  have hw : (w &&& 15).toUInt32.toBitVec = (w.toBitVec.setWidth 4).setWidth 32 := helper₃ 4 (by decide)
@ -772,6 +802,18 @@ theorem assemble₃_eq_some_iff_utf8EncodeChar_eq {w x y : UInt8} {c : Char} :
        ← BitVec.setWidth_eq_extractLsb' (by simp), BitVec.setWidth_setWidth_eq_self]
      simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_three_iff.1 hc).2

+theorem verify₃_eq_isSome_assemble₃ {w x y : UInt8} :
+    verify₃ w x y = (assemble₃ w x y).isSome := by
+  simp only [verify₃, assemble₃]
+  split
+  · simp
+  · split
+    · simp_all [← UInt32.not_le]
+    · split
+      · simp_all
+      · rename_i h h'
+        simpa [UInt32.not_lt.1 h] using Classical.not_and_iff_not_or_not.1 h'
+
 /-! # `assemble₄` -/

@[inline, expose]
@ -799,6 +841,14 @@ where finally
    simp only [UInt32.not_lt, UInt32.le_iff_toNat_le, UInt32.reduceToNat] at h₁ h₂
    exact Or.inr ⟨by omega, by omega⟩

+@[inline, expose]
+public def verify₄ (w x y z : UInt8) : Bool :=
+  if isInvalidContinuationByte x || isInvalidContinuationByte y || isInvalidContinuationByte z then
+    false
+  else
+    let r := assemble₄Unchecked w x y z
+    0x10000 ≤ r ∧ r ≤ 0x10ffff
+
 theorem toBitVec_assemble₄Unchecked {w x y z : UInt8} : (assemble₄Unchecked w x y z).toBitVec =
    (w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6).setWidth 32 := by
  have hw : (w &&& 7).toUInt32.toBitVec = (w.toBitVec.setWidth 3).setWidth 32 := helper₃ 3 (by decide)
@ -917,11 +967,22 @@ theorem assemble₄_eq_some_iff_utf8EncodeChar_eq {w x y z : UInt8} {c : Char} :
        Nat.reducePow, Nat.reduceMod, gt_iff_lt]
      omega

+theorem verify₄_eq_isSome_assemble₄ {w x y z : UInt8} :
+    verify₄ w x y z = (assemble₄ w x y z).isSome := by
+  simp only [verify₄, assemble₄]
+  split
+  · simp
+  · split
+    · simp_all [← UInt32.not_lt]
+    · split
+      · simp_all
+      · simp_all [← UInt32.not_lt]
+
 end ByteArray.utf8DecodeChar?

 open ByteArray.utf8DecodeChar?

-/- # `utf8DecodeChar?` -/
+/- # `utf8DecodeChar?` and `validateUtf8At` -/

@[inline, expose]
 public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char :=
@ -944,6 +1005,27 @@ public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char
      else none
  else none

+@[inline, expose]
+public def ByteArray.validateUtf8At (bytes : ByteArray) (i : Nat) : Bool :=
+  if h₀ : i < bytes.size then
+    match h : parseFirstByte bytes[i] with
+    | .invalid => false
+    | .done => verify₁ bytes[i] h
+    | .oneMore =>
+      if h₁ : i + 1 < bytes.size then
+        verify₂ bytes[i] bytes[i + 1]
+      else
+        false
+    | .twoMore =>
+      if h₁ : i + 2 < bytes.size then
+        verify₃ bytes[i] bytes[i + 1] bytes[i + 2]
+      else false
+    | .threeMore =>
+      if h₁ : i + 3 < bytes.size then
+        verify₄ bytes[i] bytes[i + 1] bytes[i + 2] bytes[i + 3]
+      else false
+  else false
+
 /-! # `utf8DecodeChar?` low-level API -/

 theorem parseFirstByte_eq_done_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char}
@ -1130,6 +1212,14 @@ public theorem String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some {b :
    rw [← assemble₄_eq_some_iff_utf8EncodeChar_eq]
    exact ⟨this, h⟩

+public theorem ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} :
+    b.validateUtf8At i = (b.utf8DecodeChar? i).isSome := by
+  simp only [validateUtf8At, utf8DecodeChar?]
+  split
+  · split <;> (try split) <;> simp [verify₁_eq_isSome_assemble₁, verify₂_eq_isSome_assemble₂,
+      verify₃_eq_isSome_assemble₃, verify₄_eq_isSome_assemble₄]
+  · simp
+
 /-! # Corollaries -/

 public theorem ByteArray.eq_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : utf8DecodeChar? b 0 = some c) :
@ -1195,6 +1285,10 @@ public theorem ByteArray.lt_size_of_isSome_utf8DecodeChar? {b : ByteArray} {i :
  have := c.utf8Size_pos
  omega

+public theorem ByteArray.lt_size_of_validateUtf8At {b : ByteArray} {i : Nat} :
+    validateUtf8At b i = true → i < b.size :=
+  validateUtf8At_eq_isSome_utf8DecodeChar? ▸ lt_size_of_isSome_utf8DecodeChar?
+
 public theorem ByteArray.utf8DecodeChar?_append_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : utf8DecodeChar? b i = some c)
    (b' : ByteArray) : utf8DecodeChar? (b ++ b') i = some c := by
  have := le_size_of_utf8DecodeChar?_eq_some h
@ -1211,6 +1305,10 @@ public theorem ByteArray.isSome_utf8DecodeChar?_append {b : ByteArray} {i : Nat}
 public def ByteArray.utf8DecodeChar (bytes : ByteArray) (i : Nat) (h : (utf8DecodeChar? bytes i).isSome) : Char :=
  (utf8DecodeChar? bytes i).get h

+public theorem ByteArray.add_utf8Size_utf8DecodeChar_le_size {b : ByteArray} {i : Nat} {h} :
+    i + (b.utf8DecodeChar i h).utf8Size ≤ b.size :=
+  le_size_of_utf8DecodeChar?_eq_some (by simp [utf8DecodeChar])
+
 public theorem ByteArray.utf8DecodeChar_eq_utf8DecodeChar_extract {b : ByteArray} {i : Nat} {h} :
    utf8DecodeChar b i h =
      utf8DecodeChar (b.extract i b.size) 0 (by rwa [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h) := by
@ -1363,6 +1461,11 @@ public theorem ByteArray.isUtf8FirstByte_of_isSome_utf8DecodeChar? {b : ByteArra
  conv => congr; congr; rw [eq_of_utf8DecodeChar?_eq_some hc]
  exact isUtf8FirstByte_getElem_zero_utf8EncodeChar_append

+public theorem ByteArray.isUtf8FirstByte_of_validateUtf8At  {b : ByteArray} {i : Nat} :
+    (h : validateUtf8At b i = true) → (b[i]'(lt_size_of_validateUtf8At h)).IsUtf8FirstByte := by
+  simp only [validateUtf8At_eq_isSome_utf8DecodeChar?]
+  exact isUtf8FirstByte_of_isSome_utf8DecodeChar?
+
 theorem Char.byteIdx_utf8ByteSize_getElem_utf8EncodeChar {c : Char} :
    (((String.utf8EncodeChar c)[0]'(by simp [c.utf8Size_pos])).utf8ByteSize
      UInt8.isUtf8FirstByte_getElem_zero_utf8EncodeChar).byteIdx = c.utf8Size := by
--- a/src/Init/Data/String/Extra.lean
+++ b/src/Init/Data/String/Extra.lean
@ -38,109 +38,16 @@ def toNat! (s : String) : Nat :=
  else
    panic! "Nat expected"

-/--
-Decodes the UTF-8 character sequence that starts at a given index in a byte array, or `none` if
-index `i` is out of bounds or is not the start of a valid UTF-8 character.
-/
-def utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char := do
-  let c ← a[i]?
-  if c &&& 0x80 == 0 then
-    some ⟨c.toUInt32, .inl (Nat.lt_trans c.toBitVec.isLt (by decide))⟩
-  else if c &&& 0xe0 == 0xc0 then
-    let c1 ← a[i+1]?
-    guard (c1 &&& 0xc0 == 0x80)
-    let r := ((c &&& 0x1f).toUInt32 <<< 6) ||| (c1 &&& 0x3f).toUInt32
-    guard (0x80 ≤ r)
-    -- TODO: Prove h from the definition of r once we have the necessary lemmas
-    if h : r < 0xd800 then some ⟨r, .inl ((UInt32.lt_ofNat_iff (by decide)).1 h)⟩ else none
-  else if c &&& 0xf0 == 0xe0 then
-    let c1 ← a[i+1]?
-    let c2 ← a[i+2]?
-    guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80)
-    let r :=
-      ((c &&& 0x0f).toUInt32 <<< 12) |||
-      ((c1 &&& 0x3f).toUInt32 <<< 6) |||
-      (c2 &&& 0x3f).toUInt32
-    guard (0x800 ≤ r)
-    -- TODO: Prove `r < 0x110000` from the definition of r once we have the necessary lemmas
-    if h : r < 0xd800 ∨ 0xdfff < r ∧ r < 0x110000 then
-      have :=
-        match h with
-        | .inl h => Or.inl ((UInt32.lt_ofNat_iff (by decide)).1 h)
-        | .inr h => Or.inr ⟨(UInt32.ofNat_lt_iff (by decide)).1 h.left, (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩
-      some ⟨r, this⟩
-    else
-      none
-  else if c &&& 0xf8 == 0xf0 then
-    let c1 ← a[i+1]?
-    let c2 ← a[i+2]?
-    let c3 ← a[i+3]?
-    guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80 && c3 &&& 0xc0 == 0x80)
-    let r :=
-      ((c &&& 0x07).toUInt32 <<< 18) |||
-      ((c1 &&& 0x3f).toUInt32 <<< 12) |||
-      ((c2 &&& 0x3f).toUInt32 <<< 6) |||
-      (c3 &&& 0x3f).toUInt32
-    if h : 0x10000 ≤ r ∧ r < 0x110000 then
-      some ⟨r, .inr ⟨Nat.lt_of_lt_of_le (by decide) ((UInt32.ofNat_le_iff (by decide)).1 h.left), (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩⟩
-    else none
-  else
-    none
+@[deprecated ByteArray.utf8DecodeChar? (since := "2025-10-01")]
+abbrev utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char :=
+  a.utf8DecodeChar? i

 /--
 Checks whether an array of bytes is a valid UTF-8 encoding of a string.
 -/
-@[extern "lean_string_validate_utf8"]
-def validateUTF8 (a : @& ByteArray) : Bool :=
-  (loop 0).isSome
-where
-  loop (i : Nat) : Option Unit := do
-    if i < a.size then
-      let c ← utf8DecodeChar? a i
-      loop (i + c.utf8Size)
-    else pure ()
-  termination_by a.size - i
-  decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos)
-
-/--
-Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
-the corresponding string.
-/
-@[extern "lean_string_from_utf8_unchecked"]
-def fromUTF8 (a : @& ByteArray) (h : validateUTF8 a) : String :=
-  loop 0 ""
-where
-  loop (i : Nat) (acc : String) : String :=
-    if i < a.size then
-      let c := (utf8DecodeChar? a i).getD default
-      loop (i + c.utf8Size) (acc.push c)
-    else acc
-  termination_by a.size - i
-  decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos)
-
-/--
-Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
-the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string.
-/
-@[inline] def fromUTF8? (a : ByteArray) : Option String :=
-  if h : validateUTF8 a then some (fromUTF8 a h) else none
-
-/--
-Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
-the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string.
-/
-@[inline] def fromUTF8! (a : ByteArray) : String :=
-  if h : validateUTF8 a then fromUTF8 a h else panic! "invalid UTF-8 string"
-
-/--
-Encodes a string in UTF-8 as an array of bytes.
-/
-@[extern "lean_string_to_utf8"]
-def toUTF8 (a : @& String) : ByteArray :=
-  a.bytes
-
-@[simp] theorem size_toUTF8 (s : String) : s.toUTF8.size = s.utf8ByteSize := by
-  rfl
+@[deprecated ByteArray.validateUTF8 (since := "2025-10-01")]
+abbrev validateUTF8 (a : ByteArray) : Bool :=
+  a.validateUTF8

 theorem Iterator.sizeOf_next_lt_of_hasNext (i : String.Iterator) (h : i.hasNext) : sizeOf i.next < sizeOf i := by
  cases i; rename_i s pos; simp [Iterator.next, Iterator.sizeOf_eq]; simp [Iterator.hasNext] at h
--- a/tests/lean/run/utf8英語.lean
+++ b/tests/lean/run/utf8英語.lean
@ -16,10 +16,10 @@ def checkGet (s : String) (arr : Array UInt8) :=
    c == arr[i]!

 macro "validate" arr:term " => " "↯" : command =>
-  `(test_extern' String.validateUTF8 $arr => false)
+  `(test_extern' ByteArray.validateUTF8 $arr => false)
 macro "validate" arr:term " => " str:term : command =>
-  `(test_extern' String.validateUTF8 $arr => true
-    test_extern' String.fromUTF8 $arr (with_decl_name% _validate by native_decide) => $str
+  `(test_extern' ByteArray.validateUTF8 $arr => true
+    test_extern' String.ofByteArray $arr (with_decl_name% _validate by native_decide) => $str
    test_extern' String.toUTF8 $str => $arr
    #guard checkGet $str ($arr : ByteArray).data)