refactor: incorporate UTF8 material from String.Extra into String.Basic (#10634)

This PR defines `ByteArray.validateUTF8`, uses it to show that
`ByteArray.IsValidUtf8` is decidable and redefines `String.fromUTF8` and
friends to use it.

The functions `String.validateUTF8` and `String.utf8DecodeChar?` are
deprecated in favor of the identically named functions in the
`ByteArray` namespace.
This commit is contained in:
Markus Himmel 2025-10-01 13:33:29 +02:00 committed by GitHub
parent 9dc1faf327
commit 5bfbe2a875
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 206 additions and 104 deletions

View file

@ -94,6 +94,55 @@ where
(have := c.utf8Size_pos; have := le_size_of_utf8DecodeChar?_eq_some h; by omega)
termination_by structural fuel
/--
Checks whether `b` is valid UTF-8 by scanning it from index `0`: at each index `validateUtf8At` is
consulted, and on success the scan advances by the `utf8ByteSize` of the byte at that index.

Returns `true` once the index reaches `b.size` and `false` as soon as `validateUtf8At` fails.
The `extern` attribute names `lean_string_validate_utf8` as the compiled implementation.
-/
@[expose, extern "lean_string_validate_utf8"]
def ByteArray.validateUTF8 (b : ByteArray) : Bool :=
-- The initial fuel `b.size + 1` is strictly larger than the `b.size - 0` bytes left to inspect.
go (b.size + 1) 0 (by simp) (by simp)
where
-- `hi` keeps the index within bounds; `hf` states that the fuel exceeds the number of bytes left.
go (fuel : Nat) (i : Nat) (hi : i ≤ b.size) (hf : b.size - i < fuel) : Bool :=
-- `hf` is matched together with `fuel`; only the successor case is listed.
match fuel, hf with
| fuel + 1, _ =>
if hi : i = b.size then
true
else
match h : validateUtf8At b i with
| false => false
-- The two `?_` holes are the new `hi` and `hf`; they are discharged in the `finally` block below.
| true => go fuel (i + (b[i].utf8ByteSize (isUtf8FirstByte_of_validateUtf8At h)).byteIdx)
?_ ?_
termination_by structural fuel
finally
-- Restate `h` in terms of `utf8DecodeChar?` so that the `utf8DecodeChar` lemmas apply.
all_goals rw [ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar?] at h
-- First hole: the advanced index is still at most `b.size`.
· rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)]
exact add_utf8Size_utf8DecodeChar_le_size
-- Second hole: the remaining length drops below the decremented fuel because `utf8Size` is positive.
· rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)]
have := add_utf8Size_utf8DecodeChar_le_size (h := h)
have := (b.utf8DecodeChar i h).utf8Size_pos
omega
/--
The fuel-driven loops behind `utf8Decode?` and `validateUTF8` agree: `utf8Decode?.go` returns `some`
exactly when `validateUTF8.go` returns `true`, for the same fuel, index and bounds proofs.
-/
theorem ByteArray.isSome_utf8Decode?Go_eq_validateUtf8Go {b : ByteArray} {fuel : Nat}
{i : Nat} {acc : Array Char} {hi : i ≤ b.size} {hf : b.size - i < fuel} :
(utf8Decode?.go b fuel i acc hi hf).isSome = validateUTF8.go b fuel i hi hf := by
-- Follow the recursion of `utf8Decode?.go`; each case below is one branch of that function.
fun_induction utf8Decode?.go with
| case1 => simp [validateUTF8.go]
| case2 i acc hi fuel hf h₁ h₂ =>
-- The decoder's result here is `none` (note `Option.isSome_none`), so validation must be `false`.
simp only [Option.isSome_none, validateUTF8.go, h₁, ↓reduceDIte, Bool.false_eq]
split
· rfl
· rename_i heq
-- A successful `validateUtf8At` is ruled out by `h₂` once rewritten via `utf8DecodeChar?`.
simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq
| case3 i acc hi fuel hf h₁ c h₂ ih =>
-- Recursive case: a character `c` is available and the induction hypothesis `ih` is used.
simp [validateUTF8.go, h₁]
split
· rename_i heq
simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq
· rw [ih]
congr
-- Reconcile the step taken by `validateUTF8.go` (`utf8ByteSize`) with the `utf8Size` of `c`.
rw [← ByteArray.utf8Size_utf8DecodeChar (h := by simp [h₂])]
simp [utf8DecodeChar, h₂]
/--
`b.utf8Decode?` succeeds exactly when `b.validateUTF8` returns `true`. This is the statement about
the inner loops, `isSome_utf8Decode?Go_eq_validateUtf8Go`, applied at the top level.
-/
theorem ByteArray.isSome_utf8Decode?_eq_validateUtf8 {b : ByteArray} :
b.utf8Decode?.isSome = b.validateUTF8 :=
b.isSome_utf8Decode?Go_eq_validateUtf8Go
theorem ByteArray.utf8Decode?.go.congr {b b' : ByteArray} {fuel fuel' i i' : Nat} {acc acc' : Array Char} {hi hi' hf hf'}
(hbb' : b = b') (hii' : i = i') (hacc : acc = acc') :
ByteArray.utf8Decode?.go b fuel i acc hi hf = ByteArray.utf8Decode?.go b' fuel' i' acc' hi' hf' := by
@ -151,7 +200,50 @@ theorem ByteArray.isSome_utf8Decode?_iff {b : ByteArray} :
rw [utf8Decode?, isSome_utf8Decode?go_iff, extract_zero_size]
@[simp]
-- The byte array underlying the empty string is `ByteArray.empty`; this holds by `rfl`.
theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl)
-- `validateUTF8` returns `true` exactly for the byte arrays satisfying `IsValidUtf8`.
-- The proof goes through `utf8Decode?`: validation agrees with `utf8Decode?.isSome`, which in turn
-- is characterised by `isSome_utf8Decode?_iff`.
theorem ByteArray.validateUTF8_eq_true_iff {b : ByteArray} :
b.validateUTF8 = true ↔ IsValidUtf8 b := by
rw [← isSome_utf8Decode?_eq_validateUtf8, isSome_utf8Decode?_iff]
/--
`validateUTF8` returns `false` exactly for the byte arrays that do not satisfy `IsValidUtf8`.
-/
@[simp]
theorem ByteArray.validateUTF8_eq_false_iff {b : ByteArray} :
b.validateUTF8 = false ↔ ¬ IsValidUtf8 b := by
-- Turn `= false` into a negated `= true` and let `simp` finish.
simp [← Bool.not_eq_true]
/--
`b.IsValidUtf8` is decidable: the decision is made by evaluating `b.validateUTF8` and transported
along `ByteArray.validateUTF8_eq_true_iff`.
-/
instance {b : ByteArray} : Decidable b.IsValidUtf8 :=
decidable_of_iff (b.validateUTF8 = true) ByteArray.validateUTF8_eq_true_iff
/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string.

The caller supplies the proof `h` that `a` is valid UTF-8. This definition performs no check of its
own; it hands `a` and `h` to `String.ofByteArray`.
-/
@[inline, expose]
def String.fromUTF8 (a : @& ByteArray) (h : a.IsValidUtf8) : String :=
.ofByteArray a h
/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string.

Validity is determined through the `Decidable` instance for `a.IsValidUtf8`; the resulting proof is
passed to `fromUTF8`.
-/
@[inline, expose] def String.fromUTF8? (a : ByteArray) : Option String :=
if h : a.IsValidUtf8 then some (fromUTF8 a h) else none
/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string.

Validity is determined through the `Decidable` instance for `a.IsValidUtf8`; on failure the panic
message is `"invalid UTF-8 string"`.
-/
@[inline, expose] def String.fromUTF8! (a : ByteArray) : String :=
if h : a.IsValidUtf8 then fromUTF8 a h else panic! "invalid UTF-8 string"
/--
Encodes a string in UTF-8 as an array of bytes.

The reference implementation returns `a.bytes`; the `extern` attribute names `lean_string_to_utf8`
as the compiled implementation.
-/
@[extern "lean_string_to_utf8"]
def String.toUTF8 (a : @& String) : ByteArray :=
a.bytes
/-- `s.toUTF8` is `s.bytes`; as a `simp` lemma it rewrites `toUTF8` to `bytes`. Holds by `rfl`. -/
@[simp] theorem String.toUTF8_eq_bytes {s : String} : s.toUTF8 = s.bytes := (rfl)
/-- The byte array underlying the empty string is `ByteArray.empty`. Holds by `rfl`. -/
@[simp] theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl)
/--
Appends two strings. Usually accessed via the `++` operator.

View file

@ -499,6 +499,14 @@ theorem assemble₁_eq_some_iff_utf8EncodeChar_eq {w : UInt8} {c : Char} :
omega
simpa [String.utf8EncodeChar_eq_singleton hc, assemble₁, Char.ext_iff, ← UInt32.toNat_inj]
/--
Validation counterpart of `assemble₁` for a first byte whose `parseFirstByte` result is `.done`:
always returns `true`, i.e. no further check is performed for such a byte.

NOTE(review): the explicit argument is named `_w`, whereas the hypothesis `_h` mentions `w`. Unless
`w` is declared elsewhere, it is presumably an auto-bound implicit — confirm that this is intended.
-/
@[inline, expose]
public def verify₁ (_w : UInt8) (_h : parseFirstByte w = .done) : Bool :=
true
/-- `verify₁` agrees with `assemble₁`: it returns `true` exactly when `assemble₁` returns `some`. -/
theorem verify₁_eq_isSome_assemble₁ {w : UInt8} {h : parseFirstByte w = .done} :
verify₁ w h = (assemble₁ w h).isSome := by
simp [verify₁, assemble₁]
/-! # `assemble₂` -/
@[inline, expose]
@ -531,6 +539,14 @@ where finally
rw [Nat.shiftLeft_eq, Nat.mod_eq_of_lt (by omega), Nat.mul_comm, ← Nat.two_pow_add_eq_or_of_lt hb₁]
omega
/--
Validation counterpart of `assemble₂` for a first byte `w` followed by `x`.

Returns `false` if `x` is an invalid continuation byte; otherwise returns whether the value
produced by `assemble₂Unchecked w x` is at least `0x80`.
-/
@[inline, expose]
public def verify₂ (w x : UInt8) : Bool :=
if isInvalidContinuationByte x then
false
else
let r := assemble₂Unchecked w x
-- Values below `0x80` are not accepted from a two-byte sequence.
0x80 ≤ r
theorem helper₃ {x : UInt8} (n : Nat) (hn : n < 8) :
(x &&& UInt8.ofNat (2 ^ n - 1)).toUInt32.toBitVec = (x.toBitVec.setWidth n).setWidth 32 := by
apply BitVec.eq_of_toNat_eq
@ -628,6 +644,12 @@ theorem assemble₂_eq_some_iff_utf8EncodeChar_eq {x y : UInt8} {c : Char} :
BitVec.setWidth_setWidth_eq_self]
simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_two_iff.1 hc).2
/-- `verify₂` agrees with `assemble₂`: it returns `true` exactly when `assemble₂` returns `some`. -/
theorem verify₂_eq_isSome_assemble₂ {w x : UInt8} : verify₂ w x = (assemble₂ w x).isSome := by
simp only [verify₂, assemble₂]
-- Case split on the first `if` (the continuation-byte check), then on the remaining one.
split
· simp
· split <;> simp_all
/-! # `assemble₃` -/
@[inline, expose]
@ -671,6 +693,14 @@ where finally
← Nat.two_pow_add_eq_or_of_lt (by omega)]
omega
/--
Validation counterpart of `assemble₃` for a first byte `w` followed by `x` and `y`.

Returns `false` if `x` or `y` is an invalid continuation byte. Otherwise the value `r` produced by
`assemble₃Unchecked w x y` must be at least `0x800` and must lie outside the range
`0xd800`–`0xdfff` (the UTF-16 surrogate range, which is not valid in UTF-8).
-/
@[inline, expose]
public def verify₃ (w x y : UInt8) : Bool :=
  if isInvalidContinuationByte x || isInvalidContinuationByte y then
    false
  else
    let r := assemble₃Unchecked w x y
    -- Lower bound for a three-byte sequence, and `r` is either below or above the excluded range.
    0x800 ≤ r ∧ (r < 0xd800 ∨ 0xdfff < r)
theorem toBitVec_assemble₃Unchecked {w x y : UInt8} : (assemble₃Unchecked w x y).toBitVec =
(w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6).setWidth 32 := by
have hw : (w &&& 15).toUInt32.toBitVec = (w.toBitVec.setWidth 4).setWidth 32 := helper₃ 4 (by decide)
@ -772,6 +802,18 @@ theorem assemble₃_eq_some_iff_utf8EncodeChar_eq {w x y : UInt8} {c : Char} :
← BitVec.setWidth_eq_extractLsb' (by simp), BitVec.setWidth_setWidth_eq_self]
simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_three_iff.1 hc).2
/-- `verify₃` agrees with `assemble₃`: it returns `true` exactly when `assemble₃` returns `some`. -/
theorem verify₃_eq_isSome_assemble₃ {w x y : UInt8} :
verify₃ w x y = (assemble₃ w x y).isSome := by
simp only [verify₃, assemble₃]
-- Split along the successive `if`s of the unfolded definitions, starting with the
-- continuation-byte check.
split
· simp
· split
· simp_all [← UInt32.not_le]
· split
· simp_all
· rename_i h h'
-- `h'` is a negated conjunction; turn it into a disjunction of negations to close the goal.
simpa [UInt32.not_lt.1 h] using Classical.not_and_iff_not_or_not.1 h'
/-! # `assemble₄` -/
@[inline, expose]
@ -799,6 +841,14 @@ where finally
simp only [UInt32.not_lt, UInt32.le_iff_toNat_le, UInt32.reduceToNat] at h₁ h₂
exact Or.inr ⟨by omega, by omega⟩
/--
Validation counterpart of `assemble₄` for a first byte `w` followed by `x`, `y` and `z`.

Returns `false` if any of `x`, `y`, `z` is an invalid continuation byte; otherwise returns whether
the value produced by `assemble₄Unchecked w x y z` lies in the range `0x10000`–`0x10ffff`
(both ends included).
-/
@[inline, expose]
public def verify₄ (w x y z : UInt8) : Bool :=
if isInvalidContinuationByte x || isInvalidContinuationByte y || isInvalidContinuationByte z then
false
else
let r := assemble₄Unchecked w x y z
0x10000 ≤ r ∧ r ≤ 0x10ffff
theorem toBitVec_assemble₄Unchecked {w x y z : UInt8} : (assemble₄Unchecked w x y z).toBitVec =
(w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6).setWidth 32 := by
have hw : (w &&& 7).toUInt32.toBitVec = (w.toBitVec.setWidth 3).setWidth 32 := helper₃ 3 (by decide)
@ -917,11 +967,22 @@ theorem assemble₄_eq_some_iff_utf8EncodeChar_eq {w x y z : UInt8} {c : Char} :
Nat.reducePow, Nat.reduceMod, gt_iff_lt]
omega
/-- `verify₄` agrees with `assemble₄`: it returns `true` exactly when `assemble₄` returns `some`. -/
theorem verify₄_eq_isSome_assemble₄ {w x y z : UInt8} :
verify₄ w x y z = (assemble₄ w x y z).isSome := by
simp only [verify₄, assemble₄]
-- Split along the successive `if`s of the unfolded definitions, starting with the
-- continuation-byte check.
split
· simp
· split
· simp_all [← UInt32.not_lt]
· split
· simp_all
· simp_all [← UInt32.not_lt]
end ByteArray.utf8DecodeChar?
open ByteArray.utf8DecodeChar?
/- # `utf8DecodeChar?` -/
/-! # `utf8DecodeChar?` and `validateUtf8At` -/
@[inline, expose]
public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char :=
@ -944,6 +1005,27 @@ public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char
else none
else none
/--
Checks whether a valid UTF-8 encoded character starts at index `i` of `bytes`, without building the
character.

Returns `false` if `i` is out of bounds. Otherwise the byte at `i` is classified by
`parseFirstByte`, and the matching `verify₁`–`verify₄` check is run on that byte and the one to
three bytes after it, provided that they are all in bounds. The agreement with `utf8DecodeChar?`
is stated in `validateUtf8At_eq_isSome_utf8DecodeChar?`.
-/
@[inline, expose]
public def ByteArray.validateUtf8At (bytes : ByteArray) (i : Nat) : Bool :=
if h₀ : i < bytes.size then
match h : parseFirstByte bytes[i] with
| .invalid => false
-- No further bytes are inspected.
| .done => verify₁ bytes[i] h
-- One more byte is inspected; it must be in bounds.
| .oneMore =>
if h₁ : i + 1 < bytes.size then
verify₂ bytes[i] bytes[i + 1]
else
false
-- Two more bytes are inspected; `i + 2` in bounds covers `i + 1` as well.
| .twoMore =>
if h₁ : i + 2 < bytes.size then
verify₃ bytes[i] bytes[i + 1] bytes[i + 2]
else false
-- Three more bytes are inspected.
| .threeMore =>
if h₁ : i + 3 < bytes.size then
verify₄ bytes[i] bytes[i + 1] bytes[i + 2] bytes[i + 3]
else false
else false
/-! # `utf8DecodeChar?` low-level API -/
theorem parseFirstByte_eq_done_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char}
@ -1130,6 +1212,14 @@ public theorem String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some {b :
rw [← assemble₄_eq_some_iff_utf8EncodeChar_eq]
exact ⟨this, h⟩
/--
`validateUtf8At` is the Boolean shadow of `utf8DecodeChar?`: validation at `i` succeeds exactly when
decoding at `i` returns `some`.
-/
public theorem ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} :
b.validateUtf8At i = (b.utf8DecodeChar? i).isSome := by
simp only [validateUtf8At, utf8DecodeChar?]
-- Outer split: whether `i` is in bounds. Inner splits: the `parseFirstByte` result and, where
-- present, the bounds check; each leaf is closed by the matching `verifyₙ_eq_isSome_assembleₙ`.
split
· split <;> (try split) <;> simp [verify₁_eq_isSome_assemble₁, verify₂_eq_isSome_assemble₂,
verify₃_eq_isSome_assemble₃, verify₄_eq_isSome_assemble₄]
· simp
/-! # Corollaries -/
public theorem ByteArray.eq_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : utf8DecodeChar? b 0 = some c) :
@ -1195,6 +1285,10 @@ public theorem ByteArray.lt_size_of_isSome_utf8DecodeChar? {b : ByteArray} {i :
have := c.utf8Size_pos
omega
/--
If validation at index `i` succeeds, then `i` is in bounds. Obtained from
`lt_size_of_isSome_utf8DecodeChar?` by rewriting with `validateUtf8At_eq_isSome_utf8DecodeChar?`.
-/
public theorem ByteArray.lt_size_of_validateUtf8At {b : ByteArray} {i : Nat} :
validateUtf8At b i = true → i < b.size :=
validateUtf8At_eq_isSome_utf8DecodeChar? ▸ lt_size_of_isSome_utf8DecodeChar?
public theorem ByteArray.utf8DecodeChar?_append_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : utf8DecodeChar? b i = some c)
(b' : ByteArray) : utf8DecodeChar? (b ++ b') i = some c := by
have := le_size_of_utf8DecodeChar?_eq_some h
@ -1211,6 +1305,10 @@ public theorem ByteArray.isSome_utf8DecodeChar?_append {b : ByteArray} {i : Nat}
/--
Decodes the character starting at index `i` of `bytes`, given a proof `h` that `utf8DecodeChar?`
succeeds there. The result is the value inside that `some`, extracted with `Option.get`.
-/
public def ByteArray.utf8DecodeChar (bytes : ByteArray) (i : Nat) (h : (utf8DecodeChar? bytes i).isSome) : Char :=
(utf8DecodeChar? bytes i).get h
/--
The character decoded at index `i` fits inside the array: `i` plus its `utf8Size` is at most
`b.size`. This is `le_size_of_utf8DecodeChar?_eq_some` restated for `utf8DecodeChar`.
-/
public theorem ByteArray.add_utf8Size_utf8DecodeChar_le_size {b : ByteArray} {i : Nat} {h} :
i + (b.utf8DecodeChar i h).utf8Size ≤ b.size :=
le_size_of_utf8DecodeChar?_eq_some (by simp [utf8DecodeChar])
public theorem ByteArray.utf8DecodeChar_eq_utf8DecodeChar_extract {b : ByteArray} {i : Nat} {h} :
utf8DecodeChar b i h =
utf8DecodeChar (b.extract i b.size) 0 (by rwa [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h) := by
@ -1363,6 +1461,11 @@ public theorem ByteArray.isUtf8FirstByte_of_isSome_utf8DecodeChar? {b : ByteArra
conv => congr; congr; rw [eq_of_utf8DecodeChar?_eq_some hc]
exact isUtf8FirstByte_getElem_zero_utf8EncodeChar_append
/--
If validation at index `i` succeeds, then the byte at `i` satisfies `IsUtf8FirstByte`. The index is
in bounds by `lt_size_of_validateUtf8At`.
-/
public theorem ByteArray.isUtf8FirstByte_of_validateUtf8At {b : ByteArray} {i : Nat} :
(h : validateUtf8At b i = true) → (b[i]'(lt_size_of_validateUtf8At h)).IsUtf8FirstByte := by
-- Restate the hypothesis in terms of `utf8DecodeChar?` and reuse the corresponding lemma.
simp only [validateUtf8At_eq_isSome_utf8DecodeChar?]
exact isUtf8FirstByte_of_isSome_utf8DecodeChar?
theorem Char.byteIdx_utf8ByteSize_getElem_utf8EncodeChar {c : Char} :
(((String.utf8EncodeChar c)[0]'(by simp [c.utf8Size_pos])).utf8ByteSize
UInt8.isUtf8FirstByte_getElem_zero_utf8EncodeChar).byteIdx = c.utf8Size := by

View file

@ -38,109 +38,16 @@ def toNat! (s : String) : Nat :=
else
panic! "Nat expected"
/--
Decodes the UTF-8 character sequence that starts at a given index in a byte array, or `none` if
index `i` is out of bounds or is not the start of a valid UTF-8 character.

The first byte `c` selects the length of the sequence (one to four bytes). Every following byte
must have the form `0b10xxxxxx`, and the assembled value `r` must be large enough for the length
of the sequence and must be a valid `Char` value; otherwise the result is `none`.
-/
def utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char := do
  let c ← a[i]?
  if c &&& 0x80 == 0 then
    -- One byte: `0xxxxxxx`.
    some ⟨c.toUInt32, .inl (Nat.lt_trans c.toBitVec.isLt (by decide))⟩
  else if c &&& 0xe0 == 0xc0 then
    -- Two bytes: `110xxxxx 10xxxxxx`.
    let c1 ← a[i+1]?
    guard (c1 &&& 0xc0 == 0x80)
    let r := ((c &&& 0x1f).toUInt32 <<< 6) ||| (c1 &&& 0x3f).toUInt32
    guard (0x80 ≤ r)
    -- TODO: Prove h from the definition of r once we have the necessary lemmas
    if h : r < 0xd800 then some ⟨r, .inl ((UInt32.lt_ofNat_iff (by decide)).1 h)⟩ else none
  else if c &&& 0xf0 == 0xe0 then
    -- Three bytes: `1110xxxx 10xxxxxx 10xxxxxx`.
    let c1 ← a[i+1]?
    let c2 ← a[i+2]?
    guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80)
    let r :=
      ((c &&& 0x0f).toUInt32 <<< 12) |||
      ((c1 &&& 0x3f).toUInt32 <<< 6) |||
      (c2 &&& 0x3f).toUInt32
    guard (0x800 ≤ r)
    -- TODO: Prove `r < 0x110000` from the definition of r once we have the necessary lemmas
    -- `∧` binds tighter than `∨`: either `r` is below `0xd800`, or it is above `0xdfff` and below
    -- `0x110000`. The two alternatives are the `.inl` and `.inr` cases of the `match` below.
    if h : r < 0xd800 ∨ 0xdfff < r ∧ r < 0x110000 then
      have :=
        match h with
        | .inl h => Or.inl ((UInt32.lt_ofNat_iff (by decide)).1 h)
        | .inr h => Or.inr ⟨(UInt32.ofNat_lt_iff (by decide)).1 h.left, (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩
      some ⟨r, this⟩
    else
      none
  else if c &&& 0xf8 == 0xf0 then
    -- Four bytes: `11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`.
    let c1 ← a[i+1]?
    let c2 ← a[i+2]?
    let c3 ← a[i+3]?
    guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80 && c3 &&& 0xc0 == 0x80)
    let r :=
      ((c &&& 0x07).toUInt32 <<< 18) |||
      ((c1 &&& 0x3f).toUInt32 <<< 12) |||
      ((c2 &&& 0x3f).toUInt32 <<< 6) |||
      (c3 &&& 0x3f).toUInt32
    if h : 0x10000 ≤ r ∧ r < 0x110000 then
      some ⟨r, .inr ⟨Nat.lt_of_lt_of_le (by decide) ((UInt32.ofNat_le_iff (by decide)).1 h.left), (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩⟩
    else none
  else
    none
/--
Deprecated alias that forwards to `ByteArray.utf8DecodeChar?`; use that function instead.
-/
@[deprecated ByteArray.utf8DecodeChar? (since := "2025-10-01")]
abbrev utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char :=
a.utf8DecodeChar? i
/--
Checks whether an array of bytes is a valid UTF-8 encoding of a string.

The reference implementation decodes one character at a time with `utf8DecodeChar?`, starting at
index `0`, and succeeds if it gets past the end of the array without a decoding failure. The
`extern` attribute names `lean_string_validate_utf8` as the compiled implementation.
-/
@[extern "lean_string_validate_utf8"]
def validateUTF8 (a : @& ByteArray) : Bool :=
(loop 0).isSome
where
-- Returns `some ()` once `i` is no longer below `a.size`, and `none` if decoding at `i` fails.
loop (i : Nat) : Option Unit := do
if i < a.size then
let c ← utf8DecodeChar? a i
loop (i + c.utf8Size)
else pure ()
-- The number of remaining bytes decreases because `c.utf8Size` is positive.
termination_by a.size - i
decreasing_by exact Nat.sub_lt_sub_left _ (Nat.lt_add_of_pos_right c.utf8Size_pos)
/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string.

The proof `h` is not used by the reference implementation, which decodes one character at a time
and pushes it onto an accumulator. The `extern` attribute names `lean_string_from_utf8_unchecked`
as the compiled implementation.
-/
@[extern "lean_string_from_utf8_unchecked"]
def fromUTF8 (a : @& ByteArray) (h : validateUTF8 a) : String :=
loop 0 ""
where
loop (i : Nat) (acc : String) : String :=
if i < a.size then
-- A decoding failure falls back to the `default` character instead of aborting.
let c := (utf8DecodeChar? a i).getD default
loop (i + c.utf8Size) (acc.push c)
else acc
-- The number of remaining bytes decreases because `c.utf8Size` is positive.
termination_by a.size - i
decreasing_by exact Nat.sub_lt_sub_left _ (Nat.lt_add_of_pos_right c.utf8Size_pos)
/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string.

Validity is checked by running `validateUTF8 a`; the resulting proof is passed to `fromUTF8`.
-/
@[inline] def fromUTF8? (a : ByteArray) : Option String :=
if h : validateUTF8 a then some (fromUTF8 a h) else none
/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string.

Validity is checked by running `validateUTF8 a`; on failure the panic message is
`"invalid UTF-8 string"`.
-/
@[inline] def fromUTF8! (a : ByteArray) : String :=
if h : validateUTF8 a then fromUTF8 a h else panic! "invalid UTF-8 string"
/--
Encodes a string in UTF-8 as an array of bytes.

The reference implementation returns `a.bytes`; the `extern` attribute names `lean_string_to_utf8`
as the compiled implementation.
-/
@[extern "lean_string_to_utf8"]
def toUTF8 (a : @& String) : ByteArray :=
a.bytes
/-- The UTF-8 encoding of `s` has exactly `s.utf8ByteSize` bytes. Holds by `rfl`. -/
@[simp] theorem size_toUTF8 (s : String) : s.toUTF8.size = s.utf8ByteSize := by
rfl
/--
Deprecated alias that forwards to `ByteArray.validateUTF8`; use that function instead.
-/
@[deprecated ByteArray.validateUTF8 (since := "2025-10-01")]
abbrev validateUTF8 (a : ByteArray) : Bool :=
a.validateUTF8
theorem Iterator.sizeOf_next_lt_of_hasNext (i : String.Iterator) (h : i.hasNext) : sizeOf i.next < sizeOf i := by
cases i; rename_i s pos; simp [Iterator.next, Iterator.sizeOf_eq]; simp [Iterator.hasNext] at h

View file

@ -16,10 +16,10 @@ def checkGet (s : String) (arr : Array UInt8) :=
c == arr[i]!
-- `validate arr => ↯` expects validation of `arr` to fail: it expands to a `test_extern'` check
-- that the validation function returns `false` on `arr`.
macro "validate" arr:term " => " "↯" : command =>
`(test_extern' String.validateUTF8 $arr => false)
`(test_extern' ByteArray.validateUTF8 $arr => false)
-- `validate arr => str` expects `arr` to be the UTF-8 encoding of `str`. It expands to
-- `test_extern'` checks that `arr` validates, that decoding `arr` gives `str` and that encoding
-- `str` gives `arr` back, followed by a `#guard` running `checkGet` on `str` and the bytes of `arr`.
-- The validity proof needed for decoding is produced by `native_decide`.
macro "validate" arr:term " => " str:term : command =>
`(test_extern' String.validateUTF8 $arr => true
test_extern' String.fromUTF8 $arr (with_decl_name% _validate by native_decide) => $str
`(test_extern' ByteArray.validateUTF8 $arr => true
test_extern' String.ofByteArray $arr (with_decl_name% _validate by native_decide) => $str
test_extern' String.toUTF8 $str => $arr
#guard checkGet $str ($arr : ByteArray).data)