refactor: incorporate UTF8 material from String.Extra into String.Basic (#10634)
This PR defines `ByteArray.validateUTF8`, uses it to show that `ByteArray.IsValidUtf8` is decidable and redefines `String.fromUTF8` and friends to use it. The functions `String.validateUTF8` and `String.utf8DecodeChar?` are deprecated in favor of the identically named functions in the `ByteArray` namespace.
This commit is contained in:
parent
9dc1faf327
commit
5bfbe2a875
4 changed files with 206 additions and 104 deletions
|
|
@ -94,6 +94,55 @@ where
|
|||
(have := c.utf8Size_pos; have := le_size_of_utf8DecodeChar?_eq_some h; by omega)
|
||||
termination_by structural fuel
|
||||
|
||||
@[expose, extern "lean_string_validate_utf8"]
|
||||
def ByteArray.validateUTF8 (b : ByteArray) : Bool :=
|
||||
go (b.size + 1) 0 (by simp) (by simp)
|
||||
where
|
||||
go (fuel : Nat) (i : Nat) (hi : i ≤ b.size) (hf : b.size - i < fuel) : Bool :=
|
||||
match fuel, hf with
|
||||
| fuel + 1, _ =>
|
||||
if hi : i = b.size then
|
||||
true
|
||||
else
|
||||
match h : validateUtf8At b i with
|
||||
| false => false
|
||||
| true => go fuel (i + (b[i].utf8ByteSize (isUtf8FirstByte_of_validateUtf8At h)).byteIdx)
|
||||
?_ ?_
|
||||
termination_by structural fuel
|
||||
finally
|
||||
all_goals rw [ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar?] at h
|
||||
· rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)]
|
||||
exact add_utf8Size_utf8DecodeChar_le_size
|
||||
· rw [← ByteArray.utf8Size_utf8DecodeChar (h := h)]
|
||||
have := add_utf8Size_utf8DecodeChar_le_size (h := h)
|
||||
have := (b.utf8DecodeChar i h).utf8Size_pos
|
||||
omega
|
||||
|
||||
theorem ByteArray.isSome_utf8Decode?Go_eq_validateUtf8Go {b : ByteArray} {fuel : Nat}
|
||||
{i : Nat} {acc : Array Char} {hi : i ≤ b.size} {hf : b.size - i < fuel} :
|
||||
(utf8Decode?.go b fuel i acc hi hf).isSome = validateUTF8.go b fuel i hi hf := by
|
||||
fun_induction utf8Decode?.go with
|
||||
| case1 => simp [validateUTF8.go]
|
||||
| case2 i acc hi fuel hf h₁ h₂ =>
|
||||
simp only [Option.isSome_none, validateUTF8.go, h₁, ↓reduceDIte, Bool.false_eq]
|
||||
split
|
||||
· rfl
|
||||
· rename_i heq
|
||||
simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq
|
||||
| case3 i acc hi fuel hf h₁ c h₂ ih =>
|
||||
simp [validateUTF8.go, h₁]
|
||||
split
|
||||
· rename_i heq
|
||||
simp [validateUtf8At_eq_isSome_utf8DecodeChar?, h₂] at heq
|
||||
· rw [ih]
|
||||
congr
|
||||
rw [← ByteArray.utf8Size_utf8DecodeChar (h := by simp [h₂])]
|
||||
simp [utf8DecodeChar, h₂]
|
||||
|
||||
theorem ByteArray.isSome_utf8Decode?_eq_validateUtf8 {b : ByteArray} :
|
||||
b.utf8Decode?.isSome = b.validateUTF8 :=
|
||||
b.isSome_utf8Decode?Go_eq_validateUtf8Go
|
||||
|
||||
theorem ByteArray.utf8Decode?.go.congr {b b' : ByteArray} {fuel fuel' i i' : Nat} {acc acc' : Array Char} {hi hi' hf hf'}
|
||||
(hbb' : b = b') (hii' : i = i') (hacc : acc = acc') :
|
||||
ByteArray.utf8Decode?.go b fuel i acc hi hf = ByteArray.utf8Decode?.go b' fuel' i' acc' hi' hf' := by
|
||||
|
|
@ -151,7 +200,50 @@ theorem ByteArray.isSome_utf8Decode?_iff {b : ByteArray} :
|
|||
rw [utf8Decode?, isSome_utf8Decode?go_iff, extract_zero_size]
|
||||
|
||||
@[simp]
|
||||
theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl)
|
||||
theorem ByteArray.validateUTF8_eq_true_iff {b : ByteArray} :
|
||||
b.validateUTF8 = true ↔ IsValidUtf8 b := by
|
||||
rw [← isSome_utf8Decode?_eq_validateUtf8, isSome_utf8Decode?_iff]
|
||||
|
||||
@[simp]
|
||||
theorem ByteArray.validateUTF8_eq_false_iff {b : ByteArray} :
|
||||
b.validateUTF8 = false ↔ ¬ IsValidUtf8 b := by
|
||||
simp [← Bool.not_eq_true]
|
||||
|
||||
instance {b : ByteArray} : Decidable b.IsValidUtf8 :=
|
||||
decidable_of_iff (b.validateUTF8 = true) ByteArray.validateUTF8_eq_true_iff
|
||||
|
||||
/--
|
||||
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
|
||||
the corresponding string.
|
||||
-/
|
||||
@[inline, expose]
|
||||
def String.fromUTF8 (a : @& ByteArray) (h : a.IsValidUtf8) : String :=
|
||||
.ofByteArray a h
|
||||
|
||||
/--
|
||||
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
|
||||
the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string.
|
||||
-/
|
||||
@[inline, expose] def String.fromUTF8? (a : ByteArray) : Option String :=
|
||||
if h : a.IsValidUtf8 then some (fromUTF8 a h) else none
|
||||
|
||||
/--
|
||||
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
|
||||
the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string.
|
||||
-/
|
||||
@[inline, expose] def String.fromUTF8! (a : ByteArray) : String :=
|
||||
if h : a.IsValidUtf8 then fromUTF8 a h else panic! "invalid UTF-8 string"
|
||||
|
||||
/--
|
||||
Encodes a string in UTF-8 as an array of bytes.
|
||||
-/
|
||||
@[extern "lean_string_to_utf8"]
|
||||
def String.toUTF8 (a : @& String) : ByteArray :=
|
||||
a.bytes
|
||||
|
||||
@[simp] theorem String.toUTF8_eq_bytes {s : String} : s.toUTF8 = s.bytes := (rfl)
|
||||
|
||||
@[simp] theorem String.bytes_empty : "".bytes = ByteArray.empty := (rfl)
|
||||
|
||||
/--
|
||||
Appends two strings. Usually accessed via the `++` operator.
|
||||
|
|
|
|||
|
|
@ -499,6 +499,14 @@ theorem assemble₁_eq_some_iff_utf8EncodeChar_eq {w : UInt8} {c : Char} :
|
|||
omega
|
||||
simpa [String.utf8EncodeChar_eq_singleton hc, assemble₁, Char.ext_iff, ← UInt32.toNat_inj]
|
||||
|
||||
@[inline, expose]
|
||||
public def verify₁ (_w : UInt8) (_h : parseFirstByte w = .done) : Bool :=
|
||||
true
|
||||
|
||||
theorem verify₁_eq_isSome_assemble₁ {w : UInt8} {h : parseFirstByte w = .done} :
|
||||
verify₁ w h = (assemble₁ w h).isSome := by
|
||||
simp [verify₁, assemble₁]
|
||||
|
||||
/-! # `assemble₂` -/
|
||||
|
||||
@[inline, expose]
|
||||
|
|
@ -531,6 +539,14 @@ where finally
|
|||
rw [Nat.shiftLeft_eq, Nat.mod_eq_of_lt (by omega), Nat.mul_comm, ← Nat.two_pow_add_eq_or_of_lt hb₁]
|
||||
omega
|
||||
|
||||
@[inline, expose]
|
||||
public def verify₂ (w x : UInt8) : Bool :=
|
||||
if isInvalidContinuationByte x then
|
||||
false
|
||||
else
|
||||
let r := assemble₂Unchecked w x
|
||||
0x80 ≤ r
|
||||
|
||||
theorem helper₃ {x : UInt8} (n : Nat) (hn : n < 8) :
|
||||
(x &&& UInt8.ofNat (2 ^ n - 1)).toUInt32.toBitVec = (x.toBitVec.setWidth n).setWidth 32 := by
|
||||
apply BitVec.eq_of_toNat_eq
|
||||
|
|
@ -628,6 +644,12 @@ theorem assemble₂_eq_some_iff_utf8EncodeChar_eq {x y : UInt8} {c : Char} :
|
|||
BitVec.setWidth_setWidth_eq_self]
|
||||
simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_two_iff.1 hc).2
|
||||
|
||||
theorem verify₂_eq_isSome_assemble₂ {w x : UInt8} : verify₂ w x = (assemble₂ w x).isSome := by
|
||||
simp only [verify₂, assemble₂]
|
||||
split
|
||||
· simp
|
||||
· split <;> simp_all
|
||||
|
||||
/-! # `assemble₃` -/
|
||||
|
||||
@[inline, expose]
|
||||
|
|
@ -671,6 +693,14 @@ where finally
|
|||
← Nat.two_pow_add_eq_or_of_lt (by omega)]
|
||||
omega
|
||||
|
||||
@[inline, expose]
|
||||
public def verify₃ (w x y : UInt8) : Bool :=
|
||||
if isInvalidContinuationByte x || isInvalidContinuationByte y then
|
||||
false
|
||||
else
|
||||
let r := assemble₃Unchecked w x y
|
||||
0x800 ≤ r ∧ (r < 0xd800 ∨ 0xdfff < r)
|
||||
|
||||
theorem toBitVec_assemble₃Unchecked {w x y : UInt8} : (assemble₃Unchecked w x y).toBitVec =
|
||||
(w.toBitVec.setWidth 4 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6).setWidth 32 := by
|
||||
have hw : (w &&& 15).toUInt32.toBitVec = (w.toBitVec.setWidth 4).setWidth 32 := helper₃ 4 (by decide)
|
||||
|
|
@ -772,6 +802,18 @@ theorem assemble₃_eq_some_iff_utf8EncodeChar_eq {w x y : UInt8} {c : Char} :
|
|||
← BitVec.setWidth_eq_extractLsb' (by simp), BitVec.setWidth_setWidth_eq_self]
|
||||
simpa [BitVec.lt_def, UInt32.le_iff_toNat_le] using Nat.lt_succ_iff.2 (Char.utf8Size_eq_three_iff.1 hc).2
|
||||
|
||||
theorem verify₃_eq_isSome_assemble₃ {w x y : UInt8} :
|
||||
verify₃ w x y = (assemble₃ w x y).isSome := by
|
||||
simp only [verify₃, assemble₃]
|
||||
split
|
||||
· simp
|
||||
· split
|
||||
· simp_all [← UInt32.not_le]
|
||||
· split
|
||||
· simp_all
|
||||
· rename_i h h'
|
||||
simpa [UInt32.not_lt.1 h] using Classical.not_and_iff_not_or_not.1 h'
|
||||
|
||||
/-! # `assemble₄` -/
|
||||
|
||||
@[inline, expose]
|
||||
|
|
@ -799,6 +841,14 @@ where finally
|
|||
simp only [UInt32.not_lt, UInt32.le_iff_toNat_le, UInt32.reduceToNat] at h₁ h₂
|
||||
exact Or.inr ⟨by omega, by omega⟩
|
||||
|
||||
@[inline, expose]
|
||||
public def verify₄ (w x y z : UInt8) : Bool :=
|
||||
if isInvalidContinuationByte x || isInvalidContinuationByte y || isInvalidContinuationByte z then
|
||||
false
|
||||
else
|
||||
let r := assemble₄Unchecked w x y z
|
||||
0x10000 ≤ r ∧ r ≤ 0x10ffff
|
||||
|
||||
theorem toBitVec_assemble₄Unchecked {w x y z : UInt8} : (assemble₄Unchecked w x y z).toBitVec =
|
||||
(w.toBitVec.setWidth 3 ++ x.toBitVec.setWidth 6 ++ y.toBitVec.setWidth 6 ++ z.toBitVec.setWidth 6).setWidth 32 := by
|
||||
have hw : (w &&& 7).toUInt32.toBitVec = (w.toBitVec.setWidth 3).setWidth 32 := helper₃ 3 (by decide)
|
||||
|
|
@ -917,11 +967,22 @@ theorem assemble₄_eq_some_iff_utf8EncodeChar_eq {w x y z : UInt8} {c : Char} :
|
|||
Nat.reducePow, Nat.reduceMod, gt_iff_lt]
|
||||
omega
|
||||
|
||||
theorem verify₄_eq_isSome_assemble₄ {w x y z : UInt8} :
|
||||
verify₄ w x y z = (assemble₄ w x y z).isSome := by
|
||||
simp only [verify₄, assemble₄]
|
||||
split
|
||||
· simp
|
||||
· split
|
||||
· simp_all [← UInt32.not_lt]
|
||||
· split
|
||||
· simp_all
|
||||
· simp_all [← UInt32.not_lt]
|
||||
|
||||
end ByteArray.utf8DecodeChar?
|
||||
|
||||
open ByteArray.utf8DecodeChar?
|
||||
|
||||
/- # `utf8DecodeChar?` -/
|
||||
/- # `utf8DecodeChar?` and `validateUtf8At` -/
|
||||
|
||||
@[inline, expose]
|
||||
public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char :=
|
||||
|
|
@ -944,6 +1005,27 @@ public def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) : Option Char
|
|||
else none
|
||||
else none
|
||||
|
||||
@[inline, expose]
|
||||
public def ByteArray.validateUtf8At (bytes : ByteArray) (i : Nat) : Bool :=
|
||||
if h₀ : i < bytes.size then
|
||||
match h : parseFirstByte bytes[i] with
|
||||
| .invalid => false
|
||||
| .done => verify₁ bytes[i] h
|
||||
| .oneMore =>
|
||||
if h₁ : i + 1 < bytes.size then
|
||||
verify₂ bytes[i] bytes[i + 1]
|
||||
else
|
||||
false
|
||||
| .twoMore =>
|
||||
if h₁ : i + 2 < bytes.size then
|
||||
verify₃ bytes[i] bytes[i + 1] bytes[i + 2]
|
||||
else false
|
||||
| .threeMore =>
|
||||
if h₁ : i + 3 < bytes.size then
|
||||
verify₄ bytes[i] bytes[i + 1] bytes[i + 2] bytes[i + 3]
|
||||
else false
|
||||
else false
|
||||
|
||||
/-! # `utf8DecodeChar?` low-level API -/
|
||||
|
||||
theorem parseFirstByte_eq_done_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char}
|
||||
|
|
@ -1130,6 +1212,14 @@ public theorem String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some {b :
|
|||
rw [← assemble₄_eq_some_iff_utf8EncodeChar_eq]
|
||||
exact ⟨this, h⟩
|
||||
|
||||
public theorem ByteArray.validateUtf8At_eq_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} :
|
||||
b.validateUtf8At i = (b.utf8DecodeChar? i).isSome := by
|
||||
simp only [validateUtf8At, utf8DecodeChar?]
|
||||
split
|
||||
· split <;> (try split) <;> simp [verify₁_eq_isSome_assemble₁, verify₂_eq_isSome_assemble₂,
|
||||
verify₃_eq_isSome_assemble₃, verify₄_eq_isSome_assemble₄]
|
||||
· simp
|
||||
|
||||
/-! # Corollaries -/
|
||||
|
||||
public theorem ByteArray.eq_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : utf8DecodeChar? b 0 = some c) :
|
||||
|
|
@ -1195,6 +1285,10 @@ public theorem ByteArray.lt_size_of_isSome_utf8DecodeChar? {b : ByteArray} {i :
|
|||
have := c.utf8Size_pos
|
||||
omega
|
||||
|
||||
public theorem ByteArray.lt_size_of_validateUtf8At {b : ByteArray} {i : Nat} :
|
||||
validateUtf8At b i = true → i < b.size :=
|
||||
validateUtf8At_eq_isSome_utf8DecodeChar? ▸ lt_size_of_isSome_utf8DecodeChar?
|
||||
|
||||
public theorem ByteArray.utf8DecodeChar?_append_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : utf8DecodeChar? b i = some c)
|
||||
(b' : ByteArray) : utf8DecodeChar? (b ++ b') i = some c := by
|
||||
have := le_size_of_utf8DecodeChar?_eq_some h
|
||||
|
|
@ -1211,6 +1305,10 @@ public theorem ByteArray.isSome_utf8DecodeChar?_append {b : ByteArray} {i : Nat}
|
|||
public def ByteArray.utf8DecodeChar (bytes : ByteArray) (i : Nat) (h : (utf8DecodeChar? bytes i).isSome) : Char :=
|
||||
(utf8DecodeChar? bytes i).get h
|
||||
|
||||
public theorem ByteArray.add_utf8Size_utf8DecodeChar_le_size {b : ByteArray} {i : Nat} {h} :
|
||||
i + (b.utf8DecodeChar i h).utf8Size ≤ b.size :=
|
||||
le_size_of_utf8DecodeChar?_eq_some (by simp [utf8DecodeChar])
|
||||
|
||||
public theorem ByteArray.utf8DecodeChar_eq_utf8DecodeChar_extract {b : ByteArray} {i : Nat} {h} :
|
||||
utf8DecodeChar b i h =
|
||||
utf8DecodeChar (b.extract i b.size) 0 (by rwa [utf8DecodeChar?_eq_utf8DecodeChar?_extract] at h) := by
|
||||
|
|
@ -1363,6 +1461,11 @@ public theorem ByteArray.isUtf8FirstByte_of_isSome_utf8DecodeChar? {b : ByteArra
|
|||
conv => congr; congr; rw [eq_of_utf8DecodeChar?_eq_some hc]
|
||||
exact isUtf8FirstByte_getElem_zero_utf8EncodeChar_append
|
||||
|
||||
public theorem ByteArray.isUtf8FirstByte_of_validateUtf8At {b : ByteArray} {i : Nat} :
|
||||
(h : validateUtf8At b i = true) → (b[i]'(lt_size_of_validateUtf8At h)).IsUtf8FirstByte := by
|
||||
simp only [validateUtf8At_eq_isSome_utf8DecodeChar?]
|
||||
exact isUtf8FirstByte_of_isSome_utf8DecodeChar?
|
||||
|
||||
theorem Char.byteIdx_utf8ByteSize_getElem_utf8EncodeChar {c : Char} :
|
||||
(((String.utf8EncodeChar c)[0]'(by simp [c.utf8Size_pos])).utf8ByteSize
|
||||
UInt8.isUtf8FirstByte_getElem_zero_utf8EncodeChar).byteIdx = c.utf8Size := by
|
||||
|
|
|
|||
|
|
@ -38,109 +38,16 @@ def toNat! (s : String) : Nat :=
|
|||
else
|
||||
panic! "Nat expected"
|
||||
|
||||
/--
|
||||
Decodes the UTF-8 character sequence that starts at a given index in a byte array, or `none` if
|
||||
index `i` is out of bounds or is not the start of a valid UTF-8 character.
|
||||
-/
|
||||
def utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char := do
|
||||
let c ← a[i]?
|
||||
if c &&& 0x80 == 0 then
|
||||
some ⟨c.toUInt32, .inl (Nat.lt_trans c.toBitVec.isLt (by decide))⟩
|
||||
else if c &&& 0xe0 == 0xc0 then
|
||||
let c1 ← a[i+1]?
|
||||
guard (c1 &&& 0xc0 == 0x80)
|
||||
let r := ((c &&& 0x1f).toUInt32 <<< 6) ||| (c1 &&& 0x3f).toUInt32
|
||||
guard (0x80 ≤ r)
|
||||
-- TODO: Prove h from the definition of r once we have the necessary lemmas
|
||||
if h : r < 0xd800 then some ⟨r, .inl ((UInt32.lt_ofNat_iff (by decide)).1 h)⟩ else none
|
||||
else if c &&& 0xf0 == 0xe0 then
|
||||
let c1 ← a[i+1]?
|
||||
let c2 ← a[i+2]?
|
||||
guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80)
|
||||
let r :=
|
||||
((c &&& 0x0f).toUInt32 <<< 12) |||
|
||||
((c1 &&& 0x3f).toUInt32 <<< 6) |||
|
||||
(c2 &&& 0x3f).toUInt32
|
||||
guard (0x800 ≤ r)
|
||||
-- TODO: Prove `r < 0x110000` from the definition of r once we have the necessary lemmas
|
||||
if h : r < 0xd800 ∨ 0xdfff < r ∧ r < 0x110000 then
|
||||
have :=
|
||||
match h with
|
||||
| .inl h => Or.inl ((UInt32.lt_ofNat_iff (by decide)).1 h)
|
||||
| .inr h => Or.inr ⟨(UInt32.ofNat_lt_iff (by decide)).1 h.left, (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩
|
||||
some ⟨r, this⟩
|
||||
else
|
||||
none
|
||||
else if c &&& 0xf8 == 0xf0 then
|
||||
let c1 ← a[i+1]?
|
||||
let c2 ← a[i+2]?
|
||||
let c3 ← a[i+3]?
|
||||
guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80 && c3 &&& 0xc0 == 0x80)
|
||||
let r :=
|
||||
((c &&& 0x07).toUInt32 <<< 18) |||
|
||||
((c1 &&& 0x3f).toUInt32 <<< 12) |||
|
||||
((c2 &&& 0x3f).toUInt32 <<< 6) |||
|
||||
(c3 &&& 0x3f).toUInt32
|
||||
if h : 0x10000 ≤ r ∧ r < 0x110000 then
|
||||
some ⟨r, .inr ⟨Nat.lt_of_lt_of_le (by decide) ((UInt32.ofNat_le_iff (by decide)).1 h.left), (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩⟩
|
||||
else none
|
||||
else
|
||||
none
|
||||
@[deprecated ByteArray.utf8DecodeChar? (since := "2025-10-01")]
|
||||
abbrev utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char :=
|
||||
a.utf8DecodeChar? i
|
||||
|
||||
/--
|
||||
Checks whether an array of bytes is a valid UTF-8 encoding of a string.
|
||||
-/
|
||||
@[extern "lean_string_validate_utf8"]
|
||||
def validateUTF8 (a : @& ByteArray) : Bool :=
|
||||
(loop 0).isSome
|
||||
where
|
||||
loop (i : Nat) : Option Unit := do
|
||||
if i < a.size then
|
||||
let c ← utf8DecodeChar? a i
|
||||
loop (i + c.utf8Size)
|
||||
else pure ()
|
||||
termination_by a.size - i
|
||||
decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos)
|
||||
|
||||
/--
|
||||
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
|
||||
the corresponding string.
|
||||
-/
|
||||
@[extern "lean_string_from_utf8_unchecked"]
|
||||
def fromUTF8 (a : @& ByteArray) (h : validateUTF8 a) : String :=
|
||||
loop 0 ""
|
||||
where
|
||||
loop (i : Nat) (acc : String) : String :=
|
||||
if i < a.size then
|
||||
let c := (utf8DecodeChar? a i).getD default
|
||||
loop (i + c.utf8Size) (acc.push c)
|
||||
else acc
|
||||
termination_by a.size - i
|
||||
decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos)
|
||||
|
||||
/--
|
||||
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
|
||||
the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string.
|
||||
-/
|
||||
@[inline] def fromUTF8? (a : ByteArray) : Option String :=
|
||||
if h : validateUTF8 a then some (fromUTF8 a h) else none
|
||||
|
||||
/--
|
||||
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
|
||||
the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string.
|
||||
-/
|
||||
@[inline] def fromUTF8! (a : ByteArray) : String :=
|
||||
if h : validateUTF8 a then fromUTF8 a h else panic! "invalid UTF-8 string"
|
||||
|
||||
/--
|
||||
Encodes a string in UTF-8 as an array of bytes.
|
||||
-/
|
||||
@[extern "lean_string_to_utf8"]
|
||||
def toUTF8 (a : @& String) : ByteArray :=
|
||||
a.bytes
|
||||
|
||||
@[simp] theorem size_toUTF8 (s : String) : s.toUTF8.size = s.utf8ByteSize := by
|
||||
rfl
|
||||
@[deprecated ByteArray.validateUTF8 (since := "2025-10-01")]
|
||||
abbrev validateUTF8 (a : ByteArray) : Bool :=
|
||||
a.validateUTF8
|
||||
|
||||
theorem Iterator.sizeOf_next_lt_of_hasNext (i : String.Iterator) (h : i.hasNext) : sizeOf i.next < sizeOf i := by
|
||||
cases i; rename_i s pos; simp [Iterator.next, Iterator.sizeOf_eq]; simp [Iterator.hasNext] at h
|
||||
|
|
|
|||
|
|
@ -16,10 +16,10 @@ def checkGet (s : String) (arr : Array UInt8) :=
|
|||
c == arr[i]!
|
||||
|
||||
macro "validate" arr:term " => " "↯" : command =>
|
||||
`(test_extern' String.validateUTF8 $arr => false)
|
||||
`(test_extern' ByteArray.validateUTF8 $arr => false)
|
||||
macro "validate" arr:term " => " str:term : command =>
|
||||
`(test_extern' String.validateUTF8 $arr => true
|
||||
test_extern' String.fromUTF8 $arr (with_decl_name% _validate by native_decide) => $str
|
||||
`(test_extern' ByteArray.validateUTF8 $arr => true
|
||||
test_extern' String.ofByteArray $arr (with_decl_name% _validate by native_decide) => $str
|
||||
test_extern' String.toUTF8 $str => $arr
|
||||
#guard checkGet $str ($arr : ByteArray).data)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue