fix: handle surrogate pairs correctly in Json.parse (#8080)

This PR fixes `Json.parse` to handle surrogate pairs correctly.

Closes #5445
This commit is contained in:
Rob23oba 2025-04-24 21:07:46 +02:00 committed by GitHub
parent 406bda8807
commit 416e07a68e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 39 additions and 5 deletions

View file

@ -14,17 +14,33 @@ open Std.Internal.Parsec.String
namespace Lean.Json.Parser
def hexChar : Parser Nat := do
def hexChar : Parser UInt16 := do
let c ← any
if '0' <= c && c <= '9' then
pure $ (c.val - '0'.val).toNat
pure $ (c.val - '0'.val).toUInt16
else if 'a' <= c && c <= 'f' then
pure $ (c.val - 'a'.val + 10).toNat
pure $ (c.val - 'a'.val + 10).toUInt16
else if 'A' <= c && c <= 'F' then
pure $ (c.val - 'A'.val + 10).toNat
pure $ (c.val - 'A'.val + 10).toUInt16
else
fail "invalid hex character"
def finishSurrogatePair (low : UInt16) : Parser Char := do
let c ← any
if c != '\\' then fail ""
let c ← any
if c != 'u' then fail ""
let c ← any
if c != 'd' && c != 'D' then fail ""
let u2 ← hexChar; let u3 ← hexChar; let u4 ← hexChar
let val := (u2 <<< 8) ||| (u3 <<< 4) ||| u4
if val < 0xC00 then fail "" -- low or not a surrogate
let charVal := (((low.toUInt32 &&& 0x3FF) <<< 10) ||| (val.toUInt32 &&& 0x3FF)) + 0x10000
if h : charVal.isValidChar then
return ⟨charVal, h⟩
else
fail "" -- should be unreachable
def escapedChar : Parser Char := do
let c ← any
match c with
@ -38,7 +54,19 @@ def escapedChar : Parser Char := do
| 't' => return '\t'
| 'u' =>
let u1 ← hexChar; let u2 ← hexChar; let u3 ← hexChar; let u4 ← hexChar
return Char.ofNat $ 4096*u1 + 256*u2 + 16*u3 + u4
let val := (u1 <<< 12) ||| (u2 <<< 8) ||| (u3 <<< 4) ||| u4
if h : val < 0xD800 then
return ⟨val.toUInt32, Or.inl h⟩
else if h' : val < 0xE000 then
-- low or high surrogate
if val < 0xDC00 then
-- low surrogate
attempt (finishSurrogatePair val) <|> pure '\ufffd'
else
-- high/lone surrogate
return '\ufffd' -- replacement character
else
return ⟨val.toUInt32, Or.inr ⟨Nat.not_lt.mp h', Nat.lt_trans val.toFin.isLt (by decide)⟩⟩
| _ => fail "illegal \\u escape"
partial def strCore (acc : String) : Parser String := do

View file

@ -0,0 +1,6 @@
import Lean.Data.Json
#guard (Lean.Json.parse "\"\\ud835\\udd5c\"").toOption.get! == Lean.Json.str "𝕜"
#guard (Lean.Json.parse "\"\\ud835\"").toOption.get! == Lean.Json.str "\ufffd"
#guard (Lean.Json.parse "\"\\udd5c\"").toOption.get! == Lean.Json.str "\ufffd"
#guard (Lean.Json.parse "\"\\ud823\\ud835\\udd5c\"").toOption.get! == Lean.Json.str "\ufffd𝕜"