fix: handle surrogate pairs correctly in Json.parse (#8080)
This PR fixes `Json.parse` to handle surrogate pairs correctly. Closes #5445
This commit is contained in:
parent
406bda8807
commit
416e07a68e
2 changed files with 39 additions and 5 deletions
|
|
@ -14,17 +14,33 @@ open Std.Internal.Parsec.String
|
|||
|
||||
namespace Lean.Json.Parser
|
||||
|
||||
def hexChar : Parser Nat := do
|
||||
def hexChar : Parser UInt16 := do
|
||||
let c ← any
|
||||
if '0' <= c && c <= '9' then
|
||||
pure $ (c.val - '0'.val).toNat
|
||||
pure $ (c.val - '0'.val).toUInt16
|
||||
else if 'a' <= c && c <= 'f' then
|
||||
pure $ (c.val - 'a'.val + 10).toNat
|
||||
pure $ (c.val - 'a'.val + 10).toUInt16
|
||||
else if 'A' <= c && c <= 'F' then
|
||||
pure $ (c.val - 'A'.val + 10).toNat
|
||||
pure $ (c.val - 'A'.val + 10).toUInt16
|
||||
else
|
||||
fail "invalid hex character"
|
||||
|
||||
def finishSurrogatePair (low : UInt16) : Parser Char := do
|
||||
let c ← any
|
||||
if c != '\\' then fail ""
|
||||
let c ← any
|
||||
if c != 'u' then fail ""
|
||||
let c ← any
|
||||
if c != 'd' && c != 'D' then fail ""
|
||||
let u2 ← hexChar; let u3 ← hexChar; let u4 ← hexChar
|
||||
let val := (u2 <<< 8) ||| (u3 <<< 4) ||| u4
|
||||
if val < 0xC00 then fail "" -- low or not a surrogate
|
||||
let charVal := (((low.toUInt32 &&& 0x3FF) <<< 10) ||| (val.toUInt32 &&& 0x3FF)) + 0x10000
|
||||
if h : charVal.isValidChar then
|
||||
return ⟨charVal, h⟩
|
||||
else
|
||||
fail "" -- should be unreachable
|
||||
|
||||
def escapedChar : Parser Char := do
|
||||
let c ← any
|
||||
match c with
|
||||
|
|
@ -38,7 +54,19 @@ def escapedChar : Parser Char := do
|
|||
| 't' => return '\t'
|
||||
| 'u' =>
|
||||
let u1 ← hexChar; let u2 ← hexChar; let u3 ← hexChar; let u4 ← hexChar
|
||||
return Char.ofNat $ 4096*u1 + 256*u2 + 16*u3 + u4
|
||||
let val := (u1 <<< 12) ||| (u2 <<< 8) ||| (u3 <<< 4) ||| u4
|
||||
if h : val < 0xD800 then
|
||||
return ⟨val.toUInt32, Or.inl h⟩
|
||||
else if h' : val < 0xE000 then
|
||||
-- low or high surrogate
|
||||
if val < 0xDC00 then
|
||||
-- low surrogate
|
||||
attempt (finishSurrogatePair val) <|> pure '\ufffd'
|
||||
else
|
||||
-- high/lone surrogate
|
||||
return '\ufffd' -- replacement character
|
||||
else
|
||||
return ⟨val.toUInt32, Or.inr ⟨Nat.not_lt.mp h', Nat.lt_trans val.toFin.isLt (by decide)⟩⟩
|
||||
| _ => fail "illegal \\u escape"
|
||||
|
||||
partial def strCore (acc : String) : Parser String := do
|
||||
|
|
|
|||
6
tests/lean/run/jsonSurrogates.lean
Normal file
6
tests/lean/run/jsonSurrogates.lean
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import Lean.Data.Json
|
||||
|
||||
#guard (Lean.Json.parse "\"\\ud835\\udd5c\"").toOption.get! == Lean.Json.str "𝕜"
|
||||
#guard (Lean.Json.parse "\"\\ud835\"").toOption.get! == Lean.Json.str "\ufffd"
|
||||
#guard (Lean.Json.parse "\"\\udd5c\"").toOption.get! == Lean.Json.str "\ufffd"
|
||||
#guard (Lean.Json.parse "\"\\ud823\\ud835\\udd5c\"").toOption.get! == Lean.Json.str "\ufffd𝕜"
|
||||
Loading…
Add table
Reference in a new issue