From 416e07a68e5d9b884de8af4836497a7cc2414c8f Mon Sep 17 00:00:00 2001 From: Rob23oba <152706811+Rob23oba@users.noreply.github.com> Date: Thu, 24 Apr 2025 21:07:46 +0200 Subject: [PATCH] fix: handle surrogate pairs correctly in `Json.parse` (#8080) This PR fixes `Json.parse` to handle surrogate pairs correctly. Closes #5445 --- src/Lean/Data/Json/Parser.lean | 38 ++++++++++++++++++++++++++---- tests/lean/run/jsonSurrogates.lean | 6 +++++ 2 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 tests/lean/run/jsonSurrogates.lean diff --git a/src/Lean/Data/Json/Parser.lean b/src/Lean/Data/Json/Parser.lean index 4d3592525e..810d0fcc32 100644 --- a/src/Lean/Data/Json/Parser.lean +++ b/src/Lean/Data/Json/Parser.lean @@ -14,17 +14,33 @@ open Std.Internal.Parsec.String namespace Lean.Json.Parser -def hexChar : Parser Nat := do +def hexChar : Parser UInt16 := do let c ← any if '0' <= c && c <= '9' then - pure $ (c.val - '0'.val).toNat + pure $ (c.val - '0'.val).toUInt16 else if 'a' <= c && c <= 'f' then - pure $ (c.val - 'a'.val + 10).toNat + pure $ (c.val - 'a'.val + 10).toUInt16 else if 'A' <= c && c <= 'F' then - pure $ (c.val - 'A'.val + 10).toNat + pure $ (c.val - 'A'.val + 10).toUInt16 else fail "invalid hex character" +def finishSurrogatePair (low : UInt16) : Parser Char := do + let c ← any + if c != '\\' then fail "" + let c ← any + if c != 'u' then fail "" + let c ← any + if c != 'd' && c != 'D' then fail "" + let u2 ← hexChar; let u3 ← hexChar; let u4 ← hexChar + let val := (u2 <<< 8) ||| (u3 <<< 4) ||| u4 + if val < 0xC00 then fail "" -- low or not a surrogate + let charVal := (((low.toUInt32 &&& 0x3FF) <<< 10) ||| (val.toUInt32 &&& 0x3FF)) + 0x10000 + if h : charVal.isValidChar then + return ⟨charVal, h⟩ + else + fail "" -- should be unreachable + def escapedChar : Parser Char := do let c ← any match c with @@ -38,7 +54,19 @@ def escapedChar : Parser Char := do | 't' => return '\t' | 'u' => let u1 ← hexChar; let u2 ← hexChar; let u3 ← hexChar; let u4 ← hexChar - return Char.ofNat $ 4096*u1 + 256*u2 + 16*u3 + u4 + let val := (u1 <<< 12) ||| (u2 <<< 8) ||| (u3 <<< 4) ||| u4 + if h : val < 0xD800 then + return ⟨val.toUInt32, Or.inl h⟩ + else if h' : val < 0xE000 then + -- low or high surrogate + if val < 0xDC00 then + -- low surrogate + attempt (finishSurrogatePair val) <|> pure '\ufffd' + else + -- high/lone surrogate + return '\ufffd' -- replacement character + else + return ⟨val.toUInt32, Or.inr ⟨Nat.not_lt.mp h', Nat.lt_trans val.toFin.isLt (by decide)⟩⟩ | _ => fail "illegal \\u escape" partial def strCore (acc : String) : Parser String := do diff --git a/tests/lean/run/jsonSurrogates.lean b/tests/lean/run/jsonSurrogates.lean new file mode 100644 index 0000000000..f719bcda94 --- /dev/null +++ b/tests/lean/run/jsonSurrogates.lean @@ -0,0 +1,6 @@ +import Lean.Data.Json + +#guard (Lean.Json.parse "\"\\ud835\\udd5c\"").toOption.get! == Lean.Json.str "𝕜" +#guard (Lean.Json.parse "\"\\ud835\"").toOption.get! == Lean.Json.str "\ufffd" +#guard (Lean.Json.parse "\"\\udd5c\"").toOption.get! == Lean.Json.str "\ufffd" +#guard (Lean.Json.parse "\"\\ud823\\ud835\\udd5c\"").toOption.get! == Lean.Json.str "\ufffd𝕜"