lean4-htt/src/Init/System/Uri.lean
Mario Carneiro 62cdb51ed5
feat: UTF-8 string validation (#3958)
Previously, there was a function `opaque fromUTF8Unchecked : ByteArray
-> String` which would convert a list of bytes into a string, but as the
name implies it does not validate that the string is UTF-8 before doing
so and as a result it produces unsound results in the compiler (because
the lean model of `String` indirectly asserts UTF-8 validity). This PR
replaces that function by
```lean
opaque validateUTF8 (a : @& ByteArray) : Bool

opaque fromUTF8 (a : @& ByteArray) (h : validateUTF8 a) : String
```
so that while the function is still "unchecked", we have a proof witness
that the string is valid. To recover the original, actually unchecked
version, use `lcProof` or other unsafe methods to produce the proof
witness.

Because this was the only `ByteArray -> String` conversion function, it
was used in several places in an unsound way (e.g. reading untrusted
input from IO and treating it as UTF-8). These have been replaced by
`fromUTF8?` or `fromUTF8!` as appropriate.
2024-04-20 18:36:37 +00:00

116 lines
4.5 KiB
Text

/-
Copyright (c) 2019 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Chris Lovett
-/
prelude
import Init.Data.String.Extra
import Init.Data.Nat.Linear
import Init.System.FilePath
namespace System
namespace Uri
namespace UriEscape
/- https://www.ietf.org/rfc/rfc3986.txt -/
/-- ASCII code of the digit `'0'`. -/
@[inline] def zero : UInt8 := UInt8.ofNat ('0'.toNat)
/-- ASCII code of the digit `'9'`. -/
@[inline] def nine : UInt8 := UInt8.ofNat ('9'.toNat)
/-- ASCII code of the lower-case letter `'a'`. -/
@[inline] def lettera : UInt8 := UInt8.ofNat ('a'.toNat)
/-- ASCII code of the lower-case letter `'f'`. -/
@[inline] def letterf : UInt8 := UInt8.ofNat ('f'.toNat)
/-- ASCII code of the upper-case letter `'A'`. -/
@[inline] def letterA : UInt8 := UInt8.ofNat ('A'.toNat)
/-- ASCII code of the upper-case letter `'F'`. -/
@[inline] def letterF : UInt8 := UInt8.ofNat ('F'.toNat)
/-- Decode `%HH` escapes in the given string. A consecutive run of escapes
that together form the UTF-8 encoding of a single unicode code point is
decoded to that code point.

A `%` that is not followed by two hexadecimal digits is kept literally, and
scanning resumes at the very next byte, so `"%%41"` decodes to `"%A"`.

If the decoded bytes are not valid UTF-8 (for example the input contains a
lone `%FF`), the input string is returned unchanged instead of panicking. -/
def decodeUri (uri : String) : String := Id.run do
  let rawBytes := uri.toUTF8
  let len := rawBytes.size
  let percent := '%'.toNat.toUInt8
  let mut decoded : ByteArray := ByteArray.empty
  let mut i := 0
  while i < len do
    let c := rawBytes[i]!
    (decoded, i) :=
      if c == percent && i + 2 < len then
        let hi := hexDigitToUInt8? (rawBytes[i + 1]!)
        let lo := hexDigitToUInt8? (rawBytes[i + 2]!)
        match hi, lo with
        | some hd1, some hd2 =>
          -- a complete `%HH` escape: emit the single byte it denotes.
          (decoded.push (hd1 * 16 + hd2), i + 3)
        | _, _ =>
          -- not a valid escape: keep the '%' and rescan from the next byte,
          -- so that a '%' among the following bytes can still start an escape.
          (decoded.push c, i + 1)
      else
        -- ordinary byte, or a '%' too close to the end to be a full escape.
        (decoded.push c, i + 1)
  -- Escapes can denote arbitrary bytes, so the result need not be UTF-8;
  -- fall back to the untouched input rather than panic in that case.
  return (String.fromUTF8? decoded).getD uri
where
  /-- Numeric value (0-15) of an ASCII hexadecimal digit in either case,
  or `none` when the byte is not a hexadecimal digit. -/
  hexDigitToUInt8? (c : UInt8) : Option UInt8 :=
    if zero ≤ c ∧ c ≤ nine then some (c - zero)
    else if lettera ≤ c ∧ c ≤ letterf then some (c - lettera + 10)
    else if letterA ≤ c ∧ c ≤ letterF then some (c - letterA + 10)
    else none
/-- Printable ASCII characters that `uriEscapeAsciiChar` always percent-encodes.
Note that `'/'` is not in the list, while `'%'` and the space character are. -/
def rfc3986ReservedChars : List Char :=
  [ ';', ':', '?', '#', '[', ']', '@',
    '&', '=', '+', '$', ',', '!', '\'',
    '(', ')', '*', '%', ' ' ]
/-- Escape a single character for use inside a Uri.

* Characters listed in `rfc3986ReservedChars`, and characters below the space
  character, become a single `%HH` escape of their code.
* Any other character with code below 127 is returned as-is.
* Everything else is replaced by one `%HH` escape per byte of its UTF-8
  encoding. -/
def uriEscapeAsciiChar (c : Char) : String :=
  let mustEscape := rfc3986ReservedChars.contains c || c < ' '
  if mustEscape then
    "%" ++ uInt8ToHex c.toNat.toUInt8
  else if c.toNat < 127 then
    c.toString
  else
    c.toString.toUTF8.foldl (fun acc byte => acc ++ "%" ++ uInt8ToHex byte) ""
where
  /-- Render a byte as two hexadecimal digits, upper-cased. -/
  uInt8ToHex (c : UInt8) : String :=
    let hi := (c / 16).toNat
    let lo := (c % 16).toNat
    (hexDigitRepr hi ++ hexDigitRepr lo).toUpper
end UriEscape
/-- Percent-encode the given Uri: every character is passed through
`UriEscape.uriEscapeAsciiChar` and the results are concatenated in order. -/
def escapeUri (uri: String) : String :=
  String.foldl (fun acc ch => acc ++ UriEscape.uriEscapeAsciiChar ch) "" uri
/-- Inverse direction of `escapeUri`: replaces the `%HH` Uri escapings in the
given string with the unicode code points they stand for. A consecutive
sequence of escapings may spell out the utf-8 encoding of a single code point;
such sequences are decoded to that code point. Delegates to
`UriEscape.decodeUri`. -/
def unescapeUri (s: String) : String :=
  s |> UriEscape.decodeUri
/-- Turn the given FilePath into a `file:///encodedpath` Uri.

The path is normalized first. On Windows an upper-case drive letter is
lower-cased and backslashes become forward slashes. The resulting text is
percent-encoded and prefixed with `file://` when it already starts with `/`,
or with `file:///` otherwise. -/
def pathToUri (fname : System.FilePath) : String := Id.run do
  let mut path := fname.normalize.toString
  if System.Platform.isWindows then
    -- lower-case drive letters seem to be preferred in URIs
    let first := path.get 0
    if path.length >= 2 && first.isUpper && path.get ⟨1⟩ == ':' then
      path := path.set 0 first.toLower
    path := path.map fun ch => if ch == '\\' then '/' else ch
  let escaped := escapeUri path
  let scheme := if escaped.startsWith "/" then "file://" else "file:///"
  return scheme ++ escaped
/-- Convert the given uri to a FilePath, stripping the `file://` prefix and
ignoring the optional host name. Returns `none` when the uri does not start
with `file://`.

The uri is unescaped, the prefix is dropped, and everything up to (but not
including) the next `/` is discarded as the host name. On Windows a leading
`/c:` is rewritten to `C:`, mirroring the drive-letter handling in
`pathToUri`. -/
def fileUriToPath? (uri : String) : Option System.FilePath := Id.run do
  if !uri.startsWith "file://" then
    return none
  let mut p := (unescapeUri uri).drop "file://".length
  p := p.dropWhile (λ c => c != '/') -- drop the hostname.
  -- On Windows, the path "/c:/temp" needs to become "C:/temp".
  -- The test below reads the first three characters, so require at least
  -- three of them rather than relying on `String.get`'s out-of-range default.
  if System.Platform.isWindows && p.length >= 3 &&
      p.get 0 == '/' && (p.get ⟨1⟩).isAlpha && p.get ⟨2⟩ == ':' then
    p := p.drop 1 |>.modify 0 .toUpper
  return some p
end Uri
end System