feat: add xml parser.

in order to generate the LLVM extern declarations we want to use a generator that spits out XML. Hence adding a small XML parser.
This commit is contained in:
Daniel Fabian 2021-06-25 13:44:08 +00:00 committed by Leonardo de Moura
parent b5d8bc1b8f
commit 0d41fd03f7
7 changed files with 733 additions and 112 deletions

View file

@ -195,6 +195,9 @@ structure Iterator where
def mkIterator (s : String) : Iterator :=
⟨s, 0⟩
instance : DecidableEq Iterator := fun ⟨s₁, i₁⟩ ⟨s₂, i₂⟩ =>
if h₁ : i₁ = i₂ ∧ s₁ = s₂ then isTrue (by simp_all) else isFalse (λ h₂ => by simp_all)
namespace Iterator
def toString : Iterator → String

View file

@ -4,7 +4,9 @@ Released under Apache 2.0 license as described in the file LICENSE.
Authors: Sebastian Ullrich
-/
import Lean.Data.Format
import Lean.Data.Parsec
import Lean.Data.Json
import Lean.Data.Xml
import Lean.Data.JsonRpc
import Lean.Data.KVMap
import Lean.Data.LBool

View file

@ -5,104 +5,19 @@ Released under Apache 2.0 license as described in the file LICENSE.
Authors: Gabriel Ebner, Marc Huisinga
-/
import Lean.Data.Json.Basic
import Lean.Data.Parsec
namespace Lean
open Std (RBNode RBNode.singleton RBNode.leaf)
inductive Quickparse.Result (α : Type) where
| success (pos : String.Iterator) (res : α) : Result α
| error (pos : String.Iterator) (err : String) : Result α
def Quickparse (α : Type) : Type := String.Iterator → Lean.Quickparse.Result α
instance (α : Type) : Inhabited (Quickparse α) :=
⟨fun it => Quickparse.Result.error it ""⟩
namespace Quickparse
open Result
partial def skipWs (it : String.Iterator) : String.Iterator :=
if it.hasNext then
let c := it.curr
if c = '\u0009' c = '\u000a' c = '\u000d' c = '\u0020' then
skipWs it.next
else
it
else
it
@[inline]
protected def pure (a : α) : Quickparse α := fun it =>
success it a
@[inline]
protected def bind {α β : Type} (f : Quickparse α) (g : α → Quickparse β) : Quickparse β := fun it =>
match f it with
| success rem a => g a rem
| error pos msg => error pos msg
@[inline]
def fail (msg : String) : Quickparse α := fun it =>
error it msg
@[inline]
instance : Monad Quickparse :=
{ pure := @Quickparse.pure, bind := @Quickparse.bind }
def unexpectedEndOfInput := "unexpected end of input"
@[inline]
def peek? : Quickparse (Option Char) := fun it =>
if it.hasNext then
success it it.curr
else
success it none
@[inline]
def peek! : Quickparse Char := do
let some c ← peek? | fail unexpectedEndOfInput
c
@[inline]
def skip : Quickparse Unit := fun it =>
success it.next ()
@[inline]
def next : Quickparse Char := do
let c ← peek!
skip
c
def expect (s : String) : Quickparse Unit := fun it =>
if it.extract (it.forward s.length) = s then
success (it.forward s.length) ()
else
error it s!"expected: {s}"
@[inline]
def ws : Quickparse Unit := fun it =>
success (skipWs it) ()
def expectedEndOfInput := "expected end of input"
@[inline]
def eoi : Quickparse Unit := fun it =>
if it.hasNext then
error it expectedEndOfInput
else
success it ()
end Quickparse
namespace Json.Parser
open Quickparse
open Lean.Parsec
@[inline]
def hexChar : Quickparse Nat := do
let c ← next
def hexChar : Parsec Nat := do
let c ← anyChar
if '0' ≤ c ∧ c ≤ '9' then
pure $ c.val.toNat - '0'.val.toNat
else if 'a' ≤ c ∧ c ≤ 'f' then
@ -112,8 +27,8 @@ def hexChar : Quickparse Nat := do
else
fail "invalid hex character"
def escapedChar : Quickparse Char := do
let c ← next
def escapedChar : Parsec Char := do
let c ← anyChar
match c with
| '\\' => '\\'
| '"' => '"'
@ -128,13 +43,13 @@ def escapedChar : Quickparse Char := do
Char.ofNat $ 4096*u1 + 256*u2 + 16*u3 + u4
| _ => fail "illegal \\u escape"
partial def strCore (acc : String) : Quickparse String := do
partial def strCore (acc : String) : Parsec String := do
let c ← peek!
if c = '"' then -- "
skip
acc
else
let c ← next
let c ← anyChar
let ec ←
if c = '\\' then
escapedChar
@ -147,9 +62,9 @@ partial def strCore (acc : String) : Quickparse String := do
fail "unexpected character in string"
strCore (acc.push ec)
def str : Quickparse String := strCore ""
def str : Parsec String := strCore ""
partial def natCore (acc digits : Nat) : Quickparse (Nat × Nat) := do
partial def natCore (acc digits : Nat) : Parsec (Nat × Nat) := do
let some c ← peek? | (acc, digits)
if '0' ≤ c ∧ c ≤ '9' then
skip
@ -159,7 +74,7 @@ partial def natCore (acc digits : Nat) : Quickparse (Nat × Nat) := do
(acc, digits)
@[inline]
def lookahead (p : Char → Prop) (desc : String) [DecidablePred p] : Quickparse Unit := do
def lookahead (p : Char → Prop) (desc : String) [DecidablePred p] : Parsec Unit := do
let c ← peek!
if p c then
()
@ -167,22 +82,22 @@ def lookahead (p : Char → Prop) (desc : String) [DecidablePred p] : Quickparse
fail $ "expected " ++ desc
@[inline]
def natNonZero : Quickparse Nat := do
def natNonZero : Parsec Nat := do
lookahead (fun c => '1' ≤ c ∧ c ≤ '9') "1-9"
let (n, _) ← natCore 0 0
n
@[inline]
def natNumDigits : Quickparse (Nat × Nat) := do
def natNumDigits : Parsec (Nat × Nat) := do
lookahead (fun c => '0' ≤ c ∧ c ≤ '9') "digit"
natCore 0 0
@[inline]
def natMaybeZero : Quickparse Nat := do
def natMaybeZero : Parsec Nat := do
let (n, _) ← natNumDigits
n
def num : Quickparse JsonNumber := do
def num : Parsec JsonNumber := do
let c ← peek!
let sign : Int ←
if c = '-' then
@ -224,10 +139,10 @@ def num : Quickparse JsonNumber := do
else
res
partial def arrayCore (anyCore : Unit → Quickparse Json) (acc : Array Json) : Quickparse (Array Json) := do
partial def arrayCore (anyCore : Unit → Parsec Json) (acc : Array Json) : Parsec (Array Json) := do
let hd ← anyCore ()
let acc' := acc.push hd
let c ← next
let c ← anyChar
if c = ']' then
ws
acc'
@ -237,12 +152,12 @@ partial def arrayCore (anyCore : Unit → Quickparse Json) (acc : Array Json) :
else
fail "unexpected character in array"
partial def objectCore (anyCore : Unit → Quickparse Json) : Quickparse (RBNode String (fun _ => Json)) := do
partial def objectCore (anyCore : Unit → Parsec Json) : Parsec (RBNode String (fun _ => Json)) := do
lookahead (fun c => c = '"') "\""; skip; -- "
let k ← strCore ""; ws
lookahead (fun c => c = ':') ":"; skip; ws
let v ← anyCore ()
let c ← next
let c ← anyChar
if c = '}' then
ws
RBNode.singleton k v
@ -255,7 +170,7 @@ partial def objectCore (anyCore : Unit → Quickparse Json) : Quickparse (RBNode
-- takes a unit parameter so that
-- we can use the equation compiler and recursion
partial def anyCore (u : Unit) : Quickparse Json := do
partial def anyCore (u : Unit) : Parsec Json := do
let c ← peek!
if c = '[' then
skip; ws
@ -281,13 +196,13 @@ partial def anyCore (u : Unit) : Quickparse Json := do
ws
Json.str s
else if c = 'f' then
expect "false"; ws
skipString "false"; ws
Json.bool false
else if c = 't' then
expect "true"; ws
skipString "true"; ws
Json.bool true
else if c = 'n' then
expect "null"; ws
skipString "null"; ws
Json.null
else if c = '-' ('0' ≤ c ∧ c ≤ '9') then
let n ← num
@ -297,10 +212,10 @@ partial def anyCore (u : Unit) : Quickparse Json := do
fail "unexpected input"
def any : Quickparse Json := do
def any : Parsec Json := do
ws
let res ← anyCore ()
eoi
eof
res
end Json.Parser
@ -309,8 +224,8 @@ namespace Json
def parse (s : String) : Except String Lean.Json :=
match Json.Parser.any s.mkIterator with
| Quickparse.Result.success _ res => Except.ok res
| Quickparse.Result.error it err => Except.error s!"offset {it.i.repr}: {err}"
| Parsec.ParseResult.success _ res => Except.ok res
| Parsec.ParseResult.error it err => Except.error s!"offset {it.i.repr}: {err}"
end Json

170
src/Lean/Data/Parsec.lean Normal file
View file

@ -0,0 +1,170 @@
/-
Copyright (c) 2021 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Dany Fabian
-/
namespace Lean
namespace Parsec
inductive ParseResult (α : Type) where
| success (pos : String.Iterator) (res : α)
| error (pos : String.Iterator) (err : String)
deriving Repr
end Parsec
def Parsec (α : Type) : Type := String.Iterator → Lean.Parsec.ParseResult α
namespace Parsec
open ParseResult
instance (α : Type) : Inhabited (Parsec α) :=
⟨λ it => error it ""⟩
@[inline]
protected def pure (a : α) : Parsec α := λ it =>
success it a
@[inline]
def bind {α β : Type} (f : Parsec α) (g : α → Parsec β) : Parsec β := λ it =>
match f it with
| success rem a => g a rem
| error pos msg => error pos msg
instance : Monad Parsec :=
{ pure := Parsec.pure, bind }
@[inline]
def fail (msg : String) : Parsec α := fun it =>
error it msg
@[inline]
def orElse (p q : Parsec α) : Parsec α := fun it =>
match p it with
| success rem a => success rem a
| error rem err =>
if it = rem then q it else error rem err
@[inline]
def attempt (p : Parsec α) : Parsec α := λ it =>
match p it with
| success rem res => success rem res
| error _ err => error it err
instance : Alternative Parsec :=
{ failure := fail "", orElse }
def expectedEndOfInput := "expected end of input"
@[inline]
def eof : Parsec Unit := fun it =>
if it.hasNext then
error it expectedEndOfInput
else
success it ()
@[inline]
partial def manyCore (p : Parsec α) (acc : Array α) : Parsec $ Array α :=
(do manyCore p (acc.push $ ←p))
<|> pure acc
@[inline]
def many (p : Parsec α) : Parsec $ Array α := manyCore p #[]
@[inline]
def many1 (p : Parsec α) : Parsec $ Array α := do manyCore p #[←p]
@[inline]
partial def manyCharsCore (p : Parsec Char) (acc : String) : Parsec String :=
(do manyCharsCore p (acc.push $ ←p))
<|> pure acc
@[inline]
def manyChars (p : Parsec Char) : Parsec String := manyCharsCore p ""
@[inline]
def many1Chars (p : Parsec Char) : Parsec String := do manyCharsCore p (←p).toString
def pstring (s : String) : Parsec String := λ it =>
let substr := it.extract (it.forward s.length)
if substr = s then
success (it.forward s.length) substr
else
error it s!"expected: {s}"
@[inline]
def skipString (s : String) : Parsec Unit := pstring s *> pure ()
def unexpectedEndOfInput := "unexpected end of input"
@[inline]
def anyChar : Parsec Char := λ it =>
if it.hasNext then success it.next it.curr else error it unexpectedEndOfInput
@[inline]
def pchar (c : Char) : Parsec Char := attempt do
if (←anyChar) = c then pure c else fail s!"expected: '{c}'"
@[inline]
def skipChar (c : Char) : Parsec Unit := pchar c *> pure ()
@[inline]
def digit : Parsec Char := attempt do
let c ← anyChar
if '0' ≤ c ∧ c ≤ '9' then c else fail s!"digit expected"
@[inline]
def hexDigit : Parsec Char := attempt do
let c ← anyChar
if ('0' ≤ c ∧ c ≤ '9')
('a' ≤ c ∧ c ≤ 'a')
('A' ≤ c ∧ c ≤ 'A') then c else fail s!"hex digit expected"
@[inline]
def asciiLetter : Parsec Char := attempt do
let c ← anyChar
if ('A' ≤ c ∧ c ≤ 'Z') ('a' ≤ c ∧ c ≤ 'z') then c else fail s!"ASCII letter expected"
@[inline]
def satisfy (p : Char → Bool) : Parsec Char := attempt do
let c ← anyChar
if p c then c else fail "condition not satisfied"
@[inline]
def notFollowedBy (p : Parsec α) : Parsec Unit := λ it =>
match p it with
| success _ _ => error it ""
| error _ _ => success it ()
partial def skipWs (it : String.Iterator) : String.Iterator :=
if it.hasNext then
let c := it.curr
if c = '\u0009' c = '\u000a' c = '\u000d' c = '\u0020' then
skipWs it.next
else
it
else
it
@[inline]
def peek? : Parsec (Option Char) := fun it =>
if it.hasNext then
success it it.curr
else
success it none
@[inline]
def peek! : Parsec Char := do
let some c ← peek? | fail unexpectedEndOfInput
c
@[inline]
def skip : Parsec Unit := fun it =>
success it.next ()
@[inline]
def ws : Parsec Unit := fun it =>
success (skipWs it) ()
end Parsec

2
src/Lean/Data/Xml.lean Normal file
View file

@ -0,0 +1,2 @@
import Lean.Data.Xml.Basic
import Lean.Data.Xml.Parser

View file

@ -0,0 +1,41 @@
/-
Copyright (c) 2021 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Dany Fabian
-/
import Std.Data.RBMap
namespace Lean
namespace Xml
def Attributes := Std.RBMap String String compare
instance : ToString Attributes := ⟨λ as => as.fold (λ s n v => s ++ s!" {n}=\"{v}\"") ""⟩
mutual
inductive Element
| Element
(name : String)
(attributes : Attributes)
(content : Array Content)
inductive Content
| Element (element : Element)
| Comment (comment : String)
| Character (content : String)
deriving Inhabited
end
mutual
private partial def eToString : Element → String
| Element.Element n a c => s!"<{n}{a}>{c.map cToString |>.foldl (· ++ ·) ""}</{n}>"
private partial def cToString : Content → String
| Content.Element e => eToString e
| Content.Comment c => s!"<!--{c}-->"
| Content.Character c => c
end
instance : ToString Element := ⟨eToString⟩
instance : ToString Content := ⟨cToString⟩

View file

@ -0,0 +1,488 @@
/-
Copyright (c) 2021 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Dany Fabian
-/
import Lean.Data.Parsec
import Lean.Data.Xml.Basic
open IO
open System
open Lean
namespace Lean
namespace Xml
namespace Parser
open Lean.Parsec
open Parsec.ParseResult
abbrev LeanChar := Char
/-- consume a newline character sequence pretending, that we read '\n'. As per spec:
https://www.w3.org/TR/xml/#sec-line-ends -/
def endl : Parsec LeanChar := (skipString "\r\n" <|> skipChar '\r' <|> skipChar '\n') *> pure '\n'
def quote (p : Parsec α) : Parsec α :=
skipChar '\'' *> p <* skipChar '\''
<|> skipChar '"' *> p <* skipChar '"'
/-- https://www.w3.org/TR/xml/#NT-Char -/
def Char : Parsec LeanChar :=
(attempt do
let c ← anyChar
let cNat ← c.toNat
if (0x20 ≤ cNat ∧ cNat ≤ 0xD7FF)
(0xE000 ≤ cNat ∧ cNat ≤ 0xFFFD)
(0x10000 ≤ cNat ∧ cNat ≤ 0x10FFFF) then c else fail "expected xml char")
<|> pchar '\t' <|> endl
/-- https://www.w3.org/TR/xml/#NT-S -/
def S : Parsec String :=
many1Chars (pchar ' ' <|> endl <|> pchar '\t')
/-- https://www.w3.org/TR/xml/#NT-Eq -/
def Eq : Parsec Unit :=
optional S *> skipChar '=' <* optional S
private def nameStartCharRanges : Array (Nat × Nat) :=
#[(0xC0, 0xD6),
(0xD8, 0xF6),
(0xF8, 0x2FF),
(0x370, 0x37D),
(0x37F, 0x1FFF),
(0x200C, 0x200D),
(0x2070, 0x218F),
(0x2C00, 0x2FEF),
(0x3001, 0xD7FF),
(0xF900, 0xFDCF),
(0xFDF0, 0xFFFD),
(0x10000, 0xEFFFF)]
/-- https://www.w3.org/TR/xml/#NT-NameStartChar -/
def NameStartChar : Parsec LeanChar := attempt do
let c ← anyChar
if ('A' ≤ c ∧ c ≤ 'Z') ('a' ≤ c ∧ c ≤ 'z') then c
else if c = ':' c = '_' then c
else
let cNum := c.toNat
if nameStartCharRanges.any (fun (lo, hi) => lo ≤ cNum ∧ cNum ≤ hi) then c
else fail "expected a name character"
/-- https://www.w3.org/TR/xml/#NT-NameChar -/
def NameChar : Parsec LeanChar :=
NameStartChar <|> digit <|> pchar '-' <|> pchar '.' <|> pchar '\xB7'
<|> satisfy (λ c => ('\u0300' ≤ c ∧ c ≤ '\u036F') ('\u203F' ≤ c ∧ c ≤ '\u2040'))
/-- https://www.w3.org/TR/xml/#NT-Name -/
def Name : Parsec String := do
let x ← NameStartChar
manyCharsCore NameChar x.toString
/-- https://www.w3.org/TR/xml/#NT-VersionNum -/
def VersionNum : Parsec Unit :=
skipString "1." <* (many1 digit)
/-- https://www.w3.org/TR/xml/#NT-VersionInfo -/
def VersionInfo : Parsec Unit := do
S *>
skipString "version"
Eq
quote VersionNum
/-- https://www.w3.org/TR/xml/#NT-EncName -/
def EncName : Parsec String := do
let x ← asciiLetter
manyCharsCore (asciiLetter <|> digit <|> pchar '-' <|> pchar '_' <|> pchar '.') x.toString
/-- https://www.w3.org/TR/xml/#NT-EncodingDecl -/
def EncodingDecl : Parsec String := do
S *>
skipString "encoding"
Eq
quote EncName
/-- https://www.w3.org/TR/xml/#NT-SDDecl -/
def SDDecl : Parsec String := do
S *> skipString "standalone" *> Eq *> quote (pstring "yes" <|> pstring "no")
/-- https://www.w3.org/TR/xml/#NT-XMLDecl -/
def XMLdecl : Parsec Unit := do
skipString "<?xml"
VersionInfo
optional EncodingDecl *>
optional SDDecl *>
optional S *>
skipString "?>"
/-- https://www.w3.org/TR/xml/#NT-Comment -/
def Comment : Parsec String :=
let notDash := Char.toString <$> satisfy (λ c => c ≠ '-')
skipString "<!--" *>
Array.foldl String.append "" <$> many (notDash <|> (do
let d ← pchar '-'
let c ← notDash
pure $ d.toString ++ c))
<* skipString "-->"
/-- https://www.w3.org/TR/xml/#NT-PITarget -/
def PITarget : Parsec String :=
Name <* (skipChar 'X' <|> skipChar 'x') <* (skipChar 'M' <|> skipChar 'm') <* (skipChar 'L' <|> skipChar 'l')
/-- https://www.w3.org/TR/xml/#NT-PI -/
def PI : Parsec Unit := do
skipString "<?"
<* PITarget <*
optional (S *> manyChars (notFollowedBy (skipString "?>") *> Char))
skipString "?>"
/-- https://www.w3.org/TR/xml/#NT-Misc -/
def Misc : Parsec Unit :=
Comment *> pure () <|> PI <|> S *> pure ()
/-- https://www.w3.org/TR/xml/#NT-SystemLiteral -/
def SystemLiteral : Parsec String :=
pchar '"' *> manyChars (satisfy λ c => c ≠ '"') <* pchar '"'
<|> pchar '\'' *> manyChars (satisfy λ c => c ≠ '\'') <* '\''
/-- https://www.w3.org/TR/xml/#NT-PubidChar -/
def PubidChar : Parsec LeanChar :=
asciiLetter <|> digit <|> endl <|> attempt do
let c ← anyChar
if "-'()+,./:=?;!*#@$_%".contains c then c else fail "PublidChar expected"
/-- https://www.w3.org/TR/xml/#NT-PubidLiteral -/
def PubidLiteral : Parsec String :=
pchar '"' *> manyChars PubidChar <* pchar '"'
<|> pchar '\'' *> manyChars (attempt do
let c ← PubidChar
if c = '\'' then fail "'\\'' not expected" else c) <* pchar '\''
/-- https://www.w3.org/TR/xml/#NT-ExternalID -/
def ExternalID : Parsec Unit :=
skipString "SYSTEM" *> S *> SystemLiteral *> pure ()
<|> skipString "PUBLIC" *> S *> PubidLiteral *> S *> SystemLiteral *> pure ()
/-- https://www.w3.org/TR/xml/#NT-Mixed -/
def Mixed : Parsec Unit :=
(do
skipChar '('
optional S *>
skipString "#PCDATA" *>
many (optional S *> skipChar '|' *> optional S *> Name) *>
optional S *>
skipString ")*")
<|> skipChar '(' *> (optional S) *> skipString "#PCDATA" <* (optional S) <* skipChar ')'
mutual
/-- https://www.w3.org/TR/xml/#NT-cp -/
partial def cp : Parsec Unit :=
(Name *> pure () <|> choice <|> seq) <* optional (skipChar '?' <|> skipChar '*' <|> skipChar '+')
/-- https://www.w3.org/TR/xml/#NT-choice -/
partial def choice : Parsec Unit := do
skipChar '('
optional S *>
cp
many1 (optional S *> skipChar '|' *> optional S *> cp) *>
optional S *>
skipChar ')'
/-- https://www.w3.org/TR/xml/#NT-seq -/
partial def seq : Parsec Unit := do
skipChar '('
optional S *>
cp
many (optional S *> skipChar ',' *> optional S *> cp) *>
optional S *>
skipChar ')'
end
/-- https://www.w3.org/TR/xml/#NT-children -/
def children : Parsec Unit :=
(choice <|> seq) <* optional (skipChar '?' <|> skipChar '*' <|> skipChar '+')
/-- https://www.w3.org/TR/xml/#NT-contentspec -/
def contentspec : Parsec Unit := do
skipString "EMPTY" <|> skipString "ANY" <|> Mixed <|> children
/-- https://www.w3.org/TR/xml/#NT-elementdecl -/
def elementDecl : Parsec Unit := do
skipString "<!ELEMENT"
S *>
Name *>
contentspec *>
optional S *>
skipChar '>'
/-- https://www.w3.org/TR/xml/#NT-StringType -/
def StringType : Parsec Unit :=
skipString "CDATA"
/-- https://www.w3.org/TR/xml/#NT-TokenizedType -/
def TokenizedType : Parsec Unit :=
skipString "ID"
<|> skipString "IDREF"
<|> skipString "IDREFS"
<|> skipString "ENTITY"
<|> skipString "ENTITIES"
<|> skipString "NMTOKEN"
<|> skipString "NMTOKENS"
/-- https://www.w3.org/TR/xml/#NT-NotationType -/
def NotationType : Parsec Unit := do
skipString "NOTATION"
S *>
skipChar '(' <*
optional S
Name *> many (optional S *> skipChar '|' *> optional S *> Name) *>
optional S *>
skipChar ')'
/-- https://www.w3.org/TR/xml/#NT-Nmtoken -/
def Nmtoken : Parsec String := do
many1Chars NameChar
/-- https://www.w3.org/TR/xml/#NT-Enumeration -/
def Enumeration : Parsec Unit := do
skipChar '('
optional S *>
Nmtoken *> many (optional S *> skipChar '|' *> optional S *> Nmtoken) *>
optional S *>
skipChar ')'
/-- https://www.w3.org/TR/xml/#NT-EnumeratedType -/
def EnumeratedType : Parsec Unit :=
NotationType <|> Enumeration
/-- https://www.w3.org/TR/xml/#NT-AttType -/
def AttType : Parsec Unit :=
StringType <|> TokenizedType <|> EnumeratedType
def predefinedEntityToChar : String → Option LeanChar
| "lt" => some '<'
| "gt" => some '>'
| "amp" => some '&'
| "apos" => some '\''
| "quot" => some '"'
| _ => none
/-- https://www.w3.org/TR/xml/#NT-EntityRef -/
def EntityRef : Parsec $ Option LeanChar := attempt $
skipChar '&' *> predefinedEntityToChar <$> Name <* skipChar ';'
@[inline]
def hexDigitToNat (c : LeanChar) : Nat :=
if '0' ≤ c ∧ c ≤ '9' then c.toNat - '0'.toNat
else if 'a' ≤ c ∧ c ≤ 'f' then c.toNat - 'a'.toNat + 10
else c.toNat - 'A'.toNat + 10
def digitsToNat (base : Nat) (digits : Array Nat) : Nat :=
digits.foldl (λ r d => r * base + d) 0
/-- https://www.w3.org/TR/xml/#NT-CharRef -/
def CharRef : Parsec LeanChar := do
skipString "&#"
let charCode ←
digitsToNat 10 <$> many1 (hexDigitToNat <$> digit)
<|> skipChar 'x' *> digitsToNat 16 <$> many1 (hexDigitToNat <$> hexDigit)
skipChar ';'
Char.ofNat charCode
/-- https://www.w3.org/TR/xml/#NT-Reference -/
def Reference : Parsec $ Option LeanChar :=
EntityRef <|> some <$> CharRef
/-- https://www.w3.org/TR/xml/#NT-AttValue -/
def AttValue : Parsec String := do
let chars ←
(do
skipChar '"'
many (some <$> satisfy (λ c => c ≠ '<' ∧ c ≠ '&' ∧ c ≠ '"') <|> Reference) <*
skipChar '"')
<|> (do
skipChar '\''
many (some <$> satisfy (λ c => c ≠ '<' ∧ c ≠ '&' ∧ c ≠ '\'') <|> Reference) <*
skipChar '\'')
return chars.foldl (λ s c => if let some c := c then s.push c else s) ""
/-- https://www.w3.org/TR/xml/#NT-DefaultDecl -/
def DefaultDecl : Parsec Unit :=
skipString "#REQUIRED"
<|> skipString "#IMPLIED"
<|> optional (skipString "#FIXED" <* S) *> AttValue *> pure ()
/-- https://www.w3.org/TR/xml/#NT-AttDef -/
def AttDef : Parsec Unit :=
S *> Name *> S *> AttType *> S *> DefaultDecl
/-- https://www.w3.org/TR/xml/#NT-AttlistDecl -/
def AttlistDecl : Parsec Unit :=
skipString "<!ATTLIST" *> S *> Name *> many AttDef *> optional S *> skipChar '>'
/-- https://www.w3.org/TR/xml/#NT-PEReference -/
def PEReference : Parsec Unit :=
skipChar '%' *> Name *> skipChar ';'
/-- https://www.w3.org/TR/xml/#NT-EntityValue -/
def EntityValue : Parsec String := do
let chars ←
(do
skipChar '"'
many (some <$> satisfy (λ c => c ≠ '%' ∧ c ≠ '&' ∧ c ≠ '"') <|> PEReference *> pure none <|> Reference) <*
skipChar '"')
<|> (do
skipChar '\''
many (some <$> satisfy (λ c => c ≠ '%' ∧ c ≠ '&' ∧ c ≠ '\'') <|> PEReference *> pure none <|> Reference) <*
skipChar '\'')
return chars.foldl (λ s c => if let some c := c then s.push c else s) ""
/-- https://www.w3.org/TR/xml/#NT-NDataDecl -/
def NDataDecl : Parsec Unit :=
S *> skipString "NDATA" <* S <* Name
/-- https://www.w3.org/TR/xml/#NT-EntityDef -/
def EntityDef : Parsec Unit :=
EntityValue *> pure () <|> (ExternalID <* optional NDataDecl)
/-- https://www.w3.org/TR/xml/#NT-GEDecl -/
def GEDecl : Parsec Unit :=
skipString "<!ENTITY" *> S *> Name *> S *> EntityDef *> optional S *> skipChar '>'
/-- https://www.w3.org/TR/xml/#NT-PEDef -/
def PEDef : Parsec Unit :=
EntityValue *> pure () <|> ExternalID
/-- https://www.w3.org/TR/xml/#NT-PEDecl -/
def PEDecl : Parsec Unit :=
skipString "<!ENTITY" *> S *> skipChar '%' *> S *> Name *> PEDef *> optional S *> skipChar '>'
/-- https://www.w3.org/TR/xml/#NT-EntityDecl -/
def EntityDecl : Parsec Unit :=
GEDecl <|> PEDecl
/-- https://www.w3.org/TR/xml/#NT-PublicID -/
def PublicID : Parsec Unit :=
skipString "PUBLIC" <* S <* PubidLiteral
/-- https://www.w3.org/TR/xml/#NT-NotationDecl -/
def NotationDecl : Parsec Unit :=
skipString "<!NOTATION" *> S *> Name *> (ExternalID <|> PublicID) *> optional S *> skipChar '>'
/-- https://www.w3.org/TR/xml/#NT-markupdecl -/
def markupDecl : Parsec Unit :=
elementDecl <|> AttlistDecl <|> EntityDecl <|> NotationDecl <|> PI <|> (Comment *> pure ())
/-- https://www.w3.org/TR/xml/#NT-DeclSep -/
def DeclSep : Parsec Unit :=
PEReference <|> S *> pure ()
/-- https://www.w3.org/TR/xml/#NT-intSubset -/
def intSubset : Parsec Unit :=
many (markupDecl <|> DeclSep) *> pure ()
/-- https://www.w3.org/TR/xml/#NT-doctypedecl -/
def doctypedecl : Parsec Unit := do
skipString "<!DOCTYPE"
S *>
Name *>
optional (S *> ExternalID) *> pure ()
<* optional S
optional (skipChar '[' *> intSubset <* skipChar ']' <* optional S) *>
skipChar '>'
/-- https://www.w3.org/TR/xml/#NT-prolog -/
def prolog : Parsec Unit :=
optional XMLdecl *>
many Misc *>
optional (doctypedecl <* many Misc) *> pure ()
/-- https://www.w3.org/TR/xml/#NT-Attribute -/
def Attribute : Parsec (String × String) := do
let name ← Name
Eq
let value ← AttValue
(name, value)
protected def elementPrefix : Parsec (Array Content → Element) := do
skipChar '<'
let name ← Name
let attributes ← many (S *> Attribute)
optional S *> pure ()
Element.Element name (Std.RBMap.fromList attributes.toList compare)
/-- https://www.w3.org/TR/xml/#NT-EmptyElemTag -/
def EmptyElemTag (elem : Array Content → Element) : Parsec Element := do
skipString "/>" *> pure (elem #[])
/-- https://www.w3.org/TR/xml/#NT-STag -/
def STag (elem : Array Content → Element) : Parsec (Array Content → Element) := do
skipChar '>' *> pure elem
/-- https://www.w3.org/TR/xml/#NT-ETag -/
def ETag : Parsec Unit :=
skipString "</" *> Name *> optional S *> skipChar '>'
/-- https://www.w3.org/TR/xml/#NT-CDStart -/
def CDStart : Parsec Unit :=
skipString "<![CDATA["
/-- https://www.w3.org/TR/xml/#NT-CDEnd -/
def CDEnd : Parsec Unit :=
skipString "]]>"
/-- https://www.w3.org/TR/xml/#NT-CData -/
def CData : Parsec String :=
manyChars (notFollowedBy (skipString "]]>") *> anyChar)
/-- https://www.w3.org/TR/xml/#NT-CDSect -/
def CDSect : Parsec String :=
CDStart *> CData <* CDEnd
/-- https://www.w3.org/TR/xml/#NT-CharData -/
def CharData : Parsec String :=
notFollowedBy (skipString "]]>") *> manyChars (satisfy λ c => c ≠ '<' ∧ c ≠ '&')
mutual
/-- https://www.w3.org/TR/xml/#NT-content -/
partial def content : Parsec (Array Content) := do
let x ← optional (Content.Character <$> CharData)
let xs ← many do
let y ←
attempt (some <$> Content.Element <$> element)
<|> (do let c ← Reference; c.map (Content.Character ∘ Char.toString))
<|> some <$> Content.Character <$> CDSect
<|> PI *> none
<|> some <$> Content.Comment <$> Comment
let z ← optional (Content.Character <$> CharData)
#[y, z]
let xs := #[x] ++ xs.concatMap id |>.filterMap id
let mut res := #[]
for x in xs do
if res.size > 0 then
match res.back, x with
| Content.Character x, Content.Character y => res ← res.set! (res.size - 1) (Content.Character $ x ++ y)
| _, x => res ← res.push x
else res ← res.push x
res
/-- https://www.w3.org/TR/xml/#NT-element -/
partial def element : Parsec Element := do
let elem ← Parser.elementPrefix
EmptyElemTag elem <|> STag elem <*> content <* ETag
end
/-- https://www.w3.org/TR/xml/#NT-document -/
def document : Parsec Element := prolog *> element <* many Misc <* eof
end Parser
def parse (s : String) : Except String Element :=
match Xml.Parser.document s.mkIterator with
| Parsec.ParseResult.success _ res => Except.ok res
| Parsec.ParseResult.error it err => Except.error s!"offset {it.i.repr}: {err}\n{(it.prevn 10).extract it}"
end Xml