From 0d41fd03f794654bea99259ece9e1f6278bbb316 Mon Sep 17 00:00:00 2001 From: Daniel Fabian Date: Fri, 25 Jun 2021 13:44:08 +0000 Subject: [PATCH] feat: add xml parser. in order to generate the LLVM extern declarations we want to use a generator that spits out XML. Hence adding a small XML parser. --- src/Init/Data/String/Basic.lean | 3 + src/Lean/Data.lean | 2 + src/Lean/Data/Json/Parser.lean | 139 ++------- src/Lean/Data/Parsec.lean | 170 +++++++++++ src/Lean/Data/Xml.lean | 2 + src/Lean/Data/Xml/Basic.lean | 41 +++ src/Lean/Data/Xml/Parser.lean | 488 ++++++++++++++++++++++++++++++++ 7 files changed, 733 insertions(+), 112 deletions(-) create mode 100644 src/Lean/Data/Parsec.lean create mode 100644 src/Lean/Data/Xml.lean create mode 100644 src/Lean/Data/Xml/Basic.lean create mode 100644 src/Lean/Data/Xml/Parser.lean diff --git a/src/Init/Data/String/Basic.lean b/src/Init/Data/String/Basic.lean index 204a4371a7..8f0e23147f 100644 --- a/src/Init/Data/String/Basic.lean +++ b/src/Init/Data/String/Basic.lean @@ -195,6 +195,9 @@ structure Iterator where def mkIterator (s : String) : Iterator := ⟨s, 0⟩ + +instance : DecidableEq Iterator := fun ⟨s₁, i₁⟩ ⟨s₂, i₂⟩ => + if h₁ : i₁ = i₂ ∧ s₁ = s₂ then isTrue (by simp_all) else isFalse (λ h₂ => by simp_all) namespace Iterator def toString : Iterator → String diff --git a/src/Lean/Data.lean b/src/Lean/Data.lean index 61f35a40fe..de84c8f733 100644 --- a/src/Lean/Data.lean +++ b/src/Lean/Data.lean @@ -4,7 +4,9 @@ Released under Apache 2.0 license as described in the file LICENSE. Authors: Sebastian Ullrich -/ import Lean.Data.Format +import Lean.Data.Parsec import Lean.Data.Json +import Lean.Data.Xml import Lean.Data.JsonRpc import Lean.Data.KVMap import Lean.Data.LBool diff --git a/src/Lean/Data/Json/Parser.lean b/src/Lean/Data/Json/Parser.lean index c3d8b434fc..1255653ae6 100644 --- a/src/Lean/Data/Json/Parser.lean +++ b/src/Lean/Data/Json/Parser.lean @@ -5,104 +5,19 @@ Released under Apache 2.0 license as described in the file LICENSE. Authors: Gabriel Ebner, Marc Huisinga -/ import Lean.Data.Json.Basic +import Lean.Data.Parsec namespace Lean open Std (RBNode RBNode.singleton RBNode.leaf) -inductive Quickparse.Result (α : Type) where - | success (pos : String.Iterator) (res : α) : Result α - | error (pos : String.Iterator) (err : String) : Result α - -def Quickparse (α : Type) : Type := String.Iterator → Lean.Quickparse.Result α - -instance (α : Type) : Inhabited (Quickparse α) := - ⟨fun it => Quickparse.Result.error it ""⟩ - -namespace Quickparse - -open Result - -partial def skipWs (it : String.Iterator) : String.Iterator := - if it.hasNext then - let c := it.curr - if c = '\u0009' ∨ c = '\u000a' ∨ c = '\u000d' ∨ c = '\u0020' then - skipWs it.next - else - it - else - it - -@[inline] -protected def pure (a : α) : Quickparse α := fun it => - success it a - -@[inline] -protected def bind {α β : Type} (f : Quickparse α) (g : α → Quickparse β) : Quickparse β := fun it => - match f it with - | success rem a => g a rem - | error pos msg => error pos msg - -@[inline] -def fail (msg : String) : Quickparse α := fun it => - error it msg - -@[inline] -instance : Monad Quickparse := - { pure := @Quickparse.pure, bind := @Quickparse.bind } - -def unexpectedEndOfInput := "unexpected end of input" - -@[inline] -def peek? : Quickparse (Option Char) := fun it => - if it.hasNext then - success it it.curr - else - success it none - -@[inline] -def peek! : Quickparse Char := do - let some c ← peek? | fail unexpectedEndOfInput - c - -@[inline] -def skip : Quickparse Unit := fun it => - success it.next () - -@[inline] -def next : Quickparse Char := do - let c ← peek! - skip - c - -def expect (s : String) : Quickparse Unit := fun it => - if it.extract (it.forward s.length) = s then - success (it.forward s.length) () - else - error it s!"expected: {s}" - -@[inline] -def ws : Quickparse Unit := fun it => - success (skipWs it) () - -def expectedEndOfInput := "expected end of input" - -@[inline] -def eoi : Quickparse Unit := fun it => - if it.hasNext then - error it expectedEndOfInput - else - success it () - -end Quickparse - namespace Json.Parser -open Quickparse +open Lean.Parsec @[inline] -def hexChar : Quickparse Nat := do - let c ← next +def hexChar : Parsec Nat := do + let c ← anyChar if '0' ≤ c ∧ c ≤ '9' then pure $ c.val.toNat - '0'.val.toNat else if 'a' ≤ c ∧ c ≤ 'f' then @@ -112,8 +27,8 @@ def hexChar : Quickparse Nat := do else fail "invalid hex character" -def escapedChar : Quickparse Char := do - let c ← next +def escapedChar : Parsec Char := do + let c ← anyChar match c with | '\\' => '\\' | '"' => '"' @@ -128,13 +43,13 @@ def escapedChar : Quickparse Char := do Char.ofNat $ 4096*u1 + 256*u2 + 16*u3 + u4 | _ => fail "illegal \\u escape" -partial def strCore (acc : String) : Quickparse String := do +partial def strCore (acc : String) : Parsec String := do let c ← peek! if c = '"' then -- " skip acc else - let c ← next + let c ← anyChar let ec ← if c = '\\' then escapedChar @@ -147,9 +62,9 @@ partial def strCore (acc : String) : Quickparse String := do fail "unexpected character in string" strCore (acc.push ec) -def str : Quickparse String := strCore "" +def str : Parsec String := strCore "" -partial def natCore (acc digits : Nat) : Quickparse (Nat × Nat) := do +partial def natCore (acc digits : Nat) : Parsec (Nat × Nat) := do let some c ← peek? | (acc, digits) if '0' ≤ c ∧ c ≤ '9' then skip @@ -159,7 +74,7 @@ partial def natCore (acc digits : Nat) : Quickparse (Nat × Nat) := do (acc, digits) @[inline] -def lookahead (p : Char → Prop) (desc : String) [DecidablePred p] : Quickparse Unit := do +def lookahead (p : Char → Prop) (desc : String) [DecidablePred p] : Parsec Unit := do let c ← peek! if p c then () @@ -167,22 +82,22 @@ def lookahead (p : Char → Prop) (desc : String) [DecidablePred p] : Quickparse fail $ "expected " ++ desc @[inline] -def natNonZero : Quickparse Nat := do +def natNonZero : Parsec Nat := do lookahead (fun c => '1' ≤ c ∧ c ≤ '9') "1-9" let (n, _) ← natCore 0 0 n @[inline] -def natNumDigits : Quickparse (Nat × Nat) := do +def natNumDigits : Parsec (Nat × Nat) := do lookahead (fun c => '0' ≤ c ∧ c ≤ '9') "digit" natCore 0 0 @[inline] -def natMaybeZero : Quickparse Nat := do +def natMaybeZero : Parsec Nat := do let (n, _) ← natNumDigits n -def num : Quickparse JsonNumber := do +def num : Parsec JsonNumber := do let c ← peek! let sign : Int ← if c = '-' then @@ -224,10 +139,10 @@ def num : Quickparse JsonNumber := do else res -partial def arrayCore (anyCore : Unit → Quickparse Json) (acc : Array Json) : Quickparse (Array Json) := do +partial def arrayCore (anyCore : Unit → Parsec Json) (acc : Array Json) : Parsec (Array Json) := do let hd ← anyCore () let acc' := acc.push hd - let c ← next + let c ← anyChar if c = ']' then ws acc' @@ -237,12 +152,12 @@ partial def arrayCore (anyCore : Unit → Quickparse Json) (acc : Array Json) : else fail "unexpected character in array" -partial def objectCore (anyCore : Unit → Quickparse Json) : Quickparse (RBNode String (fun _ => Json)) := do +partial def objectCore (anyCore : Unit → Parsec Json) : Parsec (RBNode String (fun _ => Json)) := do lookahead (fun c => c = '"') "\""; skip; -- " let k ← strCore ""; ws lookahead (fun c => c = ':') ":"; skip; ws let v ← anyCore () - let c ← next + let c ← anyChar if c = '}' then ws RBNode.singleton k v @@ -255,7 +170,7 @@ partial def objectCore (anyCore : Unit → Quickparse Json) : Quickparse (RBNode -- takes a unit parameter so that -- we can use the equation compiler and recursion -partial def anyCore (u : Unit) : Quickparse Json := do +partial def anyCore (u : Unit) : Parsec Json := do let c ← peek! if c = '[' then skip; ws @@ -281,13 +196,13 @@ partial def anyCore (u : Unit) : Quickparse Json := do ws Json.str s else if c = 'f' then - expect "false"; ws + skipString "false"; ws Json.bool false else if c = 't' then - expect "true"; ws + skipString "true"; ws Json.bool true else if c = 'n' then - expect "null"; ws + skipString "null"; ws Json.null else if c = '-' ∨ ('0' ≤ c ∧ c ≤ '9') then let n ← num @@ -297,10 +212,10 @@ partial def anyCore (u : Unit) : Quickparse Json := do fail "unexpected input" -def any : Quickparse Json := do +def any : Parsec Json := do ws let res ← anyCore () - eoi + eof res end Json.Parser @@ -309,8 +224,8 @@ namespace Json def parse (s : String) : Except String Lean.Json := match Json.Parser.any s.mkIterator with - | Quickparse.Result.success _ res => Except.ok res - | Quickparse.Result.error it err => Except.error s!"offset {it.i.repr}: {err}" + | Parsec.ParseResult.success _ res => Except.ok res + | Parsec.ParseResult.error it err => Except.error s!"offset {it.i.repr}: {err}" end Json diff --git a/src/Lean/Data/Parsec.lean b/src/Lean/Data/Parsec.lean new file mode 100644 index 0000000000..b7ab019d02 --- /dev/null +++ b/src/Lean/Data/Parsec.lean @@ -0,0 +1,170 @@ + +/- +Copyright (c) 2021 Microsoft Corporation. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Author: Dany Fabian +-/ + +namespace Lean + +namespace Parsec +inductive ParseResult (α : Type) where + | success (pos : String.Iterator) (res : α) + | error (pos : String.Iterator) (err : String) + deriving Repr +end Parsec + +def Parsec (α : Type) : Type := String.Iterator → Lean.Parsec.ParseResult α + +namespace Parsec + +open ParseResult + +instance (α : Type) : Inhabited (Parsec α) := + ⟨λ it => error it ""⟩ + +@[inline] +protected def pure (a : α) : Parsec α := λ it => + success it a + +@[inline] +def bind {α β : Type} (f : Parsec α) (g : α → Parsec β) : Parsec β := λ it => + match f it with + | success rem a => g a rem + | error pos msg => error pos msg + +instance : Monad Parsec := + { pure := Parsec.pure, bind } + +@[inline] +def fail (msg : String) : Parsec α := fun it => + error it msg + +@[inline] +def orElse (p q : Parsec α) : Parsec α := fun it => + match p it with + | success rem a => success rem a + | error rem err => + if it = rem then q it else error rem err + +@[inline] +def attempt (p : Parsec α) : Parsec α := λ it => + match p it with + | success rem res => success rem res + | error _ err => error it err + +instance : Alternative Parsec := +{ failure := fail "", orElse } + +def expectedEndOfInput := "expected end of input" + +@[inline] +def eof : Parsec Unit := fun it => + if it.hasNext then + error it expectedEndOfInput + else + success it () + +@[inline] +partial def manyCore (p : Parsec α) (acc : Array α) : Parsec $ Array α := + (do manyCore p (acc.push $ ←p)) + <|> pure acc + +@[inline] +def many (p : Parsec α) : Parsec $ Array α := manyCore p #[] + +@[inline] +def many1 (p : Parsec α) : Parsec $ Array α := do manyCore p #[←p] + +@[inline] +partial def manyCharsCore (p : Parsec Char) (acc : String) : Parsec String := + (do manyCharsCore p (acc.push $ ←p)) + <|> pure acc + +@[inline] +def manyChars (p : Parsec Char) : Parsec String := manyCharsCore p "" + +@[inline] +def many1Chars (p : Parsec Char) : Parsec String := do manyCharsCore p (←p).toString + +def pstring (s : String) : Parsec String := λ it => + let substr := it.extract (it.forward s.length) + if substr = s then + success (it.forward s.length) substr + else + error it s!"expected: {s}" + +@[inline] +def skipString (s : String) : Parsec Unit := pstring s *> pure () + +def unexpectedEndOfInput := "unexpected end of input" + +@[inline] +def anyChar : Parsec Char := λ it => + if it.hasNext then success it.next it.curr else error it unexpectedEndOfInput + +@[inline] +def pchar (c : Char) : Parsec Char := attempt do + if (←anyChar) = c then pure c else fail s!"expected: '{c}'" + +@[inline] +def skipChar (c : Char) : Parsec Unit := pchar c *> pure () + +@[inline] +def digit : Parsec Char := attempt do + let c ← anyChar + if '0' ≤ c ∧ c ≤ '9' then c else fail s!"digit expected" + +@[inline] +def hexDigit : Parsec Char := attempt do + let c ← anyChar + if ('0' ≤ c ∧ c ≤ '9') + ∨ ('a' ≤ c ∧ c ≤ 'a') + ∨ ('A' ≤ c ∧ c ≤ 'A') then c else fail s!"hex digit expected" + +@[inline] +def asciiLetter : Parsec Char := attempt do + let c ← anyChar + if ('A' ≤ c ∧ c ≤ 'Z') ∨ ('a' ≤ c ∧ c ≤ 'z') then c else fail s!"ASCII letter expected" + +@[inline] +def satisfy (p : Char → Bool) : Parsec Char := attempt do + let c ← anyChar + if p c then c else fail "condition not satisfied" + +@[inline] +def notFollowedBy (p : Parsec α) : Parsec Unit := λ it => + match p it with + | success _ _ => error it "" + | error _ _ => success it () + +partial def skipWs (it : String.Iterator) : String.Iterator := + if it.hasNext then + let c := it.curr + if c = '\u0009' ∨ c = '\u000a' ∨ c = '\u000d' ∨ c = '\u0020' then + skipWs it.next + else + it + else + it + +@[inline] +def peek? : Parsec (Option Char) := fun it => + if it.hasNext then + success it it.curr + else + success it none + +@[inline] +def peek! : Parsec Char := do + let some c ← peek? | fail unexpectedEndOfInput + c + +@[inline] +def skip : Parsec Unit := fun it => + success it.next () + +@[inline] +def ws : Parsec Unit := fun it => + success (skipWs it) () +end Parsec diff --git a/src/Lean/Data/Xml.lean b/src/Lean/Data/Xml.lean new file mode 100644 index 0000000000..8c6ad55a2f --- /dev/null +++ b/src/Lean/Data/Xml.lean @@ -0,0 +1,2 @@ +import Lean.Data.Xml.Basic +import Lean.Data.Xml.Parser diff --git a/src/Lean/Data/Xml/Basic.lean b/src/Lean/Data/Xml/Basic.lean new file mode 100644 index 0000000000..3f9d4fb507 --- /dev/null +++ b/src/Lean/Data/Xml/Basic.lean @@ -0,0 +1,41 @@ + +/- +Copyright (c) 2021 Microsoft Corporation. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Author: Dany Fabian +-/ + +import Std.Data.RBMap +namespace Lean +namespace Xml + +def Attributes := Std.RBMap String String compare +instance : ToString Attributes := ⟨λ as => as.fold (λ s n v => s ++ s!" {n}=\"{v}\"") ""⟩ + +mutual +inductive Element +| Element + (name : String) + (attributes : Attributes) + (content : Array Content) + +inductive Content +| Element (element : Element) +| Comment (comment : String) +| Character (content : String) +deriving Inhabited +end + +mutual +private partial def eToString : Element → String +| Element.Element n a c => s!"<{n}{a}>{c.map cToString |>.foldl (· ++ ·) ""}" + +private partial def cToString : Content → String +| Content.Element e => eToString e +| Content.Comment c => s!"" +| Content.Character c => c + +end +instance : ToString Element := ⟨eToString⟩ +instance : ToString Content := ⟨cToString⟩ + diff --git a/src/Lean/Data/Xml/Parser.lean b/src/Lean/Data/Xml/Parser.lean new file mode 100644 index 0000000000..25138df817 --- /dev/null +++ b/src/Lean/Data/Xml/Parser.lean @@ -0,0 +1,488 @@ +/- +Copyright (c) 2021 Microsoft Corporation. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Author: Dany Fabian +-/ + +import Lean.Data.Parsec +import Lean.Data.Xml.Basic +open IO +open System +open Lean + +namespace Lean +namespace Xml + +namespace Parser +open Lean.Parsec +open Parsec.ParseResult + +abbrev LeanChar := Char + +/-- consume a newline character sequence pretending, that we read '\n'. As per spec: + https://www.w3.org/TR/xml/#sec-line-ends -/ +def endl : Parsec LeanChar := (skipString "\r\n" <|> skipChar '\r' <|> skipChar '\n') *> pure '\n' + +def quote (p : Parsec α) : Parsec α := + skipChar '\'' *> p <* skipChar '\'' + <|> skipChar '"' *> p <* skipChar '"' + +/-- https://www.w3.org/TR/xml/#NT-Char -/ +def Char : Parsec LeanChar := + (attempt do + let c ← anyChar + let cNat ← c.toNat + if (0x20 ≤ cNat ∧ cNat ≤ 0xD7FF) + ∨ (0xE000 ≤ cNat ∧ cNat ≤ 0xFFFD) + ∨ (0x10000 ≤ cNat ∧ cNat ≤ 0x10FFFF) then c else fail "expected xml char") + <|> pchar '\t' <|> endl + +/-- https://www.w3.org/TR/xml/#NT-S -/ +def S : Parsec String := + many1Chars (pchar ' ' <|> endl <|> pchar '\t') + +/-- https://www.w3.org/TR/xml/#NT-Eq -/ +def Eq : Parsec Unit := + optional S *> skipChar '=' <* optional S + +private def nameStartCharRanges : Array (Nat × Nat) := + #[(0xC0, 0xD6), + (0xD8, 0xF6), + (0xF8, 0x2FF), + (0x370, 0x37D), + (0x37F, 0x1FFF), + (0x200C, 0x200D), + (0x2070, 0x218F), + (0x2C00, 0x2FEF), + (0x3001, 0xD7FF), + (0xF900, 0xFDCF), + (0xFDF0, 0xFFFD), + (0x10000, 0xEFFFF)] + +/-- https://www.w3.org/TR/xml/#NT-NameStartChar -/ +def NameStartChar : Parsec LeanChar := attempt do + let c ← anyChar + if ('A' ≤ c ∧ c ≤ 'Z') ∨ ('a' ≤ c ∧ c ≤ 'z') then c + else if c = ':' ∨ c = '_' then c + else + let cNum := c.toNat + if nameStartCharRanges.any (fun (lo, hi) => lo ≤ cNum ∧ cNum ≤ hi) then c + else fail "expected a name character" + +/-- https://www.w3.org/TR/xml/#NT-NameChar -/ +def NameChar : Parsec LeanChar := + NameStartChar <|> digit <|> pchar '-' <|> pchar '.' <|> pchar '\xB7' + <|> satisfy (λ c => ('\u0300' ≤ c ∧ c ≤ '\u036F') ∨ ('\u203F' ≤ c ∧ c ≤ '\u2040')) + +/-- https://www.w3.org/TR/xml/#NT-Name -/ +def Name : Parsec String := do + let x ← NameStartChar + manyCharsCore NameChar x.toString + +/-- https://www.w3.org/TR/xml/#NT-VersionNum -/ +def VersionNum : Parsec Unit := + skipString "1." <* (many1 digit) + +/-- https://www.w3.org/TR/xml/#NT-VersionInfo -/ +def VersionInfo : Parsec Unit := do + S *> + skipString "version" + Eq + quote VersionNum + +/-- https://www.w3.org/TR/xml/#NT-EncName -/ +def EncName : Parsec String := do + let x ← asciiLetter + manyCharsCore (asciiLetter <|> digit <|> pchar '-' <|> pchar '_' <|> pchar '.') x.toString + +/-- https://www.w3.org/TR/xml/#NT-EncodingDecl -/ +def EncodingDecl : Parsec String := do + S *> + skipString "encoding" + Eq + quote EncName + +/-- https://www.w3.org/TR/xml/#NT-SDDecl -/ +def SDDecl : Parsec String := do + S *> skipString "standalone" *> Eq *> quote (pstring "yes" <|> pstring "no") + +/-- https://www.w3.org/TR/xml/#NT-XMLDecl -/ +def XMLdecl : Parsec Unit := do + skipString " + optional SDDecl *> + optional S *> + skipString "?>" + +/-- https://www.w3.org/TR/xml/#NT-Comment -/ +def Comment : Parsec String := + let notDash := Char.toString <$> satisfy (λ c => c ≠ '-') + skipString "" + +/-- https://www.w3.org/TR/xml/#NT-PITarget -/ +def PITarget : Parsec String := + Name <* (skipChar 'X' <|> skipChar 'x') <* (skipChar 'M' <|> skipChar 'm') <* (skipChar 'L' <|> skipChar 'l') + +/-- https://www.w3.org/TR/xml/#NT-PI -/ +def PI : Parsec Unit := do + skipString " manyChars (notFollowedBy (skipString "?>") *> Char)) + skipString "?>" + +/-- https://www.w3.org/TR/xml/#NT-Misc -/ +def Misc : Parsec Unit := + Comment *> pure () <|> PI <|> S *> pure () + +/-- https://www.w3.org/TR/xml/#NT-SystemLiteral -/ +def SystemLiteral : Parsec String := + pchar '"' *> manyChars (satisfy λ c => c ≠ '"') <* pchar '"' + <|> pchar '\'' *> manyChars (satisfy λ c => c ≠ '\'') <* '\'' + +/-- https://www.w3.org/TR/xml/#NT-PubidChar -/ +def PubidChar : Parsec LeanChar := + asciiLetter <|> digit <|> endl <|> attempt do + let c ← anyChar + if "-'()+,./:=?;!*#@$_%".contains c then c else fail "PublidChar expected" + +/-- https://www.w3.org/TR/xml/#NT-PubidLiteral -/ +def PubidLiteral : Parsec String := + pchar '"' *> manyChars PubidChar <* pchar '"' + <|> pchar '\'' *> manyChars (attempt do + let c ← PubidChar + if c = '\'' then fail "'\\'' not expected" else c) <* pchar '\'' + +/-- https://www.w3.org/TR/xml/#NT-ExternalID -/ +def ExternalID : Parsec Unit := + skipString "SYSTEM" *> S *> SystemLiteral *> pure () + <|> skipString "PUBLIC" *> S *> PubidLiteral *> S *> SystemLiteral *> pure () + +/-- https://www.w3.org/TR/xml/#NT-Mixed -/ +def Mixed : Parsec Unit := + (do + skipChar '(' + optional S *> + skipString "#PCDATA" *> + many (optional S *> skipChar '|' *> optional S *> Name) *> + optional S *> + skipString ")*") + <|> skipChar '(' *> (optional S) *> skipString "#PCDATA" <* (optional S) <* skipChar ')' + +mutual + /-- https://www.w3.org/TR/xml/#NT-cp -/ + partial def cp : Parsec Unit := + (Name *> pure () <|> choice <|> seq) <* optional (skipChar '?' <|> skipChar '*' <|> skipChar '+') + + /-- https://www.w3.org/TR/xml/#NT-choice -/ + partial def choice : Parsec Unit := do + skipChar '(' + optional S *> + cp + many1 (optional S *> skipChar '|' *> optional S *> cp) *> + optional S *> + skipChar ')' + + /-- https://www.w3.org/TR/xml/#NT-seq -/ + partial def seq : Parsec Unit := do + skipChar '(' + optional S *> + cp + many (optional S *> skipChar ',' *> optional S *> cp) *> + optional S *> + skipChar ')' +end + +/-- https://www.w3.org/TR/xml/#NT-children -/ +def children : Parsec Unit := + (choice <|> seq) <* optional (skipChar '?' <|> skipChar '*' <|> skipChar '+') + +/-- https://www.w3.org/TR/xml/#NT-contentspec -/ +def contentspec : Parsec Unit := do + skipString "EMPTY" <|> skipString "ANY" <|> Mixed <|> children + +/-- https://www.w3.org/TR/xml/#NT-elementdecl -/ +def elementDecl : Parsec Unit := do + skipString " + Name *> + contentspec *> + optional S *> + skipChar '>' + +/-- https://www.w3.org/TR/xml/#NT-StringType -/ +def StringType : Parsec Unit := + skipString "CDATA" + +/-- https://www.w3.org/TR/xml/#NT-TokenizedType -/ +def TokenizedType : Parsec Unit := + skipString "ID" + <|> skipString "IDREF" + <|> skipString "IDREFS" + <|> skipString "ENTITY" + <|> skipString "ENTITIES" + <|> skipString "NMTOKEN" + <|> skipString "NMTOKENS" + +/-- https://www.w3.org/TR/xml/#NT-NotationType -/ +def NotationType : Parsec Unit := do + skipString "NOTATION" + S *> + skipChar '(' <* + optional S + Name *> many (optional S *> skipChar '|' *> optional S *> Name) *> + optional S *> + skipChar ')' + +/-- https://www.w3.org/TR/xml/#NT-Nmtoken -/ +def Nmtoken : Parsec String := do + many1Chars NameChar + +/-- https://www.w3.org/TR/xml/#NT-Enumeration -/ +def Enumeration : Parsec Unit := do + skipChar '(' + optional S *> + Nmtoken *> many (optional S *> skipChar '|' *> optional S *> Nmtoken) *> + optional S *> + skipChar ')' + +/-- https://www.w3.org/TR/xml/#NT-EnumeratedType -/ +def EnumeratedType : Parsec Unit := + NotationType <|> Enumeration + +/-- https://www.w3.org/TR/xml/#NT-AttType -/ +def AttType : Parsec Unit := + StringType <|> TokenizedType <|> EnumeratedType + +def predefinedEntityToChar : String → Option LeanChar +| "lt" => some '<' +| "gt" => some '>' +| "amp" => some '&' +| "apos" => some '\'' +| "quot" => some '"' +| _ => none + +/-- https://www.w3.org/TR/xml/#NT-EntityRef -/ +def EntityRef : Parsec $ Option LeanChar := attempt $ + skipChar '&' *> predefinedEntityToChar <$> Name <* skipChar ';' + +@[inline] +def hexDigitToNat (c : LeanChar) : Nat := + if '0' ≤ c ∧ c ≤ '9' then c.toNat - '0'.toNat + else if 'a' ≤ c ∧ c ≤ 'f' then c.toNat - 'a'.toNat + 10 + else c.toNat - 'A'.toNat + 10 + +def digitsToNat (base : Nat) (digits : Array Nat) : Nat := + digits.foldl (λ r d => r * base + d) 0 + +/-- https://www.w3.org/TR/xml/#NT-CharRef -/ +def CharRef : Parsec LeanChar := do + skipString "&#" + let charCode ← + digitsToNat 10 <$> many1 (hexDigitToNat <$> digit) + <|> skipChar 'x' *> digitsToNat 16 <$> many1 (hexDigitToNat <$> hexDigit) + skipChar ';' + Char.ofNat charCode + +/-- https://www.w3.org/TR/xml/#NT-Reference -/ +def Reference : Parsec $ Option LeanChar := + EntityRef <|> some <$> CharRef + +/-- https://www.w3.org/TR/xml/#NT-AttValue -/ +def AttValue : Parsec String := do + let chars ← + (do + skipChar '"' + many (some <$> satisfy (λ c => c ≠ '<' ∧ c ≠ '&' ∧ c ≠ '"') <|> Reference) <* + skipChar '"') + <|> (do + skipChar '\'' + many (some <$> satisfy (λ c => c ≠ '<' ∧ c ≠ '&' ∧ c ≠ '\'') <|> Reference) <* + skipChar '\'') + return chars.foldl (λ s c => if let some c := c then s.push c else s) "" + +/-- https://www.w3.org/TR/xml/#NT-DefaultDecl -/ +def DefaultDecl : Parsec Unit := + skipString "#REQUIRED" + <|> skipString "#IMPLIED" + <|> optional (skipString "#FIXED" <* S) *> AttValue *> pure () + +/-- https://www.w3.org/TR/xml/#NT-AttDef -/ +def AttDef : Parsec Unit := + S *> Name *> S *> AttType *> S *> DefaultDecl + +/-- https://www.w3.org/TR/xml/#NT-AttlistDecl -/ +def AttlistDecl : Parsec Unit := + skipString " S *> Name *> many AttDef *> optional S *> skipChar '>' + +/-- https://www.w3.org/TR/xml/#NT-PEReference -/ +def PEReference : Parsec Unit := + skipChar '%' *> Name *> skipChar ';' + +/-- https://www.w3.org/TR/xml/#NT-EntityValue -/ +def EntityValue : Parsec String := do + let chars ← + (do + skipChar '"' + many (some <$> satisfy (λ c => c ≠ '%' ∧ c ≠ '&' ∧ c ≠ '"') <|> PEReference *> pure none <|> Reference) <* + skipChar '"') + <|> (do + skipChar '\'' + many (some <$> satisfy (λ c => c ≠ '%' ∧ c ≠ '&' ∧ c ≠ '\'') <|> PEReference *> pure none <|> Reference) <* + skipChar '\'') + return chars.foldl (λ s c => if let some c := c then s.push c else s) "" + + +/-- https://www.w3.org/TR/xml/#NT-NDataDecl -/ +def NDataDecl : Parsec Unit := + S *> skipString "NDATA" <* S <* Name + +/-- https://www.w3.org/TR/xml/#NT-EntityDef -/ +def EntityDef : Parsec Unit := + EntityValue *> pure () <|> (ExternalID <* optional NDataDecl) + +/-- https://www.w3.org/TR/xml/#NT-GEDecl -/ +def GEDecl : Parsec Unit := + skipString " S *> Name *> S *> EntityDef *> optional S *> skipChar '>' + +/-- https://www.w3.org/TR/xml/#NT-PEDef -/ +def PEDef : Parsec Unit := + EntityValue *> pure () <|> ExternalID + +/-- https://www.w3.org/TR/xml/#NT-PEDecl -/ +def PEDecl : Parsec Unit := + skipString " S *> skipChar '%' *> S *> Name *> PEDef *> optional S *> skipChar '>' + +/-- https://www.w3.org/TR/xml/#NT-EntityDecl -/ +def EntityDecl : Parsec Unit := + GEDecl <|> PEDecl + +/-- https://www.w3.org/TR/xml/#NT-PublicID -/ +def PublicID : Parsec Unit := + skipString "PUBLIC" <* S <* PubidLiteral + +/-- https://www.w3.org/TR/xml/#NT-NotationDecl -/ +def NotationDecl : Parsec Unit := + skipString " S *> Name *> (ExternalID <|> PublicID) *> optional S *> skipChar '>' + +/-- https://www.w3.org/TR/xml/#NT-markupdecl -/ +def markupDecl : Parsec Unit := + elementDecl <|> AttlistDecl <|> EntityDecl <|> NotationDecl <|> PI <|> (Comment *> pure ()) + +/-- https://www.w3.org/TR/xml/#NT-DeclSep -/ +def DeclSep : Parsec Unit := + PEReference <|> S *> pure () + +/-- https://www.w3.org/TR/xml/#NT-intSubset -/ +def intSubset : Parsec Unit := + many (markupDecl <|> DeclSep) *> pure () + +/-- https://www.w3.org/TR/xml/#NT-doctypedecl -/ +def doctypedecl : Parsec Unit := do + skipString " + Name *> + optional (S *> ExternalID) *> pure () + <* optional S + optional (skipChar '[' *> intSubset <* skipChar ']' <* optional S) *> + skipChar '>' + +/-- https://www.w3.org/TR/xml/#NT-prolog -/ +def prolog : Parsec Unit := + optional XMLdecl *> + many Misc *> + optional (doctypedecl <* many Misc) *> pure () + +/-- https://www.w3.org/TR/xml/#NT-Attribute -/ +def Attribute : Parsec (String × String) := do + let name ← Name + Eq + let value ← AttValue + (name, value) + +protected def elementPrefix : Parsec (Array Content → Element) := do + skipChar '<' + let name ← Name + let attributes ← many (S *> Attribute) + optional S *> pure () + Element.Element name (Std.RBMap.fromList attributes.toList compare) + +/-- https://www.w3.org/TR/xml/#NT-EmptyElemTag -/ +def EmptyElemTag (elem : Array Content → Element) : Parsec Element := do + skipString "/>" *> pure (elem #[]) + +/-- https://www.w3.org/TR/xml/#NT-STag -/ +def STag (elem : Array Content → Element) : Parsec (Array Content → Element) := do + skipChar '>' *> pure elem + +/-- https://www.w3.org/TR/xml/#NT-ETag -/ +def ETag : Parsec Unit := + skipString " Name *> optional S *> skipChar '>' + +/-- https://www.w3.org/TR/xml/#NT-CDStart -/ +def CDStart : Parsec Unit := + skipString "" + +/-- https://www.w3.org/TR/xml/#NT-CData -/ +def CData : Parsec String := + manyChars (notFollowedBy (skipString "]]>") *> anyChar) + +/-- https://www.w3.org/TR/xml/#NT-CDSect -/ +def CDSect : Parsec String := + CDStart *> CData <* CDEnd + +/-- https://www.w3.org/TR/xml/#NT-CharData -/ +def CharData : Parsec String := + notFollowedBy (skipString "]]>") *> manyChars (satisfy λ c => c ≠ '<' ∧ c ≠ '&') + +mutual + /-- https://www.w3.org/TR/xml/#NT-content -/ + partial def content : Parsec (Array Content) := do + let x ← optional (Content.Character <$> CharData) + let xs ← many do + let y ← + attempt (some <$> Content.Element <$> element) + <|> (do let c ← Reference; c.map (Content.Character ∘ Char.toString)) + <|> some <$> Content.Character <$> CDSect + <|> PI *> none + <|> some <$> Content.Comment <$> Comment + + let z ← optional (Content.Character <$> CharData) + #[y, z] + let xs := #[x] ++ xs.concatMap id |>.filterMap id + let mut res := #[] + for x in xs do + if res.size > 0 then + match res.back, x with + | Content.Character x, Content.Character y => res ← res.set! (res.size - 1) (Content.Character $ x ++ y) + | _, x => res ← res.push x + else res ← res.push x + res + + /-- https://www.w3.org/TR/xml/#NT-element -/ + partial def element : Parsec Element := do + let elem ← Parser.elementPrefix + EmptyElemTag elem <|> STag elem <*> content <* ETag + +end + +/-- https://www.w3.org/TR/xml/#NT-document -/ +def document : Parsec Element := prolog *> element <* many Misc <* eof + +end Parser + +def parse (s : String) : Except String Element := + match Xml.Parser.document s.mkIterator with + | Parsec.ParseResult.success _ res => Except.ok res + | Parsec.ParseResult.error it err => Except.error s!"offset {it.i.repr}: {err}\n{(it.prevn 10).extract it}" + +end Xml