lean4-htt/library/init/lean/parser/parsec.lean
2018-09-19 12:36:34 -07:00

586 lines
20 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/-
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Leonardo de Moura, Sebastian Ullrich
Implementation for the parsec parser combinators described in the
paper:
https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/parsec-paper-letter.pdf
-/
prelude
import init.data.to_string init.data.string.basic init.data.list.basic init.control.except
import init.data.repr init.lean.name init.data.dlist init.control.monad_fail init.control.combinators
import init.util
namespace lean
namespace parser
open string (iterator)
namespace parsec
@[reducible] def position : Type := nat
structure message (μ : Type := unit) :=
(it : iterator)
(unexpected : string := "") -- unexpected input
(expected : dlist string := dlist.empty) -- expected productions
(custom : μ)
def expected.to_string : list string → string
| [] := ""
| [e] := e
| [e1, e2] := e1 ++ " or " ++ e2
| (e::es) := e ++ ", " ++ expected.to_string es
protected def message.to_string {μ : Type} (msg : message μ) : string :=
let (line, col) := msg.it.to_string.line_column msg.it.offset in
-- always print ":"; we assume at least one of `unexpected` and `expected` to be non-empty
let loc := ["error at line " ++ to_string line ++ ", column " ++ to_string col ++ ":"] in
let unexpected := (if msg.unexpected = "" then [] else ["unexpected " ++ msg.unexpected]) in
let ex_list := msg.expected.to_list in
let expected := if ex_list = [] then [] else ["expected " ++ expected.to_string ex_list] in
"\n".intercalate $ loc ++ unexpected ++ expected
instance {μ : Type} : has_to_string (message μ) :=
⟨message.to_string⟩
-- use for e.g. upcasting parsec errors with `monad_except.lift_except`
instance {μ : Type} : has_lift (message μ) string :=
⟨to_string⟩
/-
Remark: we store expected "error" messages in `ok_eps` results.
They contain the error that would have occurred if a
successful "epsilon" alternative was not taken.
-/
inductive result (μ α : Type)
| ok {} (a : α) (it : iterator) : result
| ok_eps {} (a : α) (it : iterator) (expected : dlist string) : result
| error {} (msg : message μ) (consumed : bool) : result
@[inline] def result.mk_eps {μ α : Type} (a : α) (it : iterator) : result μ α :=
result.ok_eps a it dlist.empty
end parsec
open parsec
def parsec_t (μ : Type) (m : Type → Type) (α : Type) :=
iterator → m (result μ α)
abbreviation parsec (μ : Type) := parsec_t μ id
/-- `parsec` without custom error message type -/
abbreviation parsec' := parsec unit
namespace parsec_t
open parsec.result
variables {m : Type → Type} [monad m] {μ α β : Type}
def run (p : parsec_t μ m α) (s : string) (fname := "") : m (except (message μ) α) :=
do r ← p s.mk_iterator,
pure $ match r with
| ok a _ := except.ok a
| ok_eps a _ _ := except.ok a
| error msg _ := except.error msg
@[inline] protected def pure (a : α) : parsec_t μ m α :=
λ it, pure (mk_eps a it)
def eps : parsec_t μ m unit :=
parsec_t.pure ()
protected def failure [inhabited μ] : parsec_t μ m α :=
λ it, pure (error { unexpected := "failure", it := it, custom := default μ } ff)
def merge (msg₁ msg₂ : message μ) : message μ :=
{ expected := msg₁.expected ++ msg₂.expected, ..msg₁ }
private def bind_1 (r : result μ β) : result μ β :=
match r with
| ok_eps b it msg₂ := ok b it
| error msg ff := error msg tt
| other := other
private def bind_2 (ex₁) (r : result μ β) : result μ β :=
match r with
| ok_eps b it ex₂ := ok_eps b it (ex₁ ++ ex₂)
| error msg₂ ff := error { expected := ex₁ ++ msg₂.expected, .. msg₂ } ff
| other := other
/--
The `bind p q` combinator behaves as follows:
1- If `p` fails, then it fails.
2- If `p` succeeds and consumes input, then execute `q`
3- If `q` succeeds but does not consume input, then execute `q`
and merge error messages if both do not consume any input.
-/
@[inline] protected def bind (p : parsec_t μ m α) (q : α → parsec_t μ m β) : parsec_t μ m β :=
λ it, do r ← p it,
match r with
| ok a it := bind_1 <$> q a it
| ok_eps a it ex₁ := bind_2 ex₁ <$> q a it
| error msg c := pure (error msg c)
instance : monad (parsec_t μ m) :=
{ bind := λ _ _, parsec_t.bind, pure := λ _, parsec_t.pure }
instance : monad_fail parsec' :=
{ fail := λ _ s it, error { unexpected := s, it := it, custom := () } ff }
instance : monad_except (message μ) (parsec_t μ m) :=
{ throw := λ _ msg it, pure (error msg ff),
catch := λ _ p c it, do
r ← p it,
match r with
| error msg cns := do {
r ← c msg msg.it,
pure $ match r with
| error msg' cns' := error msg' (cns || cns')
| other := other }
| other := pure other }
instance : has_monad_lift m (parsec_t μ m) :=
{ monad_lift := λ α x it, do a ← x, pure (mk_eps a it) }
def expect (msg : message μ) (exp : string) : message μ :=
{expected := dlist.singleton exp, ..msg}
@[inline] def labels (p : parsec_t μ m α) (lbls : dlist string) : parsec_t μ m α :=
λ it, do
r ← p it,
pure $ match r with
| ok_eps a it _ := ok_eps a it lbls
| error msg ff := error {expected := lbls, ..msg} ff
| other := other
/--
`try p` behaves like `p`, but it pretends `p` hasn't
consumed any input when `p` fails.
It is useful for implementing infinite lookahead.
The parser `try p <|> q` will try `q` even when
`p` has consumed input.
It is also useful for specifying both the lexer and parser
together.
```
(do try (ch 'l' >> ch 'e' >> ch 't'), whitespace, ...)
<|>
...
```
Without the `try` combinator we will not be able to backtrack on the `let` keyword.
-/
def try (p : parsec_t μ m α) : parsec_t μ m α :=
λ it, do
r ← p it,
pure $ match r with
| error msg _ := error msg ff
| other := other
/--
The `orelse p q` combinator behaves as follows:
1- If `p` succeeds *or* consumes input, return
its result. Otherwise, execute `q` and return its
result.
Recall that the `try p` combinator can be used to
pretend that `p` did not consume any input, and
simulate infinite lookahead.
2- If both `p` and `q` did not consume any input, then
combine their error messages (even if one of
them succeeded).
-/
protected def orelse (p q : parsec_t μ m α) : parsec_t μ m α :=
λ it, do
r ← p it,
match r with
| error msg₁ ff := do {
r ← q it,
pure $ match r with
| ok_eps a it' ex₂ := ok_eps a it' (msg₁.expected ++ ex₂)
| error msg₂ ff := error (merge msg₁ msg₂) ff
| other := other }
| other := pure other
instance [inhabited μ] : alternative (parsec_t μ m) :=
{ orelse := λ _, parsec_t.orelse,
failure := λ _, parsec_t.failure }
/-- Parse `p` without consuming any input. -/
def lookahead (p : parsec_t μ m α) : parsec_t μ m α :=
λ it, do
r ← p it,
pure $ match r with
| ok a s' := mk_eps a it
| other := other
/-- `not_followed_by p` succeeds when parser `p` fails -/
def not_followed_by (p : parsec' α) (msg : string := "input") : parsec' unit :=
λ it, do
r ← p it,
pure $ match r with
| ok _ _ := error { it := it, unexpected := msg, custom := () } ff
| ok_eps _ _ _ := error { it := it, unexpected := msg, custom := () } ff
| error _ _ := mk_eps () it
def dbg (label : string) (p : parsec_t μ m α) : parsec_t μ m α :=
λ it, do
r ← p it,
pure $ trace ("DBG " ++ label ++ ": \"" ++ (it.extract (it.nextn 40)).get_or_else "" ++ "\"") $ match r : _ → result μ α with
| ok a it' := trace ("consumed ok : '" ++ (it.extract it').get_or_else "" ++ "'") $ @ok μ α a it'
| ok_eps a it' ex := trace ("empty ok : '" ++ (it.extract it').get_or_else "" ++ "'") $ @ok_eps μ α a it' ex
| error msg tt := trace ("consumed error : '" ++ (it.extract msg.it).get_or_else "" ++ "'\n" ++ to_string msg) $ @error μ α msg tt
| error msg ff := trace ("empty error : '" ++ (it.extract msg.it).get_or_else "" ++ "'\n" ++ to_string msg) $ @error μ α msg ff
end parsec_t
/- Type class for abstracting from concrete monad stacks containing a `parsec` somewhere. -/
class monad_parsec (μ : out_param Type) (m : Type → Type) :=
-- analogous to e.g. `monad_reader.lift` before simplification (see there)
(lift {} {α : Type} : parsec μ α → m α)
-- Analogous to e.g. `monad_reader_adapter.map` before simplification (see there).
-- Its usage seems to be way too common to justify moving it into a separate type class.
(map {} {α : Type} : (∀ {m'} [monad m'] {α}, parsec_t μ m' α → parsec_t μ m' α) → m α → m α)
/-- `parsec` without custom error message type -/
abbreviation monad_parsec' := monad_parsec unit
variables {μ : Type}
instance {m : Type → Type} [monad m] : monad_parsec μ (parsec_t μ m) :=
{ lift := λ α p it, pure (p it),
map := λ α f x, f x }
instance monad_parsec_trans {m n : Type → Type} [has_monad_lift m n] [monad_functor m m n n] [monad_parsec μ m] : monad_parsec μ n :=
{ lift := λ α p, monad_lift (monad_parsec.lift p : m α),
map := λ α f x, monad_map (λ β x, (monad_parsec.map @f x : m β)) x }
namespace monad_parsec
open parsec_t
variables {m : Type → Type} [monad m] [monad_parsec μ m] [inhabited μ] {α β : Type}
@[inline] def error {α : Type} (unexpected : string) (expected : dlist string := dlist.empty) (it : option iterator := none) (custom : μ := default _) : m α :=
lift $ λ it', result.error { unexpected := unexpected, expected := expected, it := it.get_or_else it', custom := custom } ff
@[inline] def left_over : m iterator :=
lift $ λ it, result.mk_eps it it
/-- Return the number of characters left to be parsed. -/
def remaining : m nat :=
string.iterator.remaining <$> left_over
@[inline] def labels (p : m α) (lbls : dlist string) : m α :=
map (λ m' inst β p, @parsec_t.labels m' inst μ β p lbls) p
@[inline] def label (p : m α) (lbl : string) : m α :=
labels p (dlist.singleton lbl)
infixr ` <?> `:2 := label
@[inline] def hidden (p : m α) : m α :=
labels p dlist.empty
/--
`try p` behaves like `p`, but it pretends `p` hasn't
consumed any input when `p` fails.
It is useful for implementing infinite lookahead.
The parser `try p <|> q` will try `q` even when
`p` has consumed input.
It is also useful for specifying both the lexer and parser
together.
```
(do try (ch 'l' >> ch 'e' >> ch 't'), whitespace, ...)
<|>
...
```
Without the `try` combinator we will not be able to backtrack on the `let` keyword.
-/
@[inline] def try (p : m α) : m α :=
map (λ m' inst β p, @parsec_t.try m' inst μ β p) p
/-- Parse `p` without consuming any input. -/
@[inline] def lookahead (p : m α) : m α :=
map (λ m' inst β p, @parsec_t.lookahead m' inst μ β p) p
/-- Faster version of `not_followed_by (satisfy p)` -/
@[inline] def not_followed_by_sat (p : char → bool) : m unit :=
do it ← left_over,
if !it.has_next then pure ()
else let c := it.curr in
if p c then error (repr c)
else pure ()
@[inline] def eoi_error (it : iterator) : result μ α :=
result.error { it := it, unexpected := "end of input", custom := default _ } ff
def curr : m char :=
string.iterator.curr <$> left_over
@[inline] def cond (p : char → bool) (t : m α) (e : m α) : m α :=
mcond (p <$> curr) t e
/--
If the next character `c` satisfies `p`, then
update position and return `c`. Otherwise,
generate error message with current position and character. -/
@[inline] def satisfy (p : char → bool) : m char :=
do it ← left_over,
if !it.has_next then error "end of input"
else let c := it.curr in
if p c then lift $ λ _, result.ok c it.next
else error (repr c)
def ch (c : char) : m char :=
satisfy (= c)
def alpha : m char :=
satisfy char.is_alpha
def digit : m char :=
satisfy char.is_digit
def upper : m char :=
satisfy char.is_upper
def lower : m char :=
satisfy char.is_lower
def any : m char :=
satisfy (λ _, true)
private def str_aux : nat → iterator → iterator → option iterator
| 0 _ it := some it
| (n+1) s_it it :=
if it.has_next ∧ s_it.curr = it.curr then str_aux n s_it.next it.next
else none
/--
`str s` parses a sequence of elements that match `s`. Returns the parsed string (i.e. `s`).
This parser consumes no input if it fails (even if a partial match).
Note: The behaviour of this parser is different to that the `string` parser in the Parsec_t Μ M Haskell library,
as this one is all-or-nothing.
-/
def str (s : string) : m string :=
if s.is_empty then pure ""
else lift $ λ it, match str_aux s.length s.mk_iterator it with
| some it' := result.ok s it'
| none := result.error { it := it, expected := dlist.singleton (repr s), custom := default μ } ff
private def take_aux : nat → string → iterator → result μ string
| 0 r it := result.ok r it
| (n+1) r it :=
if !it.has_next then eoi_error it
else take_aux n (r.push (it.curr)) it.next
/-- Consume `n` characters. -/
def take (n : nat) : m string :=
if n = 0 then pure ""
else lift $ take_aux n ""
@[inline] private def mk_string_result (r : string) (it : iterator) : result μ string :=
if r.is_empty then result.mk_eps r it
else result.ok r it
private def take_while_aux (p : char → bool) : nat → string → iterator → result μ string
| 0 r it := mk_string_result r it
| (n+1) r it :=
if !it.has_next then mk_string_result r it
else let c := it.curr in
if p c then take_while_aux n (r.push c) it.next
else mk_string_result r it
/--
Consume input as long as the predicate returns `tt`, and return the consumed input.
This parser does not fail. It will return an empty string if the predicate
returns `ff` on the current character. -/
def take_while (p : char → bool) : m string :=
lift $ λ it, take_while_aux p it.remaining "" it
def take_while_cont (p : char → bool) (ini : string) : m string :=
lift $ λ it, take_while_aux p it.remaining ini it
/--
Consume input as long as the predicate returns `tt`, and return the consumed input.
This parser requires the predicate to succeed on at least once. -/
def take_while1 (p : char → bool) : m string :=
do c ← satisfy p,
take_while_cont p (to_string c)
/--
Consume input as long as the predicate returns `ff` (i.e. until it returns `tt`), and return the consumed input.
This parser does not fail. -/
def take_until (p : char → bool) : m string :=
take_while (λ c, !p c)
def take_until1 (p : char → bool) : m string :=
take_while1 (λ c, !p c)
@[inline] private def mk_consumed_result (consumed : bool) (it : iterator) : result μ unit :=
if consumed then result.ok () it
else result.mk_eps () it
private def take_while_aux' (p : char → bool) : nat → bool → iterator → result μ unit
| 0 consumed it := mk_consumed_result consumed it
| (n+1) consumed it :=
if !it.has_next then mk_consumed_result consumed it
else let c := it.curr in
if p c then take_while_aux' n tt it.next
else mk_consumed_result consumed it
/-- Similar to `take_while` but it does not return the consumed input. -/
def take_while' (p : char → bool) : m unit :=
lift $ λ it, take_while_aux' p it.remaining ff it
/-- Similar to `take_while1` but it does not return the consumed input. -/
def take_while1' (p : char → bool) : m unit :=
satisfy p *> take_while' p
/-- Consume zero or more whitespaces. -/
def whitespace : m unit :=
take_while' char.is_whitespace
/-- Shorthand for `p <* whitespace` -/
def lexeme (p : m α) : m α :=
p <* whitespace
/-- Parse a numeral in decimal. -/
def num : m nat :=
string.to_nat <$> (take_while1 char.is_digit)
/-- Succeed only if there are at least `n` characters left. -/
def ensure (n : nat) : m unit :=
do it ← left_over,
if n ≤ it.remaining then pure ()
else error "end of input" (dlist.singleton ("at least " ++ to_string n ++ " characters"))
/-- Return the current position. -/
def pos : m position :=
string.iterator.offset <$> left_over
@[inline] def not_followed_by [monad_except (message μ) m] (p : m α) (msg : string := "input") : m unit :=
do it ← left_over,
b ← lookahead $ catch (p *> pure ff) (λ _, pure tt),
if b then pure () else error msg dlist.empty it
def eoi : m unit :=
do it ← left_over,
if it.remaining = 0 then pure ()
else error (repr it.curr) (dlist.singleton ("end of input"))
def many1_aux [alternative m] (p : m α) : nat → m (list α)
| 0 := do a ← p, pure [a]
| (n+1) := do a ← p,
as ← (many1_aux n <|> pure []),
pure (a::as)
def many1 [alternative m] (p : m α) : m (list α) :=
do r ← remaining, many1_aux p r
def many [alternative m] (p : m α) : m (list α) :=
many1 p <|> pure []
def many1_aux' [alternative m] (p : m α) : nat → m unit
| 0 := p *> pure ()
| (n+1) := p *> (many1_aux' n <|> pure ())
def many1' [alternative m] (p : m α) : m unit :=
do r ← remaining, many1_aux' p r
def many' [alternative m] (p : m α) : m unit :=
many1' p <|> pure ()
def sep_by1 [alternative m] (p : m α) (sep : m β) : m (list α) :=
(::) <$> p <*> many (sep *> p)
def sep_by [alternative m] (p : m α) (sep : m β) : m (list α) :=
sep_by1 p sep <|> pure []
def fix_aux [alternative m] (f : m α → m α) : nat → m α
| 0 := error "fix_aux: no progress"
| (n+1) := f (fix_aux n)
def fix [alternative m] (f : m α → m α) : m α :=
do n ← remaining, fix_aux f (n+1)
def foldr_aux [alternative m] (f : α → β → β) (p : m α) (b : β) : nat → m β
| 0 := pure b
| (n+1) := (f <$> p <*> foldr_aux n) <|> pure b
/-- Matches zero or more occurrences of `p`, and folds the result. -/
def foldr [alternative m] (f : α → β → β) (p : m α) (b : β) : m β :=
do it ← left_over,
foldr_aux f p b it.remaining
def foldl_aux [alternative m] (f : α → β → α) (p : m β) : α → nat → m α
| a 0 := pure a
| a (n+1) := (do x ← p, foldl_aux (f a x) n) <|> pure a
/-- Matches zero or more occurrences of `p`, and folds the result. -/
def foldl [alternative m] (f : α → β → α) (a : α) (p : m β) : m α :=
do it ← left_over,
foldl_aux f p a it.remaining
def unexpected (msg : string) : m α :=
error msg
def unexpected_at (msg : string) (it : iterator) : m α :=
error msg dlist.empty it
/- Execute all parsers in `ps` and return the result of the longest parse(s) if any,
or else the result of the furthest error. If there are two parses of
equal length, the first parse wins. -/
def longest_match [monad_except (message μ) m] (ps : list (m α)) : m (list α) :=
do it ← left_over,
r ← ps.mfoldr (λ p (r : result μ (list α)),
lookahead $ catch
(do
a ← p,
it ← left_over,
pure $ match r with
| result.ok as it' := if it'.offset > it.offset then r
else if it.offset > it'.offset then result.ok [a] it
else result.ok (a::as) it
| _ := result.ok [a] it)
(λ msg, pure $ match r with
| result.error msg' _ := if msg.it.offset > msg.it.offset then r
else if msg.it.offset > msg'.it.offset then result.error msg tt
else result.error (merge msg msg') (msg.it.offset > it.offset)
| _ := r))
((error "longest_match: empty list" : parsec _ _) it),
lift $ λ _, r
/-- Add trace information about `p`'s input and output. -/
def dbg (label : string) (p : m α) : m α :=
map (λ m' inst β, @parsec_t.dbg m' inst μ β label) p
def observing [monad_except (message μ) m] (p : m α) : m (except (message μ) α) :=
catch (except.ok <$> p) $ λ msg, pure (except.error msg)
end monad_parsec
namespace monad_parsec
open parsec_t
variables {m : Type → Type} [monad m] [monad_parsec unit m] {α β : Type}
end monad_parsec
namespace parsec_t
open monad_parsec
variables {m : Type → Type} [monad m] {α β : Type}
def parse (p : parsec_t μ m α) (s : string) (fname := "") : m (except (message μ) α) :=
run p s fname
def parse_with_eoi [inhabited μ] (p : parsec_t μ m α) (s : string) (fname := "") : m (except (message μ) α) :=
run (p <* eoi) s fname
def parse_with_left_over [inhabited μ] (p : parsec_t μ m α) (s : string) (fname := "") : m (except (message μ) (α × iterator)) :=
run (prod.mk <$> p <*> left_over) s fname
end parsec_t
def parsec.parse {α : Type} (p : parsec μ α) (s : string) (fname := "") : except (message μ) α :=
parsec_t.parse p s fname
end parser
end lean