352 lines
14 KiB
Text
352 lines
14 KiB
Text
/-
|
||
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
|
||
Released under Apache 2.0 license as described in the file LICENSE.
|
||
Author: Sebastian Ullrich
|
||
|
||
Parser for the Lean language
|
||
-/
|
||
prelude
|
||
import init.lean.parser.parsec init.lean.parser.syntax
|
||
import init.lean.parser.identifier init.data.rbmap init.lean.message
|
||
|
||
/-- A small wrapper of `reader_t` that simplifies introducing and invoking
|
||
recursion points in a computation. -/
|
||
-- TODO(Sebastian): move?
|
||
def rec_t (α δ : Type) (m : Type → Type) (β : Type) :=
|
||
reader_t (α → m δ) m β
|
||
|
||
namespace rec_t
|
||
variables {m : Type → Type} {α δ β : Type} [monad m]
|
||
local attribute [reducible] rec_t
|
||
|
||
/-- Continue at the recursion point stored at `run`. -/
|
||
def recurse (a : α) : rec_t α δ m δ :=
|
||
do x ← read,
|
||
monad_lift (x a)
|
||
|
||
variables (base : α → m δ) (rec : α → rec_t α δ m δ)
|
||
private def run_aux : nat → α → m δ
|
||
| 0 := base
|
||
| (n+1) := λ a, (rec a).run (run_aux n)
|
||
|
||
/-- Execute `rec a`, re-executing it whenever `recurse` (with a new `a`) is called.
|
||
After `max_rec` recursion steps, `base` is executed instead. -/
|
||
protected def run (a : α) (max_rec := 1000) : m δ :=
|
||
(rec a).run (run_aux base rec max_rec)
|
||
|
||
-- not clear how to auto-derive these given the additional constraints
|
||
instance : monad (rec_t α δ m) := infer_instance
|
||
instance [alternative m] : alternative (rec_t α δ m) := infer_instance
|
||
instance : has_monad_lift m (rec_t α δ m) := infer_instance
|
||
instance (ε) [monad_except ε m] : monad_except ε (rec_t α δ m) := infer_instance
|
||
instance (μ) [alternative m] [lean.parser.monad_parsec μ m] : lean.parser.monad_parsec μ (rec_t α δ m) :=
|
||
infer_instance
|
||
end rec_t
|
||
|
||
class monad_rec (α δ : out_param Type) (m : Type → Type) :=
|
||
(recurse {} : α → m δ)
|
||
export monad_rec (recurse)
|
||
|
||
instance monad_rec.trans (α δ m m') [has_monad_lift m m'] [monad_rec α δ m] [monad m] : monad_rec α δ m' :=
|
||
{ recurse := λ a, monad_lift (recurse a : m δ) }
|
||
|
||
instance monad_rec.base (α δ m) [monad m] : monad_rec α δ (rec_t α δ m) :=
|
||
{ recurse := rec_t.recurse }
|
||
|
||
namespace lean
|
||
namespace parser
|
||
|
||
inductive trie (α : Type)
|
||
| node : option α → rbnode (char × trie) → trie
|
||
|
||
namespace trie
|
||
variables {α : Type}
|
||
|
||
protected def mk : trie α :=
|
||
⟨none, rbnode.leaf⟩
|
||
|
||
private def insert_aux (val : α) : nat → trie α → string.iterator → trie α
|
||
| 0 (trie.node _ map) _ := trie.node (some val) map -- NOTE: overrides old value
|
||
| (n+1) (trie.node val map) it :=
|
||
let t' := (rbmap_core.find (<) map it.curr).get_or_else trie.mk in
|
||
trie.node val (rbmap_core.insert (<) map it.curr (insert_aux n t' it.next))
|
||
|
||
def insert (t : trie α) (s : string) (val : α) : trie α :=
|
||
insert_aux val s.length t s.mk_iterator
|
||
|
||
private def match_prefix_aux : nat → trie α → string.iterator → option (string.iterator × α) → option (string.iterator × α)
|
||
| 0 (trie.node val map) it acc := prod.mk it <$> val <|> acc
|
||
| (n+1) (trie.node val map) it acc :=
|
||
let acc' := prod.mk it <$> val <|> acc in
|
||
match rbmap_core.find (<) map it.curr with
|
||
| some t := match_prefix_aux n t it.next acc'
|
||
| none := acc'
|
||
|
||
def match_prefix {α : Type} (t : trie α) (it : string.iterator) : option (string.iterator × α) :=
|
||
match_prefix_aux it.remaining t it none
|
||
end trie
|
||
|
||
/- Maximum standard precedence. This is the precedence of function application.
|
||
In the standard Lean language, only the token `.` has a left-binding power greater
|
||
than `max_prec` (so that field accesses like `g (h x).f` are parsed as `g ((h x).f)`,
|
||
not `(g (h x)).f`). -/
|
||
def max_prec : nat := 1024
|
||
|
||
structure token_config :=
|
||
(«prefix» : string)
|
||
/- Left-binding power used by the term parser. The term parser operates in the context
|
||
of a right-binding power between 0 (used by parentheses and on the top-level) and
|
||
(usually) `max_prec` (used by function application). After parsing an initial term,
|
||
it continues parsing and expanding that term only when the left-binding power of
|
||
the next token is greater than the current right-binding power. For example, it
|
||
never continues parsing an argument after the initial parse, unless a token with
|
||
lbp > max_prec is encountered. Conversely, the term parser will always continue
|
||
parsing inside parentheses until it finds a token with lbp 0 (such as `)`). -/
|
||
(lbp : nat)
|
||
-- reading a token should not need any state
|
||
/- An optional parser that is activated after matching `prefix`.
|
||
It should return a syntax tree with a "hole" for the
|
||
`source_info` surrounding the token, which will be supplied
|
||
by the `token` parser. -/
|
||
(token_parser : option (parsec' (source_info → syntax)) := none)
|
||
|
||
structure parser_state :=
|
||
(tokens : trie token_config)
|
||
(messages : message_log)
|
||
/- Start position of the current token. This might not be equal to the parser
|
||
position for two reasons:
|
||
* We plan to eagerly parse leading whitespace so as not to do so multiple times
|
||
* During error recovery, skipped input should be associated to the next token -/
|
||
(token_start : string.iterator)
|
||
|
||
structure parser_config :=
|
||
(filename : string)
|
||
|
||
@[derive monad alternative monad_reader monad_state monad_parsec monad_except]
|
||
def parser_t (m : Type → Type) [monad m] := reader_t parser_config $ state_t parser_state $ parsec_t syntax m
|
||
abbreviation basic_parser_m := parser_t id
|
||
abbreviation basic_parser := basic_parser_m syntax
|
||
|
||
-- an arbitrary `parser` type; parsers are usually some monad stack based on `basic_parser_m` returning `syntax`
|
||
variable {ρ : Type}
|
||
|
||
class has_tokens (r : ρ) := mk {} ::
|
||
(tokens : list token_config)
|
||
|
||
open parser.has_tokens (tokens)
|
||
|
||
instance list.nil.tokens : parser.has_tokens ([] : list ρ) :=
|
||
⟨[]⟩
|
||
instance list.cons.tokens (r : ρ) (rs : list ρ) [parser.has_tokens r] [parser.has_tokens rs] :
|
||
parser.has_tokens (r::rs) :=
|
||
⟨tokens r ++ tokens rs⟩
|
||
|
||
protected class has_view (r : ρ) (α : out_param Type) :=
|
||
(view : syntax → option α)
|
||
(review : α → syntax)
|
||
|
||
instance has_view.default (r : ρ) : inhabited (parser.has_view r syntax) :=
|
||
⟨{ view := some, review := id }⟩
|
||
|
||
class syntax_node_kind.has_view (k : syntax_node_kind) (α : out_param Type) :=
|
||
(view : syntax → option α)
|
||
(review : α → syntax)
|
||
|
||
def message_of_parsec_message {μ : Type} (cfg : parser_config) (msg : parsec.message μ) : message :=
|
||
-- FIXME: translate position
|
||
{filename := cfg.filename, pos := ⟨0, 0⟩, text := to_string msg}
|
||
|
||
section
|
||
local attribute [reducible] parser_t
|
||
protected def run {m : Type → Type} [monad m] (cfg : parser_config) (st : parser_state) (s : string) (r : parser_t m syntax) :
|
||
m (syntax × message_log) :=
|
||
do r ← ((r.run cfg).run st).parse_with_eoi s,
|
||
pure $ match r with
|
||
| except.ok (a, st) := (a, st.messages)
|
||
| except.error msg := (msg.custom, message_log.empty.add (message_of_parsec_message cfg msg))
|
||
end
|
||
|
||
open monad_parsec
|
||
open parser.has_view
|
||
variables {α : Type} {m : Type → Type}
|
||
local notation `parser` := m syntax
|
||
|
||
def log_message {μ : Type} [monad m] [monad_reader parser_config m] [monad_state parser_state m] (msg : parsec.message μ) : m unit :=
|
||
do cfg ← read,
|
||
modify (λ st, {st with messages := st.messages.add (message_of_parsec_message cfg msg)})
|
||
|
||
def eoi : syntax_node_kind := ⟨`lean.parser.parser.eoi⟩
|
||
|
||
protected def parse [monad m] (cfg : parser_config) (s : string) (r : parser_t m syntax) [parser.has_tokens r] :
|
||
m (syntax × message_log) :=
|
||
-- the only hardcoded tokens, because they are never directly mentioned by a `parser`
|
||
let builtin_tokens : list token_config := [⟨"/-", 0, none⟩, ⟨"--", 0, none⟩] in
|
||
let trie := (tokens r ++ builtin_tokens).foldl (λ t cfg, trie.insert t cfg.prefix cfg) trie.mk in
|
||
parser.run cfg ⟨trie, message_log.empty, s.mk_iterator⟩ s $ do
|
||
stx ← catch r $ λ (msg : parsec.message _), do {
|
||
modify $ λ st, {st with token_start := msg.it},
|
||
parser.log_message msg,
|
||
pure msg.custom
|
||
},
|
||
whitespace,
|
||
-- add `eoi` node and store any residual input in its prefix
|
||
catch monad_parsec.eoi parser.log_message,
|
||
tk_start ← parser_state.token_start <$> get,
|
||
let stop := tk_start.to_end in
|
||
pure $ syntax.node ⟨none, [
|
||
stx,
|
||
syntax.node ⟨eoi, [syntax.atom ⟨some ⟨⟨tk_start, stop⟩, stop.offset, ⟨stop, stop⟩⟩, atomic_val.string ""⟩]⟩
|
||
]⟩
|
||
|
||
structure parse.view_ty :=
|
||
(root : syntax)
|
||
(eoi : syntax)
|
||
|
||
def parse.view : syntax → option parse.view_ty
|
||
| (syntax.node ⟨none, [root, eoi]⟩) := some ⟨root, eoi⟩
|
||
| _ := none
|
||
|
||
namespace combinators
|
||
variables [monad m] [monad_except (parsec.message syntax) m] [monad_parsec syntax m] [alternative m]
|
||
|
||
def node' (k : option syntax_node_kind) (rs : list parser) : parser :=
|
||
do (args, _) ← rs.mfoldl (λ (p : list syntax × nat) r, do
|
||
(args, remaining) ← pure p,
|
||
-- on error, append partial syntax tree and `missing` objects to previous successful parses and rethrow
|
||
a ← catch r $ λ msg,
|
||
let args := list.repeat syntax.missing (remaining-1) ++ msg.custom :: args in
|
||
throw {msg with custom := syntax.node ⟨k, args.reverse⟩},
|
||
pure (a::args, remaining - 1)
|
||
) ([], rs.length),
|
||
pure $ syntax.node ⟨k, args.reverse⟩
|
||
|
||
@[reducible] def seq : list parser → parser := node' none
|
||
@[reducible] def node (k : syntax_node_kind) : list parser → parser := node' k
|
||
|
||
instance node'.tokens (k) (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (node' k rs) :=
|
||
⟨tokens rs⟩
|
||
|
||
instance node.view (k) (rs : list parser) [i : syntax_node_kind.has_view k α] : parser.has_view (node k rs) α :=
|
||
{ view := i.view, review := i.review }
|
||
|
||
private def many1_aux (p : parser) : list syntax → nat → parser
|
||
| as 0 := error "unreachable"
|
||
| as (n+1) := do a ← catch p (λ msg, throw {msg with custom :=
|
||
-- append `syntax.missing` to make clear that list is incomplete
|
||
syntax.node ⟨none, (syntax.missing::msg.custom::as).reverse⟩}),
|
||
many1_aux (a::as) n <|> pure (syntax.node ⟨none, (a::as).reverse⟩)
|
||
|
||
def many1 (r : parser) : parser :=
|
||
do rem ← remaining, many1_aux r [] (rem+1)
|
||
|
||
instance many1.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (many1 r) :=
|
||
⟨tokens r⟩
|
||
|
||
instance many1.view (r : parser) [parser.has_view r α] : parser.has_view (many1 r) (list α) :=
|
||
{ view := λ stx, match stx with
|
||
| syntax.missing := list.ret <$> view r syntax.missing
|
||
| syntax.node ⟨none, stxs⟩ := stxs.mmap (view r)
|
||
| _ := failure,
|
||
review := λ as, syntax.node ⟨none, as.map (review r)⟩ }
|
||
|
||
def many (r : parser) : parser :=
|
||
many1 r <|> pure (syntax.node ⟨none, []⟩)
|
||
|
||
instance many.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (many r) :=
|
||
⟨tokens r⟩
|
||
|
||
instance many.view (r : parser) [has_view r α] : parser.has_view (many r) (list α) :=
|
||
{..many1.view r}
|
||
|
||
def optional (r : parser) : parser :=
|
||
do r ← optional $
|
||
-- on error, wrap in "some"
|
||
catch r (λ msg, throw {msg with custom := syntax.node ⟨none, [msg.custom]⟩}),
|
||
pure $ match r with
|
||
| some r := syntax.node ⟨none, [r]⟩
|
||
| none := syntax.node ⟨none, []⟩
|
||
|
||
instance optional.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (optional r) :=
|
||
⟨tokens r⟩
|
||
|
||
inductive optional_view (α : Type)
|
||
| some (a : α) : optional_view
|
||
| none {} : optional_view
|
||
| missing {} : optional_view
|
||
|
||
namespace optional_view
|
||
instance : functor optional_view :=
|
||
{ map := λ _ _ f v, match v with
|
||
| some a := some (f a)
|
||
| none := none
|
||
| missing := missing }
|
||
end optional_view
|
||
|
||
instance optional.view (r : parser) [parser.has_view r α] : parser.has_view (optional r) (optional_view α) :=
|
||
{ view := λ stx, match stx with
|
||
| syntax.missing := pure optional_view.missing
|
||
| syntax.node ⟨none, []⟩ := pure optional_view.none
|
||
| syntax.node ⟨none, [stx]⟩ := optional_view.some <$> view r stx
|
||
| _ := failure,
|
||
review := λ a, match a with
|
||
| optional_view.some a := syntax.node ⟨none, [review r a]⟩
|
||
| optional_view.none := syntax.node ⟨none, []⟩
|
||
| optional_view.missing := syntax.missing }
|
||
|
||
/-- Parse a list `[p1, ..., pn]` of parsers as `p1 <|> ... <|> pn`.
|
||
Note that there is NO explicit encoding of which parser was chosen;
|
||
parsers should instead produce distinct node names for disambiguation. -/
|
||
def any_of (rs : list parser) : parser :=
|
||
match rs with
|
||
| [] := error "any_of"
|
||
| (r::rs) := rs.foldl (<|>) r
|
||
|
||
instance any_of.tokens (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (any_of rs) :=
|
||
⟨tokens rs⟩
|
||
|
||
instance any_of.view (rs : list parser) : parser.has_view (any_of rs) syntax := default _
|
||
|
||
/-- Parse a list `[p1, ..., pn]` of parsers as `p1 <|> ... <|> pn`.
|
||
The result will be wrapped in a node with the the index of the successful
|
||
parser as the name. -/
|
||
def choice (rs : list parser) : parser :=
|
||
rs.enum.foldr
|
||
(λ ⟨i, r⟩ r', (λ stx, syntax.node ⟨some ⟨name.mk_numeral name.anonymous i⟩, [stx]⟩) <$> r <|> r')
|
||
-- use `foldr` so that any other error is preferred over this one
|
||
(error "choice: empty list")
|
||
|
||
instance choice.tokens (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (choice rs) :=
|
||
⟨tokens rs⟩
|
||
|
||
instance try.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (try r) :=
|
||
⟨tokens r⟩
|
||
instance try.view (r : parser) [i : parser.has_view r α] : parser.has_view (try r) α :=
|
||
{..i}
|
||
|
||
instance label.tokens (r : parser) (l) [parser.has_tokens r] : parser.has_tokens (label r l) :=
|
||
⟨tokens r⟩
|
||
instance label.view (r : parser) (l) [i : parser.has_view r α] : parser.has_view (label r l) α :=
|
||
{..i}
|
||
|
||
instance dbg.tokens (r : parser) (l) [parser.has_tokens r] : parser.has_tokens (dbg l r) :=
|
||
⟨tokens r⟩
|
||
instance dbg.view (r : parser) (l) [i : parser.has_view r α] : parser.has_view (dbg l r) α :=
|
||
{..i}
|
||
|
||
instance recurse.tokens (α δ m a) [monad_rec α δ m] : parser.has_tokens (recurse a : m δ) :=
|
||
⟨[]⟩ -- recursive use should not contribute any new tokens
|
||
instance recurse.view (α δ m a) [monad_rec α δ m] : parser.has_view (recurse a : m δ) syntax := default _
|
||
|
||
def with_recurse {α : Type} (init : α) (r : α → rec_t α syntax m syntax) : parser :=
|
||
rec_t.run (λ _, error "recursion limit") r init
|
||
|
||
instance monad_lift.tokens {m' : Type → Type} [has_monad_lift_t m m'] (r : m syntax) [parser.has_tokens r] :
|
||
parser.has_tokens (monad_lift r : m' syntax) :=
|
||
⟨tokens r⟩
|
||
instance monad_lift.view {m' : Type → Type} [has_monad_lift_t m m'] (r : m syntax) [i : parser.has_view r α] :
|
||
parser.has_view (monad_lift r : m' syntax) α :=
|
||
{..i}
|
||
|
||
end combinators
|
||
end «parser»
|
||
end lean
|