@kha I had to add this attribute because the specializer was generated
many specialization candidates for functions that take `[has_tokens ...]`
as an argument. Moreover, these candidates had a lot of
dependencies. I am trying to workaround this issue by marking the
instances with the new attribute `[nospecialize]`.
I did not mark instances created by `[derive]`. It is quite tedious to
do it.
BTW, when I was investigating the problem I stumbled at `node.view`.
Its type is:
```
node.view :
Π {α : Type} {m : Type → Type} [_inst_1 : monad m] [_inst_2 : monad_except (parsec.message syntax) m]
[_inst_3 : monad_parsec syntax m] [_inst_4 : alternative m] (k : syntax_node_kind) (rs : list (m syntax))
[i : @has_view syntax_node_kind k α], @has_view (m syntax) (@node m _inst_1 _inst_2 _inst_3 _inst_4 k rs) α
```
This looks wrong: the view depends on `[monad_parsec syntax m]`
We should also make sure definitions do not have unnecessary type
class instances. Otherwise, we will put additional stress on the code
specializer. One option is to change the frontend and filter unused
instances.
230 lines
9.5 KiB
Text
230 lines
9.5 KiB
Text
/-
|
||
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
|
||
Released under Apache 2.0 license as described in the file LICENSE.
|
||
Author: Sebastian Ullrich
|
||
|
||
Parser for the Lean language
|
||
-/
|
||
prelude
|
||
import init.lean.parser.parsec init.lean.parser.syntax init.lean.parser.rec
|
||
import init.lean.parser.trie
|
||
import init.lean.parser.identifier init.data.rbmap init.lean.message
|
||
import init.control.coroutine
|
||
|
||
namespace lean
|
||
namespace parser
|
||
|
||
/- Maximum standard precedence. This is the precedence of function application.
|
||
In the standard Lean language, only the token `.` has a left-binding power greater
|
||
than `max_prec` (so that field accesses like `g (h x).f` are parsed as `g ((h x).f)`,
|
||
not `(g (h x)).f`). -/
|
||
def max_prec : nat := 1024
|
||
|
||
structure token_config :=
|
||
(«prefix» : string)
|
||
/- Left-binding power used by the term parser. The term parser operates in the context
|
||
of a right-binding power between 0 (used by parentheses and on the top-level) and
|
||
(usually) `max_prec` (used by function application). After parsing an initial term,
|
||
it continues parsing and expanding that term only when the left-binding power of
|
||
the next token is greater than the current right-binding power. For example, it
|
||
never continues parsing an argument after the initial parse, unless a token with
|
||
lbp > max_prec is encountered. Conversely, the term parser will always continue
|
||
parsing inside parentheses until it finds a token with lbp 0 (such as `)`). -/
|
||
(lbp : nat := 0)
|
||
-- reading a token should not need any state
|
||
/- An optional parser that is activated after matching `prefix`.
|
||
It should return a syntax tree with a "hole" for the
|
||
`source_info` surrounding the token, which will be supplied
|
||
by the `token` parser.
|
||
|
||
Remark: `suffix_parser` has many applications for example for parsing
|
||
hexdecimal numbers, `prefix` is `0x` and `suffix_parser` is the parser `digit*`.
|
||
We also use it to parse string literals: here `prefix` is just `"`.
|
||
-/
|
||
(suffix_parser : option (parsec' (source_info → syntax)) := none)
|
||
|
||
-- Backtrackable state
|
||
structure parser_state :=
|
||
(messages : message_log)
|
||
|
||
structure token_cache_entry :=
|
||
(start_it stop_it : string.iterator)
|
||
(tk : syntax)
|
||
|
||
-- Non-backtrackable state
|
||
structure parser_cache :=
|
||
(token_cache : option token_cache_entry := none)
|
||
|
||
/- Remark: if we have a node in the trie with `some token_config`, the string induced by the path is equal to the `token_config.prefix`. -/
|
||
structure parser_config :=
|
||
(tokens : trie token_config)
|
||
(filename : string)
|
||
|
||
@[derive monad alternative monad_state monad_parsec monad_except]
|
||
def parser_core_t (m : Type → Type) [monad m] :=
|
||
state_t parser_state $ parsec_t syntax $ state_t parser_cache $ m
|
||
|
||
@[derive monad alternative monad_reader monad_state monad_parsec monad_except]
|
||
def parser_t (ρ : Type) (m : Type → Type) [monad m] := reader_t ρ $ parser_core_t m
|
||
@[derive monad alternative monad_reader monad_state monad_parsec monad_except]
|
||
abbreviation basic_parser_m := parser_t parser_config id
|
||
abbreviation basic_parser := basic_parser_m syntax
|
||
abbreviation monad_basic_parser := has_monad_lift_t basic_parser_m
|
||
|
||
section
|
||
local attribute [reducible] parser_t parser_core_t
|
||
def get_cache : basic_parser_m parser_cache :=
|
||
monad_lift (get : state_t parser_cache id _)
|
||
|
||
def put_cache : parser_cache → basic_parser_m punit :=
|
||
λ c, monad_lift (put c : state_t parser_cache id _)
|
||
end
|
||
|
||
-- an arbitrary `parser` type; parsers are usually some monad stack based on `basic_parser_m` returning `syntax`
|
||
variable {ρ : Type}
|
||
|
||
class has_tokens (r : ρ) := mk {} ::
|
||
(tokens : list token_config)
|
||
|
||
@[noinline] def tokens (r : ρ) [has_tokens r] :=
|
||
has_tokens.tokens r
|
||
|
||
instance has_tokens.inhabited (r : ρ) : inhabited (has_tokens r) :=
|
||
⟨⟨[]⟩⟩
|
||
|
||
@[nospecialize]
|
||
instance list.nil.tokens : parser.has_tokens ([] : list ρ) :=
|
||
default _
|
||
|
||
@[nospecialize]
|
||
instance list.cons.tokens (r : ρ) (rs : list ρ) [parser.has_tokens r] [parser.has_tokens rs] :
|
||
parser.has_tokens (r::rs) :=
|
||
⟨tokens r ++ tokens rs⟩
|
||
|
||
class has_view (r : ρ) (α : out_param Type) :=
|
||
(view : syntax → α)
|
||
(review : α → syntax)
|
||
|
||
export has_view (view review)
|
||
|
||
def try_view {α : Type} (k : syntax_node_kind) [has_view k α] (stx : syntax) : option α :=
|
||
if stx.is_of_kind k then some (has_view.view k stx) else none
|
||
|
||
instance has_view.default (r : ρ) : inhabited (parser.has_view r syntax) :=
|
||
⟨{ view := id, review := id }⟩
|
||
|
||
class has_view_default (r : ρ) (α : out_param Type) [has_view r α] (default : α) := mk {}
|
||
|
||
def message_of_parsec_message {μ : Type} (cfg : parser_config) (msg : parsec.message μ) : message :=
|
||
-- FIXME: translate position
|
||
{filename := cfg.filename, pos := ⟨0, 0⟩, text := to_string msg}
|
||
|
||
/-- Run parser stack, returning a partial syntax tree in case of a fatal error -/
|
||
protected def run {m : Type → Type} {α ρ : Type} [monad m] [has_coe_t ρ parser_config] (cfg : ρ) (s : string) (r : parser_t ρ m α) :
|
||
m (sum α syntax × message_log) :=
|
||
do (r, _) ← (((r.run cfg).run {messages:=message_log.empty}).parse s).run {},
|
||
pure $ match r with
|
||
| except.ok (a, st) := (sum.inl a, st.messages)
|
||
| except.error msg := (sum.inr msg.custom, message_log.empty.add (message_of_parsec_message cfg msg))
|
||
|
||
open coroutine
|
||
open monad_parsec
|
||
open parser.has_view
|
||
variables {α : Type} {m : Type → Type}
|
||
local notation `parser` := m syntax
|
||
|
||
def log_message {μ : Type} [monad m] [monad_reader ρ m] [has_lift_t ρ parser_config] [monad_state parser_state m]
|
||
(msg : parsec.message μ) : m unit :=
|
||
do cfg ← read,
|
||
modify (λ st, {st with messages := st.messages.add (message_of_parsec_message ↑cfg msg)})
|
||
|
||
def mk_token_trie (tokens : list token_config) : except string (trie token_config) :=
|
||
do -- the only hardcoded tokens, because they are never directly mentioned by a `parser`
|
||
let builtin_tokens : list token_config := [{«prefix» := "/-"}, {«prefix» := "--"}],
|
||
t ← (builtin_tokens ++ tokens).mfoldl (λ (t : trie token_config) tk,
|
||
match t.find tk.prefix with
|
||
| some tk' := (match tk.lbp, tk'.lbp with
|
||
| l, 0 := pure $ t.insert tk.prefix tk
|
||
| 0, _ := pure t
|
||
| l, l' := if l = l' then pure t else throw $
|
||
"invalid token '" ++ tk.prefix ++ "', has been defined with precedences " ++
|
||
to_string l ++ " and " ++ to_string l')
|
||
| none := pure $ t.insert tk.prefix tk)
|
||
trie.mk,
|
||
pure t
|
||
|
||
structure parse.view_ty :=
|
||
(root : syntax)
|
||
(eoi : syntax)
|
||
|
||
def parse.view : syntax → option parse.view_ty
|
||
| (syntax.node ⟨none, [root, eoi]⟩) := some ⟨root, eoi⟩
|
||
| _ := none
|
||
|
||
|
||
/- Monad stacks used in multiple files -/
|
||
|
||
/- NOTE: We move `rec_t` under `parser_t`'s `reader_t` so that `term_parser`, which does not
|
||
have access to `command_parser`'s ρ (=`command_parser_config`) can still recurse into it
|
||
(for command quotations). This means that the `command_parser_config` will be reset
|
||
on a recursive call to `command.parser`, i.e. it forgets about locally registered parsers,
|
||
but that's not an issue for our intended uses of it. -/
|
||
@[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec]
|
||
def command_parser_m (ρ : Type) := reader_t ρ $ rec_t unit syntax $ parser_core_t id
|
||
|
||
section
|
||
local attribute [reducible] parser_t command_parser_m
|
||
instance command_parser_m.monad_reader_adapter (ρ ρ' : Type) :
|
||
monad_reader_adapter ρ ρ' (command_parser_m ρ) (command_parser_m ρ') :=
|
||
infer_instance
|
||
instance command_parser_m.basic_parser (ρ : Type) [has_lift_t ρ parser_config] : monad_basic_parser (command_parser_m ρ) :=
|
||
⟨λ _ x cfg rec, x.run ↑cfg⟩
|
||
end
|
||
|
||
/- The `nat` at `rec_t` is the lbp` -/
|
||
@[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec monad_basic_parser]
|
||
def term_parser_m := rec_t nat syntax $ command_parser_m parser_config
|
||
abbreviation term_parser := term_parser_m syntax
|
||
|
||
/-- A term parser for a suffix or infix notation that accepts a preceding term. -/
|
||
@[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec monad_basic_parser]
|
||
def trailing_term_parser_m := reader_t syntax term_parser_m
|
||
abbreviation trailing_term_parser := trailing_term_parser_m syntax
|
||
|
||
instance trailing_term_parser_coe : has_coe term_parser trailing_term_parser :=
|
||
⟨λ x _, x⟩
|
||
|
||
local attribute [instance] name.has_lt_quick
|
||
/-- A multimap indexed by tokens. Used for indexing parsers by their leading token. -/
|
||
def token_map (α : Type) := rbmap name (list α) (<)
|
||
|
||
def token_map.insert {α : Type} (map : token_map α) (k : name) (v : α) : token_map α :=
|
||
match map.find k with
|
||
| none := map.insert k [v]
|
||
| some vs := map.insert k (v::vs)
|
||
|
||
def token_map.of_list {α : Type} : list (name × α) → token_map α
|
||
| [] := mk_rbmap _ _ _
|
||
| (⟨k,v⟩::xs) := (token_map.of_list xs).insert k v
|
||
|
||
@[nospecialize]
|
||
instance token_map_nil.tokens : parser.has_tokens $ @token_map.of_list ρ [] :=
|
||
default _
|
||
|
||
@[nospecialize]
|
||
instance token_map_cons.tokens (k : name) (r : ρ) (rs : list (name × ρ)) [parser.has_tokens r] [parser.has_tokens $ token_map.of_list rs] :
|
||
parser.has_tokens $ token_map.of_list ((k,r)::rs) :=
|
||
⟨tokens r ++ tokens (token_map.of_list rs)⟩
|
||
|
||
-- This needs to be a separate structure since `term_parser`s cannot contain themselves in their config
|
||
structure command_parser_config extends parser_config :=
|
||
(leading_term_parsers : token_map term_parser)
|
||
(trailing_term_parsers : token_map trailing_term_parser)
|
||
|
||
instance command_parser_config_coe_parser_config : has_coe command_parser_config parser_config :=
|
||
⟨command_parser_config.to_parser_config⟩
|
||
|
||
abbreviation command_parser := command_parser_m command_parser_config syntax
|
||
|
||
end «parser»
|
||
end lean
|