@kha This one was crazy, the compiler created a join point for the continuation of the match-expression. Each case of the match was invoking the join point with a different parser. Two of the branches were the partially applied `reader_t.bind` and `reader_t.orelse`. This change did not improve the performance much, but it makes sure we don't waste time trying to figure out why we have these two partial applications in the call graph.
357 lines
13 KiB
Text
357 lines
13 KiB
Text
/-
|
||
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
|
||
Released under Apache 2.0 license as described in the file LICENSE.
|
||
Author: Sebastian Ullrich
|
||
|
||
Tokenizer for the Lean language
|
||
|
||
Even though our parser architecture does not statically depend on a tokenizer but works directly on
|
||
the input string, we still use a "tokenizer" parser in the Lean parser in some circumstances:
|
||
* to distinguish between identifiers and keywords
|
||
* for error recovery: advance until next command token
|
||
* ...?
|
||
-/
|
||
|
||
prelude
|
||
import init.lean.parser.combinators init.lean.parser.string_literal
|
||
|
||
namespace lean
|
||
namespace parser
|
||
open monad_parsec combinators string has_view
|
||
|
||
def match_token : basic_parser_m (option token_config) :=
|
||
do cfg ← read,
|
||
it ← left_over,
|
||
pure $ prod.snd <$> cfg.tokens.match_prefix it
|
||
|
||
private def finish_comment_block_aux : nat → nat → basic_parser_m unit
|
||
| nesting (n+1) :=
|
||
str "/-" *> finish_comment_block_aux (nesting + 1) n
|
||
<|>
|
||
str "-/" *> (if nesting = 1 then pure () else finish_comment_block_aux (nesting - 1) n)
|
||
<|>
|
||
any *> finish_comment_block_aux nesting n
|
||
| _ _ := error "unreachable"
|
||
|
||
def finish_comment_block (nesting := 1) : basic_parser_m unit :=
|
||
do r ← remaining,
|
||
finish_comment_block_aux nesting (r+1) <?> "end of comment block"
|
||
|
||
private def whitespace_aux : nat → basic_parser_m unit
|
||
| (n+1) :=
|
||
do whitespace,
|
||
str "--" *> take_while' (≠ '\n') *> whitespace_aux n
|
||
<|>
|
||
-- a "/--" doc comment is an actual token, not whitespace
|
||
try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n
|
||
<|>
|
||
pure ()
|
||
| 0 := error "unreachable"
|
||
|
||
variables {m : Type → Type}
|
||
local notation `parser` := m syntax
|
||
local notation `lift` := @monad_lift basic_parser_m _ _ _
|
||
|
||
/-- Skip whitespace and comments. -/
|
||
def whitespace : basic_parser_m unit :=
|
||
hidden $ do
|
||
start ← left_over,
|
||
-- every `whitespace_aux` loop reads at least one char
|
||
whitespace_aux (start.remaining+1)
|
||
|
||
section
|
||
variables [monad m] [monad_parsec syntax m]
|
||
|
||
@[inline] def as_substring {α : Type} (p : m α) : m substring :=
|
||
do start ← left_over,
|
||
p,
|
||
stop ← left_over,
|
||
pure ⟨start, stop⟩
|
||
|
||
variables [monad_basic_parser m]
|
||
|
||
private mutual def update_trailing, update_trailing_lst
|
||
with update_trailing : substring → syntax → syntax
|
||
| trailing (syntax.atom a@⟨some info, _⟩) := syntax.atom {a with info := some {info with trailing := trailing}}
|
||
| trailing (syntax.raw_node n) := syntax.raw_node {n with args := update_trailing_lst trailing n.args}
|
||
| trailing stx := stx
|
||
with update_trailing_lst : substring → list syntax → list syntax
|
||
| trailing [] := []
|
||
| trailing [stx] := [update_trailing trailing stx]
|
||
| trailing (stx::stxs) := stx :: update_trailing_lst trailing stxs
|
||
|
||
def with_trailing (stx : syntax) : m syntax :=
|
||
do -- TODO(Sebastian): less greedy, more natural whitespace assignment
|
||
-- E.g. only read up to the next line break
|
||
trailing ← lift $ as_substring $ whitespace,
|
||
pure $ update_trailing trailing stx
|
||
|
||
def mk_raw_res (start stop : string.iterator) : syntax :=
|
||
let ss : substring := ⟨start, stop⟩ in
|
||
syntax.atom ⟨some {leading := ⟨start, start⟩, pos := start.offset, trailing := ⟨stop, stop⟩}, ss.to_string⟩
|
||
|
||
/-- Match an arbitrary parser and return the consumed string in a `syntax.atom`. -/
|
||
@[inline] def raw {α : Type} (p : m α) (trailing_ws := ff) : parser :=
|
||
try $ do
|
||
start ← left_over,
|
||
p,
|
||
stop ← left_over,
|
||
let stx := mk_raw_res start stop,
|
||
if trailing_ws then with_trailing stx else pure stx
|
||
|
||
instance raw.tokens {α} (p : m α) (t) : parser.has_tokens (raw p t : parser) := default _
|
||
instance raw.view {α} (p : m α) (t) : parser.has_view (option syntax_atom) (raw p t : parser) :=
|
||
{ view := λ stx, match stx with
|
||
| syntax.atom atom := some atom
|
||
| _ := none,
|
||
review := λ a, (syntax.atom <$> a).get_or_else syntax.missing }
|
||
|
||
/-- Like `raw (str s)`, but default to `s` in views. -/
|
||
@[inline, derive has_tokens has_view]
|
||
def raw_str (s : string) (trailing_ws := ff) : parser :=
|
||
raw (str s) trailing_ws
|
||
|
||
instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom) (some {val := s}) :=
|
||
⟨⟩
|
||
|
||
end
|
||
|
||
set_option class.instance_max_depth 200
|
||
|
||
@[derive has_tokens has_view]
|
||
def ident_part.parser : basic_parser_m syntax :=
|
||
node_choice! ident_part {
|
||
escaped: node! ident_part_escaped [
|
||
esc_begin: raw_str id_begin_escape.to_string,
|
||
escaped: raw $ take_until1 is_id_end_escape,
|
||
esc_end: raw_str id_end_escape.to_string,
|
||
],
|
||
default: raw $ satisfy is_id_first *> take_while is_id_rest
|
||
}
|
||
|
||
@[derive has_tokens has_view]
|
||
def ident_suffix.parser : rec_t unit syntax basic_parser_m syntax :=
|
||
-- consume '.' only when followed by a character starting an ident_part
|
||
try (lookahead (ch '.' *> (ch id_begin_escape <|> satisfy is_id_first)))
|
||
*> node! ident_suffix [«.»: raw_str ".", ident: recurse ()]
|
||
|
||
def ident'' : rec_t unit syntax basic_parser_m syntax :=
|
||
node! ident [part: monad_lift ident_part.parser, suffix: optional ident_suffix.parser]
|
||
|
||
private def ident' : basic_parser_m syntax :=
|
||
rec_t.run_parsec ident'' $ λ _, ident''
|
||
|
||
--TODO(Sebastian): other bases
|
||
def number' : basic_parser :=
|
||
node_choice! number {
|
||
base10: raw (take_while1 char.is_digit),
|
||
}
|
||
|
||
def string_lit' : basic_parser :=
|
||
node! string_lit [val: raw parse_string_literal]
|
||
|
||
private def mk_consume_token (tk : token_config) (it : string.iterator) : basic_parser :=
|
||
let it' := it.nextn tk.prefix.length in
|
||
monad_parsec.lift $ λ _, parsec.result.ok (mk_raw_res it it') it' none
|
||
|
||
def number_or_string_lit : basic_parser :=
|
||
number' <|> string_lit'
|
||
|
||
def token_cont (it : string.iterator) (tk : token_config) : basic_parser :=
|
||
do id ← ident',
|
||
it' ← left_over,
|
||
-- if a token is both a symbol and a valid identifier (i.e. a keyword),
|
||
-- we want it to be recognized as a symbol
|
||
if it.offset + tk.prefix.length ≥ it'.offset then
|
||
mk_consume_token tk it
|
||
else pure id
|
||
|
||
def token : basic_parser :=
|
||
do it ← left_over,
|
||
cache ← get_cache,
|
||
-- NOTE: using `catch` instead of `<|>` so that error messages from the second block are preferred
|
||
catch (do
|
||
-- check token cache
|
||
some tkc ← pure cache.token_cache | failure,
|
||
guard (it.offset = tkc.start_it.offset),
|
||
-- hackishly update parsec position
|
||
monad_parsec.lift (λ it, parsec.result.ok () tkc.stop_it none),
|
||
put_cache {cache with hit := cache.hit + 1},
|
||
pure tkc.tk
|
||
) (λ _, do
|
||
-- cache failed, update cache
|
||
|
||
ident_start ← observing $ lookahead (satisfy is_id_first <|> ch id_begin_escape),
|
||
tk ← match_token,
|
||
tk ← match tk, ident_start with
|
||
| some tk@{suffix_parser := some _, ..}, _ :=
|
||
error "token: not implemented" --str tk *> monad_parsec.lift r
|
||
| some tk, except.ok _ := token_cont it tk
|
||
| some tk, except.error _ := mk_consume_token tk it
|
||
| none, except.ok _ := ident'
|
||
| none, except.error _ := number_or_string_lit,
|
||
tk ← with_trailing tk,
|
||
new_it ← left_over,
|
||
put_cache {cache with token_cache := some ⟨it, new_it, tk⟩, miss := cache.miss + 1},
|
||
pure tk
|
||
)
|
||
|
||
def peek_token : basic_parser_m (except (parsec.message syntax) syntax) :=
|
||
observing (try (lookahead token))
|
||
|
||
variable [monad_basic_parser m]
|
||
|
||
def symbol_core (sym : string) (lbp : nat) (ex : dlist string) : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx@(syntax.atom ⟨_, sym'⟩) ← token | error "" ex it,
|
||
when (sym ≠ sym') $
|
||
error sym' ex it,
|
||
pure stx
|
||
} <?> sym
|
||
|
||
@[inline] def symbol (sym : string) (lbp := 0) : parser :=
|
||
let sym := sym.trim in
|
||
symbol_core sym lbp (dlist.singleton sym)
|
||
|
||
instance symbol.tokens (sym lbp) : parser.has_tokens (symbol sym lbp : parser) :=
|
||
⟨[⟨sym.trim, lbp, none⟩]⟩
|
||
instance symbol.view (sym lbp) : parser.has_view (option syntax_atom) (symbol sym lbp : parser) :=
|
||
{ view := λ stx, match stx with
|
||
| syntax.atom atom := some atom
|
||
| _ := none,
|
||
review := λ a, (syntax.atom <$> a).get_or_else syntax.missing }
|
||
instance symbol.view_default (sym lbp) : parser.has_view_default (symbol sym lbp : parser) _
|
||
(some {info := none, val := sym.trim}) := ⟨⟩
|
||
|
||
def number.parser : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx ← token,
|
||
some _ ← pure $ try_view number stx | error "" (dlist.singleton "number") it,
|
||
pure stx
|
||
} <?> "number"
|
||
|
||
instance number.parser.tokens : parser.has_tokens (number.parser : parser) := default _
|
||
instance number.parser.view : parser.has_view number.view (number.parser : parser) :=
|
||
{..number.has_view}
|
||
|
||
def number.view.to_nat : number.view → nat
|
||
| (number.view.base10 (some atom)) := atom.val.to_nat
|
||
| _ := 1138 -- should never happen, but let's still choose a grep-able number
|
||
|
||
def number.view.of_nat (n : nat) : number.view :=
|
||
number.view.base10 (some {val := to_string n})
|
||
|
||
def string_lit.parser : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx ← token,
|
||
some _ ← pure $ try_view string_lit stx | error "" (dlist.singleton "number") it,
|
||
pure stx
|
||
} <?> "string"
|
||
|
||
instance string_lit.parser.tokens : parser.has_tokens (string_lit.parser : parser) := default _
|
||
instance string_lit.parser.view : parser.has_view string_lit.view (string_lit.parser : parser) :=
|
||
{..string_lit.has_view}
|
||
|
||
def ident.parser : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx ← token,
|
||
if stx.is_of_kind ident then pure stx
|
||
else error "" (dlist.singleton "identifier") it
|
||
} <?> "identifier"
|
||
|
||
instance ident.parser.tokens : parser.has_tokens (ident.parser : parser) := default _
|
||
instance ident.parser.view : parser.has_view ident.view (ident.parser : parser) :=
|
||
{..ident.has_view}
|
||
|
||
def ident_part.view.to_string : ident_part.view → string
|
||
| (ident_part.view.default (some atom)) := atom.val
|
||
| (ident_part.view.escaped {escaped := some atom, ..}) := atom.val
|
||
| _ := "ident_part.view: invalid input"
|
||
|
||
def ident.view.components : ident.view → list string
|
||
| {part := part, suffix := none} := [part.to_string]
|
||
| {part := part, suffix := some suffix} := part.to_string :: (view ident suffix.ident).components
|
||
|
||
def ident.view.to_name (id : ident.view) : name :=
|
||
id.components.foldl name.mk_string name.anonymous
|
||
|
||
/-- Read identifier without consulting the token table. -/
|
||
def raw_ident.parser : parser :=
|
||
lift $ ident' >>= with_trailing
|
||
|
||
instance raw_ident.parser.tokens : parser.has_tokens (raw_ident.parser : parser) := default _
|
||
instance raw_ident.parser.view : parser.has_view ident.view (raw_ident.parser : parser) :=
|
||
{..ident.has_view}
|
||
|
||
/-- Check if the following token is the symbol _or_ identifier `sym`. Useful for
|
||
parsing local tokens that have not been added to the token table (but may have
|
||
been so by some unrelated code).
|
||
|
||
For example, the universe `max` function is parsed using this combinator so that
|
||
it can still be used as an identifier outside of universes (but registering it
|
||
as a token in a term syntax would not break the universe parser). -/
|
||
def symbol_or_ident (sym : string) : parser :=
|
||
lift $ try $ do
|
||
it ← left_over,
|
||
stx ← token,
|
||
let sym' := match stx with
|
||
| syntax.atom ⟨_, sym'⟩ := some sym'
|
||
| syntax.raw_node {kind := @ident, ..} :=
|
||
(match view ident stx with
|
||
| {part := ident_part.view.default (some ⟨_, sym'⟩),
|
||
suffix := none} := some sym'
|
||
| _ := none)
|
||
| _ := none,
|
||
when (sym' ≠ some sym) $
|
||
error "" (dlist.singleton (repr sym)) it,
|
||
pure stx
|
||
|
||
instance symbol_or_ident.tokens (sym) : parser.has_tokens (symbol_or_ident sym : parser) :=
|
||
default _
|
||
instance symbol_or_ident.view (sym) : parser.has_view syntax (symbol_or_ident sym : parser) := default _
|
||
|
||
/-- A unicode symbol with an ASCII fallback -/
|
||
@[derive has_tokens has_view]
|
||
def unicode_symbol (unicode ascii : string) (lbp := 0) : parser :=
|
||
lift $ any_of [symbol unicode lbp, symbol ascii lbp]
|
||
-- use unicode variant by default
|
||
instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _ (syntax.atom ⟨none, u⟩) := ⟨⟩
|
||
|
||
def indexed {α : Type} (map : token_map α) : m (list α) :=
|
||
lift $ do
|
||
except.ok tk ← peek_token | error "",
|
||
n ← match tk with
|
||
| syntax.atom ⟨_, s⟩ := pure $ mk_simple_name s
|
||
| syntax.raw_node n := pure n.kind.name
|
||
| _ := error "",
|
||
option.to_monad $ map.find n
|
||
|
||
namespace syntax
|
||
open lean.format
|
||
|
||
-- Now that we have `ident.view`, this function is much easier to define
|
||
protected mutual def to_format, to_format_lst
|
||
with to_format : syntax → format
|
||
| (atom ⟨_, s⟩) := to_fmt $ repr s
|
||
| stx@(raw_node n) :=
|
||
let scopes := match n.scopes with [] := to_fmt "" | _ := bracket "{" (join_sep n.scopes ", ") "}" in
|
||
if n.kind.name = `lean.parser.no_kind then sbracket $ scopes ++ join_sep (to_format_lst n.args) line
|
||
else if n.kind.name = `lean.parser.ident then
|
||
to_fmt "`" ++ to_fmt (view ident stx).to_name ++ scopes
|
||
else let shorter_name := n.kind.name.replace_prefix `lean.parser name.anonymous
|
||
in paren $ join_sep ((to_fmt shorter_name ++ scopes) :: to_format_lst n.args) line
|
||
| missing := "<missing>"
|
||
with to_format_lst : list syntax → list format
|
||
| [] := []
|
||
| (s::ss) := to_format s :: to_format_lst ss
|
||
|
||
instance : has_to_format syntax := ⟨syntax.to_format⟩
|
||
instance : has_to_string syntax := ⟨to_string ∘ to_fmt⟩
|
||
|
||
end syntax
|
||
end «parser»
|
||
end lean
|