225 lines
8.3 KiB
Text
225 lines
8.3 KiB
Text
/-
|
||
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
|
||
Released under Apache 2.0 license as described in the file LICENSE.
|
||
Author: Sebastian Ullrich
|
||
|
||
Tokenizer for the Lean language
|
||
|
||
Even though our parser architecture does not statically depend on a tokenizer but works directly on
|
||
the input string, we still use a "tokenizer" parser in the Lean parser in some circumstances:
|
||
* to distinguish between identifiers and keywords
|
||
* for error recovery: advance until next command token
|
||
* ...?
|
||
-/
|
||
|
||
prelude
|
||
import init.lean.parser.combinators
|
||
|
||
namespace lean
|
||
namespace parser
|
||
open monad_parsec combinators string has_view
|
||
|
||
def match_token : basic_parser_m (option token_config) :=
|
||
do st ← get,
|
||
it ← left_over,
|
||
pure $ prod.snd <$> st.tokens.match_prefix it
|
||
|
||
private def finish_comment_block_aux : nat → nat → basic_parser_m unit
|
||
| nesting (n+1) :=
|
||
str "/-" *> finish_comment_block_aux (nesting + 1) n <|>
|
||
str "-/" *>
|
||
(if nesting = 1 then pure ()
|
||
else finish_comment_block_aux (nesting - 1) n) <|>
|
||
any *> finish_comment_block_aux nesting n
|
||
| _ _ := error "unreachable"
|
||
|
||
def finish_comment_block (nesting := 1) : basic_parser_m unit :=
|
||
do r ← remaining,
|
||
finish_comment_block_aux nesting (r+1) <?> "end of comment block"
|
||
|
||
private def whitespace_aux : nat → basic_parser_m unit
|
||
| (n+1) :=
|
||
do whitespace,
|
||
str "--" *> take_while' (= '\n') *> whitespace_aux n <|>
|
||
-- a "/--" doc comment is an actual token, not whitespace
|
||
try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n <|>
|
||
pure ()
|
||
| 0 := error "unreachable"
|
||
|
||
variables {m : Type → Type}
|
||
local notation `parser` := m syntax
|
||
local notation `lift` := @monad_lift basic_parser_m _ _ _
|
||
|
||
/-- Skip whitespace and comments. -/
|
||
def whitespace : basic_parser_m unit :=
|
||
hidden $ do
|
||
start ← left_over,
|
||
-- every `whitespace_aux` loop reads at least one char
|
||
whitespace_aux (start.remaining+1)
|
||
|
||
section
|
||
variables [monad m] [monad_parsec syntax m]
|
||
|
||
def as_substring {α : Type} (p : m α) : m substring :=
|
||
do start ← left_over,
|
||
p,
|
||
stop ← left_over,
|
||
pure ⟨start, stop⟩
|
||
|
||
variables [monad_state parser_state m] [monad_basic_read m]
|
||
|
||
def with_source_info {α : Type} (r : m α) (trailing_ws := tt) : m (α × source_info) :=
|
||
do it ← left_over,
|
||
let leading : substring := ⟨it, it⟩, -- NOTE: will be adjusted by `syntax.update_leading`
|
||
a ← r,
|
||
-- TODO(Sebastian): less greedy, more natural whitespace assignment
|
||
-- E.g. only read up to the next line break
|
||
trailing ← lift $ as_substring $ if trailing_ws then whitespace else pure (),
|
||
it2 ← left_over,
|
||
pure (a, ⟨leading, it.offset, trailing⟩)
|
||
|
||
/-- Match an arbitrary parser and return the consumed string in a `syntax.atom`. -/
|
||
def raw {α : Type} (p : m α) (trailing_ws := ff) : parser :=
|
||
try $ do
|
||
(ss, info) ← with_source_info (as_substring p) trailing_ws,
|
||
pure $ syntax.atom ⟨info, ss.to_string⟩
|
||
|
||
instance raw.tokens {α} (p : m α) (t) : parser.has_tokens (raw p t : parser) := default _
|
||
instance raw.view {α} (p : m α) (t) : parser.has_view (raw p t : parser) syntax := default _
|
||
|
||
end
|
||
|
||
@[pattern] def base10_lit : syntax_node_kind := ⟨`lean.parser.base10_lit⟩
|
||
|
||
--TODO(Sebastian): other bases
|
||
private def number' : basic_parser_m (source_info → syntax) :=
|
||
do num ← take_while1 char.is_digit,
|
||
pure $ λ i, syntax.node ⟨base10_lit, [syntax.atom ⟨i, num⟩]⟩
|
||
|
||
set_option class.instance_max_depth 200
|
||
|
||
@[derive has_tokens has_view]
|
||
def ident_part.parser : basic_parser_m syntax :=
|
||
node_choice! ident_part {
|
||
escaped: node! ident_part_escaped [
|
||
esc_begin: raw $ ch id_begin_escape,
|
||
escaped: raw $ take_until1 is_id_end_escape,
|
||
esc_end: raw $ ch id_end_escape
|
||
],
|
||
default: lookahead (satisfy is_id_first) *> raw (take_while is_id_rest)
|
||
}
|
||
|
||
@[derive has_tokens has_view]
|
||
def ident_suffix.parser : rec_t unit syntax basic_parser_m syntax :=
|
||
-- consume '.' only when followed by a character starting an ident_part
|
||
try (lookahead (ch '.' *> (ch id_begin_escape *> pure () <|> satisfy is_id_first *> pure ()))) *>
|
||
node! ident_suffix [«.»: raw $ ch '.', ident: recurse ()]
|
||
|
||
private mutual def update_trailing, update_trailing_lst
|
||
with update_trailing : substring → syntax → syntax
|
||
| trailing (syntax.atom a@⟨some info, _⟩) := syntax.atom {a with info := some {info with trailing := trailing}}
|
||
| trailing (syntax.node n@⟨k, args⟩) := syntax.node {n with args := update_trailing_lst trailing args}
|
||
| trailing stx := stx
|
||
with update_trailing_lst : substring → list syntax → list syntax
|
||
| trailing [] := []
|
||
| trailing [stx] := [update_trailing trailing stx]
|
||
| trailing (stx::stxs) := stx :: update_trailing_lst trailing stxs
|
||
|
||
def ident' : basic_parser_m (source_info → syntax) :=
|
||
do stx ← with_recurse () $ λ _, node! ident [part: monad_lift ident_part.parser, suffix: optional ident_suffix.parser],
|
||
pure $ λ info, update_trailing info.trailing stx
|
||
|
||
private def symbol' : basic_parser_m (source_info → syntax) :=
|
||
do tk ← match_token,
|
||
match tk with
|
||
-- constant-length token
|
||
| some ⟨tk, _, none⟩ :=
|
||
do str tk,
|
||
pure $ λ i, syntax.atom ⟨some i, tk⟩
|
||
-- variable-length token
|
||
| some ⟨tk, _, some r⟩ := error "symbol': not implemented" --str tk *> monad_parsec.lift r
|
||
| none := monad_parsec.eoi *> error "end of file" <|> error "token"
|
||
|
||
def token : basic_parser_m syntax :=
|
||
do (r, i) ← with_source_info $ do {
|
||
-- NOTE the order: if a token is both a symbol and a valid identifier (i.e. a keyword),
|
||
-- we want it to be recognized as a symbol
|
||
f::_ ← longest_match [symbol', ident'] <|> list.ret <$> number' | error "token: unreachable",
|
||
pure f
|
||
},
|
||
pure (r i)
|
||
|
||
variable [monad_basic_read m]
|
||
|
||
def symbol (sym : string) (lbp := 0) : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx@(syntax.atom ⟨_, sym'⟩) ← token | error "" (dlist.singleton (repr sym)) it,
|
||
when (sym ≠ sym') $
|
||
error ("token " ++ repr sym') (dlist.singleton (repr sym)) it,
|
||
pure stx
|
||
} <?> repr sym
|
||
|
||
instance syntax_atom.is_view : tysyntax.is_view syntax_atom :=
|
||
{ view := λ stx, match stx with
|
||
| syntax.atom atom := some atom
|
||
| _ := none,
|
||
review := syntax.atom }
|
||
instance symbol.tokens (sym lbp) : parser.has_tokens (symbol sym lbp : parser) :=
|
||
⟨[⟨sym, lbp, none⟩]⟩
|
||
instance symbol.view (sym lbp) : parser.has_view (symbol sym lbp : parser) syntax_atom :=
|
||
{..syntax_atom.is_view}
|
||
instance symbol.view_default (sym lbp) : parser.has_view_default (symbol sym lbp : parser) _
|
||
{info := none, val := sym} := ⟨⟩
|
||
|
||
def number : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx@(syntax.node ⟨base10_lit, _⟩) ← token | error "" (dlist.singleton "number") it,
|
||
pure stx
|
||
} <?> "number"
|
||
|
||
instance number.tokens : parser.has_tokens (number : parser) := default _
|
||
instance number.view : parser.has_view (number : parser) syntax := default _
|
||
|
||
def ident.parser : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx@(syntax.node ⟨ident, _⟩) ← token | error "" (dlist.singleton "identifier") it,
|
||
pure stx
|
||
} <?> "identifier"
|
||
|
||
instance ident.parser.tokens : parser.has_tokens (ident.parser : parser) := default _
|
||
instance ident.parser.view : parser.has_view (ident.parser : parser) ident.view :=
|
||
{..ident.view.is_view}
|
||
|
||
/-- Check if the following token is the symbol _or_ identifier `sym`. Useful for
|
||
parsing local tokens that have not been added to the token table (but may have
|
||
been so by some unrelated code).
|
||
|
||
For example, the universe `max` function is parsed using this combinator so that
|
||
it can still be used as an identifier outside of universes (but registering it
|
||
as a token in a term syntax would not break the universe parser). -/
|
||
def symbol_or_ident (sym : string) : parser :=
|
||
lift $ try $ do
|
||
it ← left_over,
|
||
stx ← token,
|
||
let sym' := match stx with
|
||
| syntax.atom ⟨_, sym'⟩ := some sym'
|
||
| syntax.node ⟨ident, _⟩ := do {
|
||
id ← view_with ident.view stx,
|
||
ident_part.view.default (syntax.atom ⟨_, sym'⟩) ← view id.part | none,
|
||
none ← view id.suffix | none,
|
||
some sym'
|
||
}
|
||
| _ := none,
|
||
when (sym' ≠ some sym) $
|
||
error "" (dlist.singleton (repr sym)) it,
|
||
pure stx
|
||
|
||
instance symbol_or_ident.tokens (sym) : parser.has_tokens (symbol_or_ident sym : parser) :=
|
||
default _
|
||
instance symbol_or_ident.view (sym) : parser.has_view (symbol_or_ident sym : parser) syntax := default _
|
||
|
||
end «parser»
|
||
end lean
|