lean4-htt/library/init/lean/parser/token.lean

225 lines
8.3 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/-
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Sebastian Ullrich
Tokenizer for the Lean language
Even though our parser architecture does not statically depend on a tokenizer but works directly on
the input string, we still use a "tokenizer" parser in the Lean parser in some circumstances:
* to distinguish between identifiers and keywords
* for error recovery: advance until next command token
* ...?
-/
prelude
import init.lean.parser.combinators
namespace lean
namespace parser
open monad_parsec combinators string has_view
def match_token : basic_parser_m (option token_config) :=
do st ← get,
it ← left_over,
pure $ prod.snd <$> st.tokens.match_prefix it
private def finish_comment_block_aux : nat → nat → basic_parser_m unit
| nesting (n+1) :=
str "/-" *> finish_comment_block_aux (nesting + 1) n <|>
str "-/" *>
(if nesting = 1 then pure ()
else finish_comment_block_aux (nesting - 1) n) <|>
any *> finish_comment_block_aux nesting n
| _ _ := error "unreachable"
def finish_comment_block (nesting := 1) : basic_parser_m unit :=
do r ← remaining,
finish_comment_block_aux nesting (r+1) <?> "end of comment block"
private def whitespace_aux : nat → basic_parser_m unit
| (n+1) :=
do whitespace,
str "--" *> take_while' (= '\n') *> whitespace_aux n <|>
-- a "/--" doc comment is an actual token, not whitespace
try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n <|>
pure ()
| 0 := error "unreachable"
variables {m : Type → Type}
local notation `parser` := m syntax
local notation `lift` := @monad_lift basic_parser_m _ _ _
/-- Skip whitespace and comments. -/
def whitespace : basic_parser_m unit :=
hidden $ do
start ← left_over,
-- every `whitespace_aux` loop reads at least one char
whitespace_aux (start.remaining+1)
section
variables [monad m] [monad_parsec syntax m]
def as_substring {α : Type} (p : m α) : m substring :=
do start ← left_over,
p,
stop ← left_over,
pure ⟨start, stop⟩
variables [monad_state parser_state m] [monad_basic_read m]
def with_source_info {α : Type} (r : m α) (trailing_ws := tt) : m (α × source_info) :=
do it ← left_over,
let leading : substring := ⟨it, it⟩, -- NOTE: will be adjusted by `syntax.update_leading`
a ← r,
-- TODO(Sebastian): less greedy, more natural whitespace assignment
-- E.g. only read up to the next line break
trailing ← lift $ as_substring $ if trailing_ws then whitespace else pure (),
it2 ← left_over,
pure (a, ⟨leading, it.offset, trailing⟩)
/-- Match an arbitrary parser and return the consumed string in a `syntax.atom`. -/
def raw {α : Type} (p : m α) (trailing_ws := ff) : parser :=
try $ do
(ss, info) ← with_source_info (as_substring p) trailing_ws,
pure $ syntax.atom ⟨info, ss.to_string⟩
instance raw.tokens {α} (p : m α) (t) : parser.has_tokens (raw p t : parser) := default _
instance raw.view {α} (p : m α) (t) : parser.has_view (raw p t : parser) syntax := default _
end
@[pattern] def base10_lit : syntax_node_kind := ⟨`lean.parser.base10_lit⟩
--TODO(Sebastian): other bases
private def number' : basic_parser_m (source_info → syntax) :=
do num ← take_while1 char.is_digit,
pure $ λ i, syntax.node ⟨base10_lit, [syntax.atom ⟨i, num⟩]⟩
set_option class.instance_max_depth 200
@[derive has_tokens has_view]
def ident_part.parser : basic_parser_m syntax :=
node_choice! ident_part {
escaped: node! ident_part_escaped [
esc_begin: raw $ ch id_begin_escape,
escaped: raw $ take_until1 is_id_end_escape,
esc_end: raw $ ch id_end_escape
],
default: lookahead (satisfy is_id_first) *> raw (take_while is_id_rest)
}
@[derive has_tokens has_view]
def ident_suffix.parser : rec_t unit syntax basic_parser_m syntax :=
-- consume '.' only when followed by a character starting an ident_part
try (lookahead (ch '.' *> (ch id_begin_escape *> pure () <|> satisfy is_id_first *> pure ()))) *>
node! ident_suffix [«.»: raw $ ch '.', ident: recurse ()]
private mutual def update_trailing, update_trailing_lst
with update_trailing : substring → syntax → syntax
| trailing (syntax.atom a@⟨some info, _⟩) := syntax.atom {a with info := some {info with trailing := trailing}}
| trailing (syntax.node n@⟨k, args⟩) := syntax.node {n with args := update_trailing_lst trailing args}
| trailing stx := stx
with update_trailing_lst : substring → list syntax → list syntax
| trailing [] := []
| trailing [stx] := [update_trailing trailing stx]
| trailing (stx::stxs) := stx :: update_trailing_lst trailing stxs
def ident' : basic_parser_m (source_info → syntax) :=
do stx ← with_recurse () $ λ _, node! ident [part: monad_lift ident_part.parser, suffix: optional ident_suffix.parser],
pure $ λ info, update_trailing info.trailing stx
private def symbol' : basic_parser_m (source_info → syntax) :=
do tk ← match_token,
match tk with
-- constant-length token
| some ⟨tk, _, none⟩ :=
do str tk,
pure $ λ i, syntax.atom ⟨some i, tk⟩
-- variable-length token
| some ⟨tk, _, some r⟩ := error "symbol': not implemented" --str tk *> monad_parsec.lift r
| none := monad_parsec.eoi *> error "end of file" <|> error "token"
def token : basic_parser_m syntax :=
do (r, i) ← with_source_info $ do {
-- NOTE the order: if a token is both a symbol and a valid identifier (i.e. a keyword),
-- we want it to be recognized as a symbol
f::_ ← longest_match [symbol', ident'] <|> list.ret <$> number' | error "token: unreachable",
pure f
},
pure (r i)
variable [monad_basic_read m]
def symbol (sym : string) (lbp := 0) : parser :=
lift $ try $ do {
it ← left_over,
stx@(syntax.atom ⟨_, sym'⟩) ← token | error "" (dlist.singleton (repr sym)) it,
when (sym ≠ sym') $
error ("token " ++ repr sym') (dlist.singleton (repr sym)) it,
pure stx
} <?> repr sym
instance syntax_atom.is_view : tysyntax.is_view syntax_atom :=
{ view := λ stx, match stx with
| syntax.atom atom := some atom
| _ := none,
review := syntax.atom }
instance symbol.tokens (sym lbp) : parser.has_tokens (symbol sym lbp : parser) :=
⟨[⟨sym, lbp, none⟩]⟩
instance symbol.view (sym lbp) : parser.has_view (symbol sym lbp : parser) syntax_atom :=
{..syntax_atom.is_view}
instance symbol.view_default (sym lbp) : parser.has_view_default (symbol sym lbp : parser) _
{info := none, val := sym} := ⟨⟩
def number : parser :=
lift $ try $ do {
it ← left_over,
stx@(syntax.node ⟨base10_lit, _⟩) ← token | error "" (dlist.singleton "number") it,
pure stx
} <?> "number"
instance number.tokens : parser.has_tokens (number : parser) := default _
instance number.view : parser.has_view (number : parser) syntax := default _
def ident.parser : parser :=
lift $ try $ do {
it ← left_over,
stx@(syntax.node ⟨ident, _⟩) ← token | error "" (dlist.singleton "identifier") it,
pure stx
} <?> "identifier"
instance ident.parser.tokens : parser.has_tokens (ident.parser : parser) := default _
instance ident.parser.view : parser.has_view (ident.parser : parser) ident.view :=
{..ident.view.is_view}
/-- Check if the following token is the symbol _or_ identifier `sym`. Useful for
parsing local tokens that have not been added to the token table (but may have
been so by some unrelated code).
For example, the universe `max` function is parsed using this combinator so that
it can still be used as an identifier outside of universes (but registering it
as a token in a term syntax would not break the universe parser). -/
def symbol_or_ident (sym : string) : parser :=
lift $ try $ do
it ← left_over,
stx ← token,
let sym' := match stx with
| syntax.atom ⟨_, sym'⟩ := some sym'
| syntax.node ⟨ident, _⟩ := do {
id ← view_with ident.view stx,
ident_part.view.default (syntax.atom ⟨_, sym'⟩) ← view id.part | none,
none ← view id.suffix | none,
some sym'
}
| _ := none,
when (sym' ≠ some sym) $
error "" (dlist.singleton (repr sym)) it,
pure stx
instance symbol_or_ident.tokens (sym) : parser.has_tokens (symbol_or_ident sym : parser) :=
default _
instance symbol_or_ident.view (sym) : parser.has_view (symbol_or_ident sym : parser) syntax := default _
end «parser»
end lean