lean4-htt/library/init/lean/parser/token.lean

340 lines
12 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/-
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Sebastian Ullrich
Tokenizer for the Lean language
Even though our parser architecture does not statically depend on a tokenizer but works directly on
the input string, we still use a "tokenizer" parser in the Lean parser in some circumstances:
* to distinguish between identifiers and keywords
* for error recovery: advance until next command token
* ...?
-/
prelude
import init.lean.parser.combinators init.lean.parser.string_literal
namespace lean
namespace parser
open monad_parsec combinators string has_view
def match_token : basic_parser_m (option token_config) :=
do cfg ← read,
it ← left_over,
pure $ prod.snd <$> cfg.tokens.match_prefix it
private def finish_comment_block_aux : nat → nat → basic_parser_m unit
| nesting (n+1) :=
str "/-" *> finish_comment_block_aux (nesting + 1) n
<|>
str "-/" *> (if nesting = 1 then pure () else finish_comment_block_aux (nesting - 1) n)
<|>
any *> finish_comment_block_aux nesting n
| _ _ := error "unreachable"
def finish_comment_block (nesting := 1) : basic_parser_m unit :=
do r ← remaining,
finish_comment_block_aux nesting (r+1) <?> "end of comment block"
private def whitespace_aux : nat → basic_parser_m unit
| (n+1) :=
do whitespace,
str "--" *> take_while' (≠ '\n') *> whitespace_aux n
<|>
-- a "/--" doc comment is an actual token, not whitespace
try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n
<|>
pure ()
| 0 := error "unreachable"
variables {m : Type → Type}
local notation `parser` := m syntax
local notation `lift` := @monad_lift basic_parser_m _ _ _
/-- Skip whitespace and comments. -/
def whitespace : basic_parser_m unit :=
hidden $ do
start ← left_over,
-- every `whitespace_aux` loop reads at least one char
whitespace_aux (start.remaining+1)
section
variables [monad m] [monad_parsec syntax m]
@[inline] def as_substring {α : Type} (p : m α) : m substring :=
do start ← left_over,
p,
stop ← left_over,
pure ⟨start, stop⟩
variables [monad_basic_parser m]
private mutual def update_trailing, update_trailing_lst
with update_trailing : substring → syntax → syntax
| trailing (syntax.atom a@⟨some info, _⟩) := syntax.atom {a with info := some {info with trailing := trailing}}
| trailing (syntax.ident id@{info := some info, ..}) := syntax.ident {id with info := some {info with trailing := trailing}}
| trailing (syntax.raw_node n) := syntax.raw_node {n with args := update_trailing_lst trailing n.args}
| trailing stx := stx
with update_trailing_lst : substring → list syntax → list syntax
| trailing [] := []
| trailing [stx] := [update_trailing trailing stx]
| trailing (stx::stxs) := stx :: update_trailing_lst trailing stxs
def with_trailing (stx : syntax) : m syntax :=
do -- TODO(Sebastian): less greedy, more natural whitespace assignment
-- E.g. only read up to the next line break
trailing ← lift $ as_substring $ whitespace,
pure $ update_trailing trailing stx
def mk_raw_res (start stop : string.iterator) : syntax :=
let ss : substring := ⟨start, stop⟩ in
syntax.atom ⟨some {leading := ⟨start, start⟩, pos := start.offset, trailing := ⟨stop, stop⟩}, ss.to_string⟩
/-- Match an arbitrary parser and return the consumed string in a `syntax.atom`. -/
@[inline] def raw {α : Type} (p : m α) (trailing_ws := ff) : parser := do
start ← left_over,
p,
stop ← left_over,
let stx := mk_raw_res start stop,
if trailing_ws then with_trailing stx else pure stx
instance raw.tokens {α} (p : m α) (t) : parser.has_tokens (raw p t : parser) := default _
instance raw.view {α} (p : m α) (t) : parser.has_view (option syntax_atom) (raw p t : parser) :=
{ view := λ stx, match stx with
| syntax.atom atom := some atom
| _ := none,
review := λ a, (syntax.atom <$> a).get_or_else syntax.missing }
/-- Like `raw (str s)`, but default to `s` in views. -/
@[inline, derive has_tokens has_view]
def raw_str (s : string) (trailing_ws := ff) : parser :=
raw (str s) trailing_ws
instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom) (some {val := s}) :=
⟨⟩
end
set_option class.instance_max_depth 200
@[derive has_tokens has_view]
def detail_ident_part.parser : basic_parser_m syntax :=
node_choice! detail_ident_part {
escaped: node! detail_ident_part_escaped [
esc_begin: raw_str id_begin_escape.to_string,
escaped: raw $ take_until1 is_id_end_escape,
esc_end: raw_str id_end_escape.to_string,
],
default: raw $ satisfy is_id_first *> take_while is_id_rest
}
@[derive has_tokens has_view]
def detail_ident_suffix.parser : rec_t unit syntax basic_parser_m syntax :=
-- consume '.' only when followed by a character starting an detail_ident_part
try (lookahead (ch '.' *> (ch id_begin_escape <|> satisfy is_id_first)))
*> node! detail_ident_suffix [«.»: raw_str ".", ident: recurse ()]
def detail_ident' : rec_t unit syntax basic_parser_m syntax :=
node! detail_ident [part: monad_lift detail_ident_part.parser, suffix: optional detail_ident_suffix.parser]
/-- A parser that gives a more detailed view of `syntax_ident.raw_val`. Not used by default for
performance reasons. -/
def detail_ident.parser : basic_parser_m syntax :=
rec_t.run_parsec detail_ident' $ λ _, detail_ident'
private def ident' : basic_parser :=
do
start ← left_over,
s ← id_part,
n ← foldl name.mk_string (mk_simple_name s) $ do {
-- consume '.' only when followed by a character starting an detail_ident_part
try (lookahead (ch '.' *> (ch id_begin_escape <|> satisfy is_id_first))),
ch '.',
id_part
},
stop ← left_over,
pure $ syntax.ident {
info := some {leading := ⟨start, start⟩, pos := start.offset, trailing := ⟨stop, stop⟩},
raw_val := ⟨start, stop⟩,
val := n
}
--TODO(Sebastian): other bases
def number' : basic_parser :=
node_choice! number {
base10: raw (take_while1 char.is_digit),
}
def string_lit' : basic_parser :=
node! string_lit [val: raw parse_string_literal]
private def mk_consume_token (tk : token_config) (it : string.iterator) : basic_parser :=
let it' := it.nextn tk.prefix.length in
monad_parsec.lift $ λ _, parsec.result.ok (mk_raw_res it it') it' none
def number_or_string_lit : basic_parser :=
number' <|> string_lit'
def token_cont (it : string.iterator) (tk : token_config) : basic_parser :=
do id ← ident',
it' ← left_over,
-- if a token is both a symbol and a valid identifier (i.e. a keyword),
-- we want it to be recognized as a symbol
if it.offset + tk.prefix.length ≥ it'.offset then
mk_consume_token tk it
else pure id
def token : basic_parser :=
do it ← left_over,
cache ← get_cache,
-- NOTE: using `catch` instead of `<|>` so that error messages from the second block are preferred
catch (do
-- check token cache
some tkc ← pure cache.token_cache | failure,
guard (it.offset = tkc.start_it.offset),
-- hackishly update parsec position
monad_parsec.lift (λ it, parsec.result.ok () tkc.stop_it none),
put_cache {cache with hit := cache.hit + 1},
pure tkc.tk
) (λ _, do
-- cache failed, update cache
ident_start ← observing $ lookahead (satisfy is_id_first <|> ch id_begin_escape),
tk ← match_token,
tk ← match tk, ident_start with
| some tk@{suffix_parser := some _, ..}, _ :=
error "token: not implemented" --str tk *> monad_parsec.lift r
| some tk, except.ok _ := token_cont it tk
| some tk, except.error _ := mk_consume_token tk it
| none, except.ok _ := ident'
| none, except.error _ := number_or_string_lit,
tk ← with_trailing tk,
new_it ← left_over,
put_cache {cache with token_cache := some ⟨it, new_it, tk⟩, miss := cache.miss + 1},
pure tk
)
def peek_token : basic_parser_m (except (parsec.message syntax) syntax) :=
observing (try (lookahead token))
variable [monad_basic_parser m]
def symbol_core (sym : string) (lbp : nat) (ex : dlist string) : parser :=
lift $ try $ do {
it ← left_over,
stx@(syntax.atom ⟨_, sym'⟩) ← token | error "" ex it,
when (sym ≠ sym') $
error sym' ex it,
pure stx
} <?> sym
@[inline] def symbol (sym : string) (lbp := 0) : parser :=
let sym := sym.trim in
symbol_core sym lbp (dlist.singleton sym)
instance symbol.tokens (sym lbp) : parser.has_tokens (symbol sym lbp : parser) :=
⟨[⟨sym.trim, lbp, none⟩]⟩
instance symbol.view (sym lbp) : parser.has_view (option syntax_atom) (symbol sym lbp : parser) :=
{ view := λ stx, match stx with
| syntax.atom atom := some atom
| _ := none,
review := λ a, (syntax.atom <$> a).get_or_else syntax.missing }
instance symbol.view_default (sym lbp) : parser.has_view_default (symbol sym lbp : parser) _
(some {info := none, val := sym.trim}) := ⟨⟩
def number.parser : parser :=
lift $ try $ do {
it ← left_over,
stx ← token,
some _ ← pure $ try_view number stx | error "" (dlist.singleton "number") it,
pure stx
} <?> "number"
instance number.parser.tokens : parser.has_tokens (number.parser : parser) := default _
instance number.parser.view : parser.has_view number.view (number.parser : parser) :=
{..number.has_view}
def number.view.to_nat : number.view → nat
| (number.view.base10 (some atom)) := atom.val.to_nat
| _ := 1138 -- should never happen, but let's still choose a grep-able number
def number.view.of_nat (n : nat) : number.view :=
number.view.base10 (some {val := to_string n})
def string_lit.parser : parser :=
lift $ try $ do {
it ← left_over,
stx ← token,
some _ ← pure $ try_view string_lit stx | error "" (dlist.singleton "number") it,
pure stx
} <?> "string"
instance string_lit.parser.tokens : parser.has_tokens (string_lit.parser : parser) := default _
instance string_lit.parser.view : parser.has_view string_lit.view (string_lit.parser : parser) :=
{..string_lit.has_view}
def ident.parser : parser :=
lift $ try $ do {
it ← left_over,
stx@(syntax.ident _) ← token | error "" (dlist.singleton "identifier") it,
pure stx
} <?> "identifier"
instance ident.parser.tokens : parser.has_tokens (ident.parser : parser) := default _
instance ident.parser.view : parser.has_view syntax_ident (ident.parser : parser) :=
{ view := λ stx, match stx with
| syntax.ident id := id
| _ := {raw_val := substring.of_string "NOT_AN_IDENT", val := `NOT_AN_IDENT},
review := syntax.ident }
/-- Read identifier without consulting the token table. -/
def raw_ident.parser : parser :=
lift $ ident' >>= with_trailing
instance raw_ident.parser.tokens : parser.has_tokens (raw_ident.parser : parser) := default _
instance raw_ident.parser.view : parser.has_view syntax_ident (raw_ident.parser : parser) :=
{..(ident.parser.view : has_view _ (_ : parser))}
/-- Check if the following token is the symbol _or_ identifier `sym`. Useful for
parsing local tokens that have not been added to the token table (but may have
been so by some unrelated code).
For example, the universe `max` function is parsed using this combinator so that
it can still be used as an identifier outside of universes (but registering it
as a token in a term syntax would not break the universe parser). -/
def symbol_or_ident (sym : string) : parser :=
lift $ try $ do
it ← left_over,
stx ← token,
let sym' := match stx with
| syntax.atom ⟨_, sym'⟩ := some sym'
| syntax.ident id := some id.raw_val.to_string
| _ := none,
when (sym' ≠ some sym) $
error "" (dlist.singleton (repr sym)) it,
pure stx
instance symbol_or_ident.tokens (sym) : parser.has_tokens (symbol_or_ident sym : parser) :=
default _
instance symbol_or_ident.view (sym) : parser.has_view syntax (symbol_or_ident sym : parser) := default _
/-- A unicode symbol with an ASCII fallback -/
@[derive has_tokens has_view]
def unicode_symbol (unicode ascii : string) (lbp := 0) : parser :=
lift $ any_of [symbol unicode lbp, symbol ascii lbp]
-- use unicode variant by default
instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _ (syntax.atom ⟨none, u⟩) := ⟨⟩
def indexed {α : Type} (map : token_map α) : m (list α) :=
lift $ do
except.ok tk ← peek_token | error "",
n ← match tk with
| syntax.atom ⟨_, s⟩ := pure $ mk_simple_name s
| syntax.ident _ := pure `ident
| syntax.raw_node n := pure n.kind.name
| _ := error "",
option.to_monad $ map.find n
end «parser»
end lean