379 lines
14 KiB
Text
379 lines
14 KiB
Text
/-
|
||
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
|
||
Released under Apache 2.0 license as described in the file LICENSE.
|
||
Author: Sebastian Ullrich
|
||
|
||
Tokenizer for the Lean language
|
||
|
||
Even though our parser architecture does not statically depend on a tokenizer but works directly on
|
||
the input string, we still use a "tokenizer" parser in the Lean parser in some circumstances:
|
||
* to distinguish between identifiers and keywords
|
||
* for error recovery: advance until next command token
|
||
* ...?
|
||
-/
|
||
|
||
prelude
|
||
import init.lean.parser.combinators init.lean.parser.string_literal
|
||
|
||
namespace lean
|
||
namespace parser
|
||
open monad_parsec combinators string has_view
|
||
|
||
def match_token : basic_parser_m (option token_config) :=
|
||
do cfg ← read,
|
||
it ← left_over,
|
||
pure $ prod.snd <$> cfg.tokens.match_prefix it
|
||
|
||
private def finish_comment_block_aux : nat → nat → basic_parser_m unit
|
||
| nesting (n+1) :=
|
||
str "/-" *> finish_comment_block_aux (nesting + 1) n
|
||
<|>
|
||
str "-/" *> (if nesting = 1 then pure () else finish_comment_block_aux (nesting - 1) n)
|
||
<|>
|
||
any *> finish_comment_block_aux nesting n
|
||
| _ _ := error "unreachable"
|
||
|
||
def finish_comment_block (nesting := 1) : basic_parser_m unit :=
|
||
do r ← remaining,
|
||
finish_comment_block_aux nesting (r+1) <?> "end of comment block"
|
||
|
||
private def whitespace_aux : nat → basic_parser_m unit
|
||
| (n+1) :=
|
||
do whitespace,
|
||
str "--" *> take_while' (≠ '\n') *> whitespace_aux n
|
||
<|>
|
||
-- a "/--" doc comment is an actual token, not whitespace
|
||
try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n
|
||
<|>
|
||
pure ()
|
||
| 0 := error "unreachable"
|
||
|
||
variables {m : Type → Type}
|
||
local notation `parser` := m syntax
|
||
local notation `lift` := @monad_lift basic_parser_m _ _ _
|
||
|
||
/-- Skip whitespace and comments. -/
|
||
def whitespace : basic_parser_m unit :=
|
||
hidden $ do
|
||
start ← left_over,
|
||
-- every `whitespace_aux` loop reads at least one char
|
||
whitespace_aux (start.remaining+1)
|
||
|
||
section
|
||
variables [monad m] [monad_parsec syntax m]
|
||
|
||
@[inline] def as_substring {α : Type} (p : m α) : m substring :=
|
||
do start ← left_over,
|
||
p,
|
||
stop ← left_over,
|
||
pure ⟨start, stop⟩
|
||
|
||
variables [monad_basic_parser m]
|
||
|
||
private mutual def update_trailing, update_trailing_lst
|
||
with update_trailing : substring → syntax → syntax
|
||
| trailing (syntax.atom a@⟨some info, _⟩) := syntax.atom {a with info := some {info with trailing := trailing}}
|
||
| trailing (syntax.ident id@{info := some info, ..}) := syntax.ident {id with info := some {info with trailing := trailing}}
|
||
| trailing (syntax.raw_node n) := syntax.raw_node {n with args := update_trailing_lst trailing n.args}
|
||
| trailing stx := stx
|
||
with update_trailing_lst : substring → list syntax → list syntax
|
||
| trailing [] := []
|
||
| trailing [stx] := [update_trailing trailing stx]
|
||
| trailing (stx::stxs) := stx :: update_trailing_lst trailing stxs
|
||
|
||
def with_trailing (stx : syntax) : m syntax :=
|
||
do -- TODO(Sebastian): less greedy, more natural whitespace assignment
|
||
-- E.g. only read up to the next line break
|
||
trailing ← lift $ as_substring $ whitespace,
|
||
pure $ update_trailing trailing stx
|
||
|
||
def mk_raw_res (start stop : string.iterator) : syntax :=
|
||
let ss : substring := ⟨start, stop⟩ in
|
||
syntax.atom ⟨some {leading := ⟨start, start⟩, pos := start.offset, trailing := ⟨stop, stop⟩}, ss.to_string⟩
|
||
|
||
/-- Match an arbitrary parser and return the consumed string in a `syntax.atom`. -/
|
||
@[inline] def raw {α : Type} (p : m α) (trailing_ws := ff) : parser := do
|
||
start ← left_over,
|
||
p,
|
||
stop ← left_over,
|
||
let stx := mk_raw_res start stop,
|
||
if trailing_ws then with_trailing stx else pure stx
|
||
|
||
instance raw.tokens {α} (p : m α) (t) : parser.has_tokens (raw p t : parser) := default _
|
||
instance raw.view {α} (p : m α) (t) : parser.has_view (option syntax_atom) (raw p t : parser) :=
|
||
{ view := λ stx, match stx with
|
||
| syntax.atom atom := some atom
|
||
| _ := none,
|
||
review := λ a, (syntax.atom <$> a).get_or_else syntax.missing }
|
||
|
||
/-- Like `raw (str s)`, but default to `s` in views. -/
|
||
@[inline, derive has_tokens has_view]
|
||
def raw_str (s : string) (trailing_ws := ff) : parser :=
|
||
raw (str s) trailing_ws
|
||
|
||
instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom) (some {val := s}) :=
|
||
⟨⟩
|
||
|
||
end
|
||
|
||
set_option class.instance_max_depth 200
|
||
|
||
@[derive has_tokens has_view]
|
||
def detail_ident_part.parser : basic_parser_m syntax :=
|
||
node_choice! detail_ident_part {
|
||
escaped: node! detail_ident_part_escaped [
|
||
esc_begin: raw_str id_begin_escape.to_string,
|
||
escaped: raw $ take_until1 is_id_end_escape,
|
||
esc_end: raw_str id_end_escape.to_string,
|
||
],
|
||
default: raw $ satisfy is_id_first *> take_while is_id_rest
|
||
}
|
||
|
||
@[derive has_tokens has_view]
|
||
def detail_ident_suffix.parser : rec_t unit syntax basic_parser_m syntax :=
|
||
-- consume '.' only when followed by a character starting an detail_ident_part
|
||
try (lookahead (ch '.' *> (ch id_begin_escape <|> satisfy is_id_first)))
|
||
*> node! detail_ident_suffix [«.»: raw_str ".", ident: recurse ()]
|
||
|
||
def detail_ident' : rec_t unit syntax basic_parser_m syntax :=
|
||
node! detail_ident [part: monad_lift detail_ident_part.parser, suffix: optional detail_ident_suffix.parser]
|
||
|
||
/-- A parser that gives a more detailed view of `syntax_ident.raw_val`. Not used by default for
|
||
performance reasons. -/
|
||
def detail_ident.parser : basic_parser_m syntax :=
|
||
rec_t.run_parsec detail_ident' $ λ _, detail_ident'
|
||
|
||
private def ident' : basic_parser :=
|
||
do
|
||
start ← left_over,
|
||
s ← id_part,
|
||
n ← foldl name.mk_string (mk_simple_name s) $ do {
|
||
-- consume '.' only when followed by a character starting an detail_ident_part
|
||
try (lookahead (ch '.' *> (ch id_begin_escape <|> satisfy is_id_first))),
|
||
ch '.',
|
||
id_part
|
||
},
|
||
stop ← left_over,
|
||
pure $ syntax.ident {
|
||
info := some {leading := ⟨start, start⟩, pos := start.offset, trailing := ⟨stop, stop⟩},
|
||
raw_val := ⟨start, stop⟩,
|
||
val := n
|
||
}
|
||
|
||
-- the node macro doesn't seem to like these...
|
||
--TODO(Sebastian): these should probably generate better error messages
|
||
def parse_bin_lit : basic_parser_m unit :=
|
||
ch '0' *> (ch 'b' <|> ch 'B') *> many1' (ch '0' <|> ch '1')
|
||
|
||
def parse_oct_lit : basic_parser_m string :=
|
||
ch '0' *> (ch 'o' <|> ch 'O') *> take_while1 (λ c, c ≥ '0' && c < '8')
|
||
|
||
def parse_hex_lit : basic_parser_m string :=
|
||
ch '0' *> (ch 'x' <|> ch 'X') *> take_while1 (λ c, c.is_digit || c.is_alpha)
|
||
|
||
--TODO(Sebastian): other bases
|
||
def number' : basic_parser :=
|
||
node_longest_choice! number {
|
||
base10: raw $ take_while1 char.is_digit,
|
||
base2: raw parse_bin_lit,
|
||
base8: raw parse_oct_lit,
|
||
base16: raw parse_hex_lit,
|
||
}
|
||
|
||
def string_lit' : basic_parser :=
|
||
node! string_lit [val: raw parse_string_literal]
|
||
|
||
private def mk_consume_token (tk : token_config) (it : string.iterator) : basic_parser :=
|
||
let it' := it.nextn tk.prefix.length in
|
||
monad_parsec.lift $ λ _, parsec.result.ok (mk_raw_res it it') it' none
|
||
|
||
def number_or_string_lit : basic_parser :=
|
||
number' <|> string_lit'
|
||
|
||
def token_cont (it : string.iterator) (tk : token_config) : basic_parser :=
|
||
do id ← ident',
|
||
it' ← left_over,
|
||
-- if a token is both a symbol and a valid identifier (i.e. a keyword),
|
||
-- we want it to be recognized as a symbol
|
||
if it.offset + tk.prefix.length ≥ it'.offset then
|
||
mk_consume_token tk it
|
||
else pure id
|
||
|
||
def token : basic_parser :=
|
||
do it ← left_over,
|
||
cache ← get_cache,
|
||
-- NOTE: using `catch` instead of `<|>` so that error messages from the second block are preferred
|
||
catch (do
|
||
-- check token cache
|
||
some tkc ← pure cache.token_cache | failure,
|
||
guard (it.offset = tkc.start_it.offset),
|
||
-- hackishly update parsec position
|
||
monad_parsec.lift (λ it, parsec.result.ok () tkc.stop_it none),
|
||
put_cache {cache with hit := cache.hit + 1},
|
||
pure tkc.tk
|
||
) (λ _, do
|
||
-- cache failed, update cache
|
||
|
||
ident_start ← observing $ lookahead (satisfy is_id_first <|> ch id_begin_escape),
|
||
tk ← match_token,
|
||
tk ← match tk, ident_start with
|
||
| some tk@{suffix_parser := some _, ..}, _ :=
|
||
error "token: not implemented" --str tk *> monad_parsec.lift r
|
||
| some tk, except.ok _ := token_cont it tk
|
||
| some tk, except.error _ := mk_consume_token tk it
|
||
| none, except.ok _ := ident'
|
||
| none, except.error _ := number_or_string_lit,
|
||
tk ← with_trailing tk,
|
||
new_it ← left_over,
|
||
put_cache {cache with token_cache := some ⟨it, new_it, tk⟩, miss := cache.miss + 1},
|
||
pure tk
|
||
)
|
||
|
||
def peek_token : basic_parser_m (except (parsec.message syntax) syntax) :=
|
||
observing (try (lookahead token))
|
||
|
||
variable [monad_basic_parser m]
|
||
|
||
def symbol_core (sym : string) (lbp : nat) (ex : dlist string) : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx@(syntax.atom ⟨_, sym'⟩) ← token | error "" ex it,
|
||
when (sym ≠ sym') $
|
||
error sym' ex it,
|
||
pure stx
|
||
} <?> sym
|
||
|
||
@[inline] def symbol (sym : string) (lbp := 0) : parser :=
|
||
let sym := sym.trim in
|
||
symbol_core sym lbp (dlist.singleton sym)
|
||
|
||
instance symbol.tokens (sym lbp) : parser.has_tokens (symbol sym lbp : parser) :=
|
||
⟨[⟨sym.trim, lbp, none⟩]⟩
|
||
instance symbol.view (sym lbp) : parser.has_view (option syntax_atom) (symbol sym lbp : parser) :=
|
||
{ view := λ stx, match stx with
|
||
| syntax.atom atom := some atom
|
||
| _ := none,
|
||
review := λ a, (syntax.atom <$> a).get_or_else syntax.missing }
|
||
instance symbol.view_default (sym lbp) : parser.has_view_default (symbol sym lbp : parser) _
|
||
(some {info := none, val := sym.trim}) := ⟨⟩
|
||
|
||
def number.parser : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx ← token,
|
||
some _ ← pure $ try_view number stx | error "" (dlist.singleton "number") it,
|
||
pure stx
|
||
} <?> "number"
|
||
|
||
instance number.parser.tokens : parser.has_tokens (number.parser : parser) := default _
|
||
instance number.parser.view : parser.has_view number.view (number.parser : parser) :=
|
||
{..number.has_view}
|
||
|
||
private def to_nat_core (base : nat) : string.iterator → nat → nat → nat
|
||
| it 0 r := r
|
||
| it (i+1) r :=
|
||
let c := it.curr in
|
||
let val := if c.is_digit then
|
||
c.to_nat - '0'.to_nat
|
||
else if c ≥ 'a' ∧ c ≤ 'f' then
|
||
c.to_nat - 'a'.to_nat
|
||
else
|
||
c.to_nat - 'A'.to_nat in
|
||
let r := r*base + val in
|
||
to_nat_core it.next i r
|
||
|
||
private def to_nat_base (s : string) (base : nat) : nat :=
|
||
to_nat_core base s.mk_iterator s.length 0
|
||
|
||
def number.view.to_nat : number.view → nat
|
||
| (number.view.base10 (some atom)) := atom.val.to_nat
|
||
| (number.view.base2 (some atom)) := to_nat_base atom.val 2
|
||
| (number.view.base8 (some atom)) := to_nat_base atom.val 8
|
||
| (number.view.base16 (some atom)) := to_nat_base atom.val 16
|
||
| _ := 1138 -- should never happen, but let's still choose a grep-able number
|
||
|
||
def number.view.of_nat (n : nat) : number.view :=
|
||
number.view.base10 (some {val := to_string n})
|
||
|
||
def string_lit.parser : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx ← token,
|
||
some _ ← pure $ try_view string_lit stx | error "" (dlist.singleton "string") it,
|
||
pure stx
|
||
} <?> "string"
|
||
|
||
instance string_lit.parser.tokens : parser.has_tokens (string_lit.parser : parser) := default _
|
||
instance string_lit.parser.view : parser.has_view string_lit.view (string_lit.parser : parser) :=
|
||
{..string_lit.has_view}
|
||
|
||
def string_lit.view.value (lit : string_lit.view) : option string := do
|
||
atom ← lit.val,
|
||
except.ok s ← pure $ parsec.parse (parse_string_literal : parsec' _) atom.val
|
||
| failure,
|
||
pure s
|
||
|
||
def ident.parser : parser :=
|
||
lift $ try $ do {
|
||
it ← left_over,
|
||
stx@(syntax.ident _) ← token | error "" (dlist.singleton "identifier") it,
|
||
pure stx
|
||
} <?> "identifier"
|
||
|
||
instance ident.parser.tokens : parser.has_tokens (ident.parser : parser) := default _
|
||
instance ident.parser.view : parser.has_view syntax_ident (ident.parser : parser) :=
|
||
{ view := λ stx, match stx with
|
||
| syntax.ident id := id
|
||
| _ := {raw_val := substring.of_string "NOT_AN_IDENT", val := `NOT_AN_IDENT},
|
||
review := syntax.ident }
|
||
|
||
/-- Read identifier without consulting the token table. -/
|
||
def raw_ident.parser : parser :=
|
||
lift $ ident' >>= with_trailing
|
||
|
||
instance raw_ident.parser.tokens : parser.has_tokens (raw_ident.parser : parser) := default _
|
||
instance raw_ident.parser.view : parser.has_view syntax_ident (raw_ident.parser : parser) :=
|
||
{..(ident.parser.view : has_view _ (_ : parser))}
|
||
|
||
/-- Check if the following token is the symbol _or_ identifier `sym`. Useful for
|
||
parsing local tokens that have not been added to the token table (but may have
|
||
been so by some unrelated code).
|
||
|
||
For example, the universe `max` function is parsed using this combinator so that
|
||
it can still be used as an identifier outside of universes (but registering it
|
||
as a token in a term syntax would not break the universe parser). -/
|
||
def symbol_or_ident (sym : string) : parser :=
|
||
lift $ try $ do
|
||
it ← left_over,
|
||
stx ← token,
|
||
let sym' := match stx with
|
||
| syntax.atom ⟨_, sym'⟩ := some sym'
|
||
| syntax.ident id := some id.raw_val.to_string
|
||
| _ := none,
|
||
when (sym' ≠ some sym) $
|
||
error "" (dlist.singleton (repr sym)) it,
|
||
pure stx
|
||
|
||
instance symbol_or_ident.tokens (sym) : parser.has_tokens (symbol_or_ident sym : parser) :=
|
||
default _
|
||
instance symbol_or_ident.view (sym) : parser.has_view syntax (symbol_or_ident sym : parser) := default _
|
||
|
||
/-- A unicode symbol with an ASCII fallback -/
|
||
@[derive has_tokens has_view]
|
||
def unicode_symbol (unicode ascii : string) (lbp := 0) : parser :=
|
||
lift $ any_of [symbol unicode lbp, symbol ascii lbp]
|
||
-- use unicode variant by default
|
||
instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _ (syntax.atom ⟨none, u⟩) := ⟨⟩
|
||
|
||
def indexed {α : Type} (map : token_map α) : m (list α) :=
|
||
lift $ do
|
||
except.ok tk ← peek_token | error "",
|
||
n ← match tk with
|
||
| syntax.atom ⟨_, s⟩ := pure $ mk_simple_name s
|
||
| syntax.ident _ := pure `ident
|
||
| syntax.raw_node n := pure n.kind.name
|
||
| _ := error "",
|
||
option.to_monad $ map.find n
|
||
|
||
end «parser»
|
||
end lean
|