chore(library/init/lean/parser): add a few comments

This commit is contained in:
Leonardo de Moura 2018-10-11 15:54:57 -07:00
parent aff6d58659
commit 15a25d5aa9
6 changed files with 97 additions and 56 deletions

View file

@ -35,7 +35,12 @@ structure token_config :=
/- An optional parser that is activated after matching `prefix`.
It should return a syntax tree with a "hole" for the
`source_info` surrounding the token, which will be supplied
by the `token` parser. -/
by the `token` parser.
Remark: `suffix_parser` has many applications for example for parsing
hexdecimal numbers, `prefix` is `0x` and `suffix_parser` is the parser `digit*`.
We also use it to parse string literals: here `prefix` is just `"`.
-/
(suffix_parser : option (parsec' (source_info → syntax)) := none)
-- Backtrackable state
@ -50,6 +55,7 @@ structure token_cache_entry :=
structure parser_cache :=
(token_cache : option token_cache_entry := none)
/- Remark: if we have a node in the trie with `some token_config`, the string induced by the path is equal to the `token_config.prefix`. -/
structure parser_config :=
(tokens : trie token_config)
(filename : string)
@ -80,12 +86,8 @@ variable {ρ : Type}
class has_tokens (r : ρ) := mk {} ::
(tokens : list token_config)
def donotinline {α : Type} (a : α) (f : αα := id) :=
f (f a)
-- do NOT inline this function
def tokens (r : ρ) [has_tokens r] :=
donotinline (has_tokens.tokens r)
@[noinline] def tokens (r : ρ) [has_tokens r] :=
has_tokens.tokens r
instance has_tokens.inhabited (r : ρ) : inhabited (has_tokens r) :=
⟨⟨[]⟩⟩
@ -176,6 +178,7 @@ instance command_parser_m.basic_parser (ρ : Type) [has_lift_t ρ parser_config]
⟨λ _ x cfg rec, x.run ↑cfg⟩
end
/- The `nat` at `rec_t` is the lbp` -/
@[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec monad_basic_parser]
def term_parser_m := rec_t nat syntax $ command_parser_m parser_config
abbreviation term_parser := term_parser_m syntax
@ -189,6 +192,7 @@ instance trailing_term_parser_coe : has_coe term_parser trailing_term_parser :=
⟨λ x _, x⟩
-- This needs to be a separate structure since `term_parser`s cannot contain themselves in their config
-- TODO: we need indexing here.
structure command_parser_config extends parser_config :=
(leading_term_parsers : list term_parser)
(trailing_term_parsers : list trailing_term_parser)

View file

@ -20,18 +20,18 @@ local notation `parser` := m syntax
variables [monad m] [monad_except (parsec.message syntax) m] [monad_parsec syntax m] [alternative m]
def node' (k : option syntax_node_kind) (rs : list parser) : parser :=
do (args, _) ← rs.mfoldl (λ (p : list syntax × nat) r, do
(args, remaining) ← pure p,
do args ← rs.mfoldl (λ (p : list syntax) r, do
args ← pure p,
-- on error, append partial syntax tree to previous successful parses and rethrow
a ← catch r $ λ msg,
let args := msg.custom :: args in
throw {msg with custom := syntax.node ⟨k, args.reverse⟩},
pure (a::args, remaining - 1)
) ([], rs.length),
pure (a::args)
) [],
pure $ syntax.node ⟨k, args.reverse⟩
@[reducible] def seq : list parser → parser := node' none
@[reducible] def node (k : syntax_node_kind) : list parser → parser := node' k
@[reducible] def node (k : syntax_node_kind) : list parser → parser := node' (some k)
instance node'.tokens (k) (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (node' k rs) :=
⟨tokens rs⟩
@ -39,6 +39,8 @@ instance node'.tokens (k) (rs : list parser) [parser.has_tokens rs] : parser.has
instance node.view (k) (rs : list parser) [i : has_view k α] : parser.has_view (node k rs) α :=
{ view := i.view, review := i.review }
-- Each parser combinator comes equipped with `has_view` and `has_tokens` instances
private def many1_aux (p : parser) : list syntax → nat → parser
| as 0 := error "unreachable"
| as (n+1) := do
@ -66,6 +68,7 @@ instance many.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (man
⟨tokens r⟩
instance many.view (r : parser) [has_view r α] : parser.has_view (many r) (list α) :=
/- Remark: `many1.view` can handle empty list. -/
{..many1.view r}
private def sep_by_aux (p : m syntax) (sep : parser) (allow_trailing_sep : bool) : bool → list syntax → nat → parser
@ -164,15 +167,22 @@ instance longest_match.view (rs : list parser) : parser.has_view (longest_match
/-- Parse a list `[p1, ..., pn]` of parsers as `p1 <|> ... <|> pn`.
The result will be wrapped in a node with the the index of the successful
parser as the name. -/
def choice_aux : list parser → nat → parser
| [] _ := error "choice: empty list"
| (r::rs) i :=
do { stx ← r,
pure $ syntax.node ⟨some ⟨name.mk_numeral name.anonymous i⟩, [stx]⟩ }
<|> choice_aux rs (i+1)
def choice (rs : list parser) : parser :=
rs.enum.foldr
(λ ⟨i, r⟩ r', (λ stx, syntax.node ⟨some ⟨name.mk_numeral name.anonymous i⟩, [stx]⟩) <$> r <|> r')
-- use `foldr` so that any other error is preferred over this one
(error "choice: empty list")
choice_aux rs 0
instance choice.tokens (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (choice rs) :=
⟨tokens rs⟩
/- Remark: `choice` does not have `has_view` instance because we only use it at the pratt combinator
which doesn't need the view. -/
instance try.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (try r) :=
⟨tokens r⟩
instance try.view (r : parser) [i : parser.has_view r α] : parser.has_view (try r) α :=

View file

@ -256,10 +256,10 @@ namespace monad_parsec
open parsec_t
variables {m : Type → Type} [monad m] [monad_parsec μ m] [inhabited μ] {α β : Type}
@[inline] def error {α : Type} (unexpected : string) (expected : dlist string := dlist.empty) (it : option iterator := none) (custom : μ := default _) : m α :=
def error {α : Type} (unexpected : string) (expected : dlist string := dlist.empty) (it : option iterator := none) (custom : μ := default _) : m α :=
lift $ λ it', result.error { unexpected := unexpected, expected := expected, it := it.get_or_else it', custom := custom } ff
@[inline] def left_over : m iterator :=
def left_over : m iterator :=
lift $ λ it, result.mk_eps it it
/-- Return the number of characters left to be parsed. -/
@ -310,7 +310,7 @@ do it ← left_over,
if p c then error (repr c)
else pure ()
@[inline] def eoi_error (it : iterator) : result μ α :=
def eoi_error (it : iterator) : result μ α :=
result.error { it := it, unexpected := "end of input", custom := default _ } ff
def curr : m char :=
@ -360,7 +360,7 @@ This parser consumes no input if it fails (even if a partial match).
Note: The behaviour of this parser is different to that the `string` parser in the Parsec_t Μ M Haskell library,
as this one is all-or-nothing.
-/
@[inline] def str (s : string) : m string :=
def str (s : string) : m string :=
if s.is_empty then pure ""
else lift $ λ it, match str_aux s.length s.mk_iterator it with
| some it' := result.ok s it' none
@ -377,7 +377,7 @@ def take (n : nat) : m string :=
if n = 0 then pure ""
else lift $ take_aux n ""
@[inline] private def mk_string_result (r : string) (it : iterator) : result μ string :=
private def mk_string_result (r : string) (it : iterator) : result μ string :=
if r.is_empty then result.mk_eps r it
else result.ok r it none
@ -415,7 +415,7 @@ take_while (λ c, !p c)
def take_until1 (p : char → bool) : m string :=
take_while1 (λ c, !p c)
@[inline] private def mk_consumed_result (consumed : bool) (it : iterator) : result μ unit :=
private def mk_consumed_result (consumed : bool) (it : iterator) : result μ unit :=
if consumed then result.ok () it none
else result.mk_eps () it

View file

@ -9,10 +9,6 @@ import init.lean.name init.lean.parser.parsec
namespace lean
namespace parser
/-- De Bruijn index relative to surrounding 'bind' macros -/
@[reducible] def var_offset :=
@[reducible] def macro_scope_id :=
--TODO(Sebastian): move
structure substring :=
(start : string.iterator)
@ -36,11 +32,18 @@ structure syntax_node_kind :=
@[pattern] def choice : syntax_node_kind := ⟨`lean.parser.choice⟩
/-
Parsers create `syntax_node`'s with the following property:
- If `args` contains a `syntax.missing`, then all subsequent elements are also `syntax.missing`.
- We believe the first argument in `args` is not `syntax.missing`. TODO: check this.
Remark: We do create `syntax_node`'s with an empty `args` field.
-/
structure syntax_node (syntax : Type) :=
-- TODO: add `lean.parser.list` kind, and remove option. Then `none` = `lean.parser.seq`
(kind : option syntax_node_kind) (args : list syntax)
inductive syntax
/- any non-ident atom -/
| atom (val : syntax_atom)
| node (val : syntax_node syntax)
| missing
@ -48,9 +51,6 @@ inductive syntax
instance : inhabited syntax :=
⟨syntax.missing⟩
instance coe_string_syntax : has_coe string syntax :=
⟨λ s, syntax.atom ⟨none, s⟩⟩
def substring.to_string (s : substring) : string :=
(s.start.extract s.stop).get_or_else ""
@ -61,6 +61,9 @@ def is_of_kind (k : syntax_node_kind) : syntax → bool
| (syntax.node ⟨some k', _⟩) := k.name = k'.name
| _ := ff
-- Remark: this function must be updated whenever `ident` parser is modified.
-- This function was defined before we had the `ident` parser.
-- TODO: move it to the `ident` parser file and use the view defined there.
private def ident_to_format : syntax → format
| stx := option.get_or_else (do
syntax.node ⟨_, [syntax.node ⟨_, [syntax.node ⟨some ⟨idx⟩, part⟩]⟩, suffix]⟩ ← pure stx | failure,
@ -82,12 +85,15 @@ with to_format : syntax → format
| stx@(node {kind := some kind, args := args, ..}) :=
if kind.name = `lean.parser.ident
then to_fmt "`" ++ ident_to_format stx
else paren $ join_sep (to_fmt (kind.name.replace_prefix `lean.parser name.anonymous) :: to_format_lst args) line
else let shorter_name := kind.name.replace_prefix `lean.parser name.anonymous
in paren $ join_sep (to_fmt shorter_name :: to_format_lst args) line
| missing := "<missing>"
with to_format_lst : list syntax → list format
| [] := []
| (s::ss) := to_format s :: to_format_lst ss
/- Remark: the state `string.iterator` is the `source_info.trailing.stop` of the previous token,
or the beginning of the string. -/
private mutual def update_leading_aux, update_leading_lst
with update_leading_aux : syntax → state string.iterator syntax
| (atom a@{info := some info, ..}) := do
@ -103,14 +109,27 @@ with update_leading_lst : list syntax → state string.iterator (list syntax)
/-- Set `source_info.leading` according to the trailing stop of the preceding token.
The result is a round-tripping syntax tree IF, in the input syntax tree,
* all leading stops, atom contents, and trailing starts are correct
* trailing stops are between the trailing start and the next leading stop. -/
* trailing stops are between the trailing start and the next leading stop.
Remark: after parsing all `source_info.leading` fields are empty.
The syntax argument is the output produced by the parser for `source`.
This function "fixes" the `source.leanding` field.
Note that, the `source_info.trailing` fields are correct.
The implementation of this function relies on this property. -/
def update_leading (source : string) : syntax → syntax :=
λ stx, prod.fst $ (update_leading_aux stx).run source.mk_iterator
/-- Retrieve the left-most leaf in the syntax tree -/
def is_empty_node : syntax → bool
| (node ⟨_, []⟩) := tt
| _ := ff
/-- Retrieve the left-most leaf in the syntax tree. -/
def get_head_atom : syntax → option syntax_atom
| (atom a) := some a
| (node ⟨_, n::_⟩) := n.get_head_atom
-- TODO: handle case where `n` is an empty `syntax_node`
-- We will have to create a mutual recursion here Arghhhh
| (node ⟨_, n::ns⟩) := n.get_head_atom
| _ := none
def get_pos (stx : syntax) : option parsec.position :=
@ -118,13 +137,13 @@ do a ← stx.get_head_atom,
i ← a.info,
pure i.pos
def reprint_with_info : option source_info → string → string
| (some info) inner := info.leading.to_string ++ inner ++ info.trailing.to_string
| none inner := inner
def reprint_atom : syntax_atom → string
| ⟨some info, s⟩ := info.leading.to_string ++ s ++ info.trailing.to_string
| ⟨none, s⟩ := s
mutual def reprint, reprint_lst
with reprint : syntax → option string
| (atom ⟨info, s⟩) := reprint_with_info info s
| (atom a) := reprint_atom a
| (node ⟨some k, ns⟩) :=
if k.name = choice.name then match ns with
-- should never happen

View file

@ -26,10 +26,12 @@ do cfg ← read,
private def finish_comment_block_aux : nat → nat → basic_parser_m unit
| nesting (n+1) :=
str "/-" *> finish_comment_block_aux (nesting + 1) n <|>
str "/-" *> finish_comment_block_aux (nesting + 1) n
<|>
str "-/" *>
(if nesting = 1 then pure ()
else finish_comment_block_aux (nesting - 1) n) <|>
if nesting = 1 then pure ()
else finish_comment_block_aux (nesting - 1) n
<|>
any *> finish_comment_block_aux nesting n
| _ _ := error "unreachable"
@ -40,9 +42,11 @@ do r ← remaining,
private def whitespace_aux : nat → basic_parser_m unit
| (n+1) :=
do whitespace,
str "--" *> take_while' (≠ '\n') *> whitespace_aux n <|>
str "--" *> take_while' (≠ '\n') *> whitespace_aux n
<|>
-- a "/--" doc comment is an actual token, not whitespace
try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n <|>
try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n
<|>
pure ()
| 0 := error "unreachable"
@ -78,18 +82,23 @@ with update_trailing_lst : substring → list syntax → list syntax
| trailing [stx] := [update_trailing trailing stx]
| trailing (stx::stxs) := stx :: update_trailing_lst trailing stxs
@[inline] def with_trailing (stx : syntax) : m syntax :=
def with_trailing (stx : syntax) : m syntax :=
do -- TODO(Sebastian): less greedy, more natural whitespace assignment
-- E.g. only read up to the next line break
trailing ← lift $ as_substring $ whitespace,
pure $ update_trailing trailing stx
def mk_raw_res (start stop : string.iterator) : syntax :=
let ss : substring := ⟨start, stop⟩ in
syntax.atom ⟨some {leading := ⟨start, start⟩, pos := start.offset, trailing := ⟨stop, stop⟩}, ss.to_string⟩
/-- Match an arbitrary parser and return the consumed string in a `syntax.atom`. -/
@[inline] def raw {α : Type} (p : m α) (trailing_ws := ff) : parser :=
try $ do
it ← left_over,
ss ← as_substring p,
let stx := syntax.atom ⟨some {leading := ⟨it, it⟩, pos := it.offset, trailing := ⟨it, it⟩}, ss.to_string⟩,
start ← left_over,
p,
stop ← left_over,
let stx := mk_raw_res start stop,
if trailing_ws then with_trailing stx else pure stx
instance raw.tokens {α} (p : m α) (t) : parser.has_tokens (raw p t : parser) := default _
@ -100,12 +109,12 @@ instance raw.view {α} (p : m α) (t) : parser.has_view (raw p t : parser) (opti
review := λ a, (syntax.atom <$> a).get_or_else syntax.missing }
/-- Like `raw (str s)`, but default to `s` in views. -/
@[inline, derive has_tokens has_view]
@[derive has_tokens has_view]
def raw_str (s : string) (trailing_ws := ff) : parser :=
raw (str s) trailing_ws
instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom)
(some {val := s}) := ⟨⟩
instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom) (some {val := s}) :=
⟨⟩
end
@ -119,14 +128,14 @@ node_choice! ident_part {
escaped: raw $ take_until1 is_id_end_escape,
esc_end: raw_str id_end_escape.to_string,
],
default: lookahead (satisfy is_id_first) *> raw (take_while is_id_rest)
default: raw $ satisfy is_id_first *> take_while is_id_rest
}
@[derive has_tokens has_view]
def ident_suffix.parser : rec_t unit syntax basic_parser_m syntax :=
-- consume '.' only when followed by a character starting an ident_part
try (lookahead (ch '.' *> (ch id_begin_escape *> pure () <|> satisfy is_id_first *> pure ()))) *>
node! ident_suffix [«.»: raw_str ".", ident: recurse ()]
try (lookahead (ch '.' *> (ch id_begin_escape <|> satisfy is_id_first)))
*> node! ident_suffix [«.»: raw_str ".", ident: recurse ()]
def ident'' : rec_t unit syntax basic_parser_m syntax :=
node! ident [part: monad_lift ident_part.parser, suffix: optional ident_suffix.parser]
@ -277,8 +286,7 @@ instance symbol_or_ident.view (sym) : parser.has_view (symbol_or_ident sym : par
def unicode_symbol (unicode ascii : string) (lbp := 0) : parser :=
lift $ any_of [symbol unicode lbp, symbol ascii lbp]
-- use unicode variant by default
instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _
(u : syntax) := ⟨⟩
instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _ (syntax.atom ⟨none, u⟩) := ⟨⟩
end «parser»
end lean

View file

@ -29,7 +29,7 @@ csimp_cfg::csimp_cfg() {
m_inline_threshold = 1;
m_float_cases_app = true;
m_float_cases = true;
m_float_cases_threshold = 40;
m_float_cases_threshold = 10;
m_inline_jp_threshold = 2;
}