From 15a25d5aa9d2fe3ab89dbdc7df7cd9ea53fc0704 Mon Sep 17 00:00:00 2001 From: Leonardo de Moura Date: Thu, 11 Oct 2018 15:54:57 -0700 Subject: [PATCH] chore(library/init/lean/parser): add a few comments --- library/init/lean/parser/basic.lean | 18 ++++---- library/init/lean/parser/combinators.lean | 28 +++++++++---- library/init/lean/parser/parsec.lean | 12 +++--- library/init/lean/parser/syntax.lean | 51 ++++++++++++++++------- library/init/lean/parser/token.lean | 42 +++++++++++-------- src/library/compiler/csimp.cpp | 2 +- 6 files changed, 97 insertions(+), 56 deletions(-) diff --git a/library/init/lean/parser/basic.lean b/library/init/lean/parser/basic.lean index 67cf4305bc..7fac098191 100644 --- a/library/init/lean/parser/basic.lean +++ b/library/init/lean/parser/basic.lean @@ -35,7 +35,12 @@ structure token_config := /- An optional parser that is activated after matching `prefix`. It should return a syntax tree with a "hole" for the `source_info` surrounding the token, which will be supplied - by the `token` parser. -/ + by the `token` parser. + + Remark: `suffix_parser` has many applications for example for parsing + hexdecimal numbers, `prefix` is `0x` and `suffix_parser` is the parser `digit*`. + We also use it to parse string literals: here `prefix` is just `"`. +-/ (suffix_parser : option (parsec' (source_info → syntax)) := none) -- Backtrackable state @@ -50,6 +55,7 @@ structure token_cache_entry := structure parser_cache := (token_cache : option token_cache_entry := none) +/- Remark: if we have a node in the trie with `some token_config`, the string induced by the path is equal to the `token_config.prefix`. -/ structure parser_config := (tokens : trie token_config) (filename : string) @@ -80,12 +86,8 @@ variable {ρ : Type} class has_tokens (r : ρ) := mk {} :: (tokens : list token_config) -def donotinline {α : Type} (a : α) (f : α → α := id) := -f (f a) - --- do NOT inline this function -def tokens (r : ρ) [has_tokens r] := -donotinline (has_tokens.tokens r) +@[noinline] def tokens (r : ρ) [has_tokens r] := +has_tokens.tokens r instance has_tokens.inhabited (r : ρ) : inhabited (has_tokens r) := ⟨⟨[]⟩⟩ @@ -176,6 +178,7 @@ instance command_parser_m.basic_parser (ρ : Type) [has_lift_t ρ parser_config] ⟨λ _ x cfg rec, x.run ↑cfg⟩ end +/- The `nat` at `rec_t` is the lbp` -/ @[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec monad_basic_parser] def term_parser_m := rec_t nat syntax $ command_parser_m parser_config abbreviation term_parser := term_parser_m syntax @@ -189,6 +192,7 @@ instance trailing_term_parser_coe : has_coe term_parser trailing_term_parser := ⟨λ x _, x⟩ -- This needs to be a separate structure since `term_parser`s cannot contain themselves in their config +-- TODO: we need indexing here. structure command_parser_config extends parser_config := (leading_term_parsers : list term_parser) (trailing_term_parsers : list trailing_term_parser) diff --git a/library/init/lean/parser/combinators.lean b/library/init/lean/parser/combinators.lean index 2368327fb5..e1e7847c90 100644 --- a/library/init/lean/parser/combinators.lean +++ b/library/init/lean/parser/combinators.lean @@ -20,18 +20,18 @@ local notation `parser` := m syntax variables [monad m] [monad_except (parsec.message syntax) m] [monad_parsec syntax m] [alternative m] def node' (k : option syntax_node_kind) (rs : list parser) : parser := -do (args, _) ← rs.mfoldl (λ (p : list syntax × nat) r, do - (args, remaining) ← pure p, +do args ← rs.mfoldl (λ (p : list syntax) r, do + args ← pure p, -- on error, append partial syntax tree to previous successful parses and rethrow a ← catch r $ λ msg, let args := msg.custom :: args in throw {msg with custom := syntax.node ⟨k, args.reverse⟩}, - pure (a::args, remaining - 1) - ) ([], rs.length), + pure (a::args) + ) [], pure $ syntax.node ⟨k, args.reverse⟩ @[reducible] def seq : list parser → parser := node' none -@[reducible] def node (k : syntax_node_kind) : list parser → parser := node' k +@[reducible] def node (k : syntax_node_kind) : list parser → parser := node' (some k) instance node'.tokens (k) (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (node' k rs) := ⟨tokens rs⟩ @@ -39,6 +39,8 @@ instance node'.tokens (k) (rs : list parser) [parser.has_tokens rs] : parser.has instance node.view (k) (rs : list parser) [i : has_view k α] : parser.has_view (node k rs) α := { view := i.view, review := i.review } +-- Each parser combinator comes equipped with `has_view` and `has_tokens` instances + private def many1_aux (p : parser) : list syntax → nat → parser | as 0 := error "unreachable" | as (n+1) := do @@ -66,6 +68,7 @@ instance many.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (man ⟨tokens r⟩ instance many.view (r : parser) [has_view r α] : parser.has_view (many r) (list α) := +/- Remark: `many1.view` can handle empty list. -/ {..many1.view r} private def sep_by_aux (p : m syntax) (sep : parser) (allow_trailing_sep : bool) : bool → list syntax → nat → parser @@ -164,15 +167,22 @@ instance longest_match.view (rs : list parser) : parser.has_view (longest_match /-- Parse a list `[p1, ..., pn]` of parsers as `p1 <|> ... <|> pn`. The result will be wrapped in a node with the the index of the successful parser as the name. -/ +def choice_aux : list parser → nat → parser +| [] _ := error "choice: empty list" +| (r::rs) i := + do { stx ← r, + pure $ syntax.node ⟨some ⟨name.mk_numeral name.anonymous i⟩, [stx]⟩ } + <|> choice_aux rs (i+1) + def choice (rs : list parser) : parser := -rs.enum.foldr - (λ ⟨i, r⟩ r', (λ stx, syntax.node ⟨some ⟨name.mk_numeral name.anonymous i⟩, [stx]⟩) <$> r <|> r') - -- use `foldr` so that any other error is preferred over this one - (error "choice: empty list") +choice_aux rs 0 instance choice.tokens (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (choice rs) := ⟨tokens rs⟩ +/- Remark: `choice` does not have `has_view` instance because we only use it at the pratt combinator + which doesn't need the view. -/ + instance try.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (try r) := ⟨tokens r⟩ instance try.view (r : parser) [i : parser.has_view r α] : parser.has_view (try r) α := diff --git a/library/init/lean/parser/parsec.lean b/library/init/lean/parser/parsec.lean index 6e721f83e2..8ab979b37d 100644 --- a/library/init/lean/parser/parsec.lean +++ b/library/init/lean/parser/parsec.lean @@ -256,10 +256,10 @@ namespace monad_parsec open parsec_t variables {m : Type → Type} [monad m] [monad_parsec μ m] [inhabited μ] {α β : Type} -@[inline] def error {α : Type} (unexpected : string) (expected : dlist string := dlist.empty) (it : option iterator := none) (custom : μ := default _) : m α := +def error {α : Type} (unexpected : string) (expected : dlist string := dlist.empty) (it : option iterator := none) (custom : μ := default _) : m α := lift $ λ it', result.error { unexpected := unexpected, expected := expected, it := it.get_or_else it', custom := custom } ff -@[inline] def left_over : m iterator := +def left_over : m iterator := lift $ λ it, result.mk_eps it it /-- Return the number of characters left to be parsed. -/ @@ -310,7 +310,7 @@ do it ← left_over, if p c then error (repr c) else pure () -@[inline] def eoi_error (it : iterator) : result μ α := +def eoi_error (it : iterator) : result μ α := result.error { it := it, unexpected := "end of input", custom := default _ } ff def curr : m char := @@ -360,7 +360,7 @@ This parser consumes no input if it fails (even if a partial match). Note: The behaviour of this parser is different to that the `string` parser in the Parsec_t Μ M Haskell library, as this one is all-or-nothing. -/ -@[inline] def str (s : string) : m string := +def str (s : string) : m string := if s.is_empty then pure "" else lift $ λ it, match str_aux s.length s.mk_iterator it with | some it' := result.ok s it' none @@ -377,7 +377,7 @@ def take (n : nat) : m string := if n = 0 then pure "" else lift $ take_aux n "" -@[inline] private def mk_string_result (r : string) (it : iterator) : result μ string := +private def mk_string_result (r : string) (it : iterator) : result μ string := if r.is_empty then result.mk_eps r it else result.ok r it none @@ -415,7 +415,7 @@ take_while (λ c, !p c) def take_until1 (p : char → bool) : m string := take_while1 (λ c, !p c) -@[inline] private def mk_consumed_result (consumed : bool) (it : iterator) : result μ unit := +private def mk_consumed_result (consumed : bool) (it : iterator) : result μ unit := if consumed then result.ok () it none else result.mk_eps () it diff --git a/library/init/lean/parser/syntax.lean b/library/init/lean/parser/syntax.lean index 92e0c7b63f..dac3ae8538 100644 --- a/library/init/lean/parser/syntax.lean +++ b/library/init/lean/parser/syntax.lean @@ -9,10 +9,6 @@ import init.lean.name init.lean.parser.parsec namespace lean namespace parser -/-- De Bruijn index relative to surrounding 'bind' macros -/ -@[reducible] def var_offset := ℕ -@[reducible] def macro_scope_id := ℕ - --TODO(Sebastian): move structure substring := (start : string.iterator) @@ -36,11 +32,18 @@ structure syntax_node_kind := @[pattern] def choice : syntax_node_kind := ⟨`lean.parser.choice⟩ +/- +Parsers create `syntax_node`'s with the following property: +- If `args` contains a `syntax.missing`, then all subsequent elements are also `syntax.missing`. +- We believe the first argument in `args` is not `syntax.missing`. TODO: check this. + +Remark: We do create `syntax_node`'s with an empty `args` field. +-/ structure syntax_node (syntax : Type) := +-- TODO: add `lean.parser.list` kind, and remove option. Then `none` = `lean.parser.seq` (kind : option syntax_node_kind) (args : list syntax) inductive syntax -/- any non-ident atom -/ | atom (val : syntax_atom) | node (val : syntax_node syntax) | missing @@ -48,9 +51,6 @@ inductive syntax instance : inhabited syntax := ⟨syntax.missing⟩ -instance coe_string_syntax : has_coe string syntax := -⟨λ s, syntax.atom ⟨none, s⟩⟩ - def substring.to_string (s : substring) : string := (s.start.extract s.stop).get_or_else "" @@ -61,6 +61,9 @@ def is_of_kind (k : syntax_node_kind) : syntax → bool | (syntax.node ⟨some k', _⟩) := k.name = k'.name | _ := ff +-- Remark: this function must be updated whenever `ident` parser is modified. +-- This function was defined before we had the `ident` parser. +-- TODO: move it to the `ident` parser file and use the view defined there. private def ident_to_format : syntax → format | stx := option.get_or_else (do syntax.node ⟨_, [syntax.node ⟨_, [syntax.node ⟨some ⟨idx⟩, part⟩]⟩, suffix]⟩ ← pure stx | failure, @@ -82,12 +85,15 @@ with to_format : syntax → format | stx@(node {kind := some kind, args := args, ..}) := if kind.name = `lean.parser.ident then to_fmt "`" ++ ident_to_format stx - else paren $ join_sep (to_fmt (kind.name.replace_prefix `lean.parser name.anonymous) :: to_format_lst args) line + else let shorter_name := kind.name.replace_prefix `lean.parser name.anonymous + in paren $ join_sep (to_fmt shorter_name :: to_format_lst args) line | missing := "" with to_format_lst : list syntax → list format | [] := [] | (s::ss) := to_format s :: to_format_lst ss +/- Remark: the state `string.iterator` is the `source_info.trailing.stop` of the previous token, + or the beginning of the string. -/ private mutual def update_leading_aux, update_leading_lst with update_leading_aux : syntax → state string.iterator syntax | (atom a@{info := some info, ..}) := do @@ -103,14 +109,27 @@ with update_leading_lst : list syntax → state string.iterator (list syntax) /-- Set `source_info.leading` according to the trailing stop of the preceding token. The result is a round-tripping syntax tree IF, in the input syntax tree, * all leading stops, atom contents, and trailing starts are correct - * trailing stops are between the trailing start and the next leading stop. -/ + * trailing stops are between the trailing start and the next leading stop. + + Remark: after parsing all `source_info.leading` fields are empty. + The syntax argument is the output produced by the parser for `source`. + This function "fixes" the `source.leanding` field. + + Note that, the `source_info.trailing` fields are correct. + The implementation of this function relies on this property. -/ def update_leading (source : string) : syntax → syntax := λ stx, prod.fst $ (update_leading_aux stx).run source.mk_iterator -/-- Retrieve the left-most leaf in the syntax tree -/ +def is_empty_node : syntax → bool +| (node ⟨_, []⟩) := tt +| _ := ff + +/-- Retrieve the left-most leaf in the syntax tree. -/ def get_head_atom : syntax → option syntax_atom | (atom a) := some a -| (node ⟨_, n::_⟩) := n.get_head_atom +-- TODO: handle case where `n` is an empty `syntax_node` +-- We will have to create a mutual recursion here Arghhhh +| (node ⟨_, n::ns⟩) := n.get_head_atom | _ := none def get_pos (stx : syntax) : option parsec.position := @@ -118,13 +137,13 @@ do a ← stx.get_head_atom, i ← a.info, pure i.pos -def reprint_with_info : option source_info → string → string -| (some info) inner := info.leading.to_string ++ inner ++ info.trailing.to_string -| none inner := inner +def reprint_atom : syntax_atom → string +| ⟨some info, s⟩ := info.leading.to_string ++ s ++ info.trailing.to_string +| ⟨none, s⟩ := s mutual def reprint, reprint_lst with reprint : syntax → option string -| (atom ⟨info, s⟩) := reprint_with_info info s +| (atom a) := reprint_atom a | (node ⟨some k, ns⟩) := if k.name = choice.name then match ns with -- should never happen diff --git a/library/init/lean/parser/token.lean b/library/init/lean/parser/token.lean index 36b51cb5b8..1d6500431c 100644 --- a/library/init/lean/parser/token.lean +++ b/library/init/lean/parser/token.lean @@ -26,10 +26,12 @@ do cfg ← read, private def finish_comment_block_aux : nat → nat → basic_parser_m unit | nesting (n+1) := - str "/-" *> finish_comment_block_aux (nesting + 1) n <|> + str "/-" *> finish_comment_block_aux (nesting + 1) n + <|> str "-/" *> - (if nesting = 1 then pure () - else finish_comment_block_aux (nesting - 1) n) <|> + if nesting = 1 then pure () + else finish_comment_block_aux (nesting - 1) n + <|> any *> finish_comment_block_aux nesting n | _ _ := error "unreachable" @@ -40,9 +42,11 @@ do r ← remaining, private def whitespace_aux : nat → basic_parser_m unit | (n+1) := do whitespace, - str "--" *> take_while' (≠ '\n') *> whitespace_aux n <|> + str "--" *> take_while' (≠ '\n') *> whitespace_aux n + <|> -- a "/--" doc comment is an actual token, not whitespace - try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n <|> + try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n + <|> pure () | 0 := error "unreachable" @@ -78,18 +82,23 @@ with update_trailing_lst : substring → list syntax → list syntax | trailing [stx] := [update_trailing trailing stx] | trailing (stx::stxs) := stx :: update_trailing_lst trailing stxs -@[inline] def with_trailing (stx : syntax) : m syntax := +def with_trailing (stx : syntax) : m syntax := do -- TODO(Sebastian): less greedy, more natural whitespace assignment -- E.g. only read up to the next line break trailing ← lift $ as_substring $ whitespace, pure $ update_trailing trailing stx +def mk_raw_res (start stop : string.iterator) : syntax := +let ss : substring := ⟨start, stop⟩ in +syntax.atom ⟨some {leading := ⟨start, start⟩, pos := start.offset, trailing := ⟨stop, stop⟩}, ss.to_string⟩ + /-- Match an arbitrary parser and return the consumed string in a `syntax.atom`. -/ @[inline] def raw {α : Type} (p : m α) (trailing_ws := ff) : parser := try $ do - it ← left_over, - ss ← as_substring p, - let stx := syntax.atom ⟨some {leading := ⟨it, it⟩, pos := it.offset, trailing := ⟨it, it⟩}, ss.to_string⟩, + start ← left_over, + p, + stop ← left_over, + let stx := mk_raw_res start stop, if trailing_ws then with_trailing stx else pure stx instance raw.tokens {α} (p : m α) (t) : parser.has_tokens (raw p t : parser) := default _ @@ -100,12 +109,12 @@ instance raw.view {α} (p : m α) (t) : parser.has_view (raw p t : parser) (opti review := λ a, (syntax.atom <$> a).get_or_else syntax.missing } /-- Like `raw (str s)`, but default to `s` in views. -/ -@[inline, derive has_tokens has_view] +@[derive has_tokens has_view] def raw_str (s : string) (trailing_ws := ff) : parser := raw (str s) trailing_ws -instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom) - (some {val := s}) := ⟨⟩ +instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom) (some {val := s}) := +⟨⟩ end @@ -119,14 +128,14 @@ node_choice! ident_part { escaped: raw $ take_until1 is_id_end_escape, esc_end: raw_str id_end_escape.to_string, ], - default: lookahead (satisfy is_id_first) *> raw (take_while is_id_rest) + default: raw $ satisfy is_id_first *> take_while is_id_rest } @[derive has_tokens has_view] def ident_suffix.parser : rec_t unit syntax basic_parser_m syntax := -- consume '.' only when followed by a character starting an ident_part -try (lookahead (ch '.' *> (ch id_begin_escape *> pure () <|> satisfy is_id_first *> pure ()))) *> -node! ident_suffix [«.»: raw_str ".", ident: recurse ()] +try (lookahead (ch '.' *> (ch id_begin_escape <|> satisfy is_id_first))) +*> node! ident_suffix [«.»: raw_str ".", ident: recurse ()] def ident'' : rec_t unit syntax basic_parser_m syntax := node! ident [part: monad_lift ident_part.parser, suffix: optional ident_suffix.parser] @@ -277,8 +286,7 @@ instance symbol_or_ident.view (sym) : parser.has_view (symbol_or_ident sym : par def unicode_symbol (unicode ascii : string) (lbp := 0) : parser := lift $ any_of [symbol unicode lbp, symbol ascii lbp] -- use unicode variant by default -instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _ - (u : syntax) := ⟨⟩ +instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _ (syntax.atom ⟨none, u⟩) := ⟨⟩ end «parser» end lean diff --git a/src/library/compiler/csimp.cpp b/src/library/compiler/csimp.cpp index a3b8b217b5..ea61241e30 100644 --- a/src/library/compiler/csimp.cpp +++ b/src/library/compiler/csimp.cpp @@ -29,7 +29,7 @@ csimp_cfg::csimp_cfg() { m_inline_threshold = 1; m_float_cases_app = true; m_float_cases = true; - m_float_cases_threshold = 40; + m_float_cases_threshold = 10; m_inline_jp_threshold = 2; }