From 15a25d5aa9d2fe3ab89dbdc7df7cd9ea53fc0704 Mon Sep 17 00:00:00 2001
From: Leonardo de Moura <leonardo@microsoft.com>
Date: Thu, 11 Oct 2018 15:54:57 -0700
Subject: [PATCH] chore(library/init/lean/parser): add a few comments

---
 library/init/lean/parser/basic.lean       | 18 ++++----
 library/init/lean/parser/combinators.lean | 28 +++++++++----
 library/init/lean/parser/parsec.lean      | 12 +++---
 library/init/lean/parser/syntax.lean      | 51 ++++++++++++++++-------
 library/init/lean/parser/token.lean       | 42 +++++++++++--------
 src/library/compiler/csimp.cpp            |  2 +-
 6 files changed, 97 insertions(+), 56 deletions(-)

diff --git a/library/init/lean/parser/basic.lean b/library/init/lean/parser/basic.lean
index 67cf4305bc..7fac098191 100644
--- a/library/init/lean/parser/basic.lean
+++ b/library/init/lean/parser/basic.lean
@@ -35,7 +35,12 @@ structure token_config :=
 /- An optional parser that is activated after matching `prefix`.
    It should return a syntax tree with a "hole" for the
    `source_info` surrounding the token, which will be supplied
-   by the `token` parser. -/
+   by the `token` parser.
+
+   Remark: `suffix_parser` has many applications for example for parsing
+   hexdecimal numbers, `prefix` is `0x` and `suffix_parser` is the parser `digit*`.
+   We also use it to parse string literals: here `prefix` is just `"`.
+-/
 (suffix_parser : option (parsec' (source_info → syntax)) := none)
 
 -- Backtrackable state
@@ -50,6 +55,7 @@ structure token_cache_entry :=
 structure parser_cache :=
 (token_cache : option token_cache_entry := none)
 
+/- Remark: if we have a node in the trie with `some token_config`, the string induced by the path is equal to the `token_config.prefix`. -/
 structure parser_config :=
 (tokens : trie token_config)
 (filename : string)
@@ -80,12 +86,8 @@ variable {ρ : Type}
 class has_tokens (r : ρ) := mk {} ::
 (tokens : list token_config)
 
-def donotinline {α : Type} (a : α) (f : α → α := id) :=
-f (f a)
-
--- do NOT inline this function
-def tokens (r : ρ) [has_tokens r] :=
-donotinline (has_tokens.tokens r)
+@[noinline] def tokens (r : ρ) [has_tokens r] :=
+has_tokens.tokens r
 
 instance has_tokens.inhabited (r : ρ) : inhabited (has_tokens r) :=
 ⟨⟨[]⟩⟩
@@ -176,6 +178,7 @@ instance command_parser_m.basic_parser (ρ : Type) [has_lift_t ρ parser_config]
 ⟨λ _ x cfg rec, x.run ↑cfg⟩
 end
 
+/- The `nat` at `rec_t` is the lbp` -/
 @[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec monad_basic_parser]
 def term_parser_m := rec_t nat syntax $ command_parser_m parser_config
 abbreviation term_parser := term_parser_m syntax
@@ -189,6 +192,7 @@ instance trailing_term_parser_coe : has_coe term_parser trailing_term_parser :=
 ⟨λ x _, x⟩
 
 -- This needs to be a separate structure since `term_parser`s cannot contain themselves in their config
+-- TODO: we need indexing here.
 structure command_parser_config extends parser_config :=
 (leading_term_parsers : list term_parser)
 (trailing_term_parsers : list trailing_term_parser)
diff --git a/library/init/lean/parser/combinators.lean b/library/init/lean/parser/combinators.lean
index 2368327fb5..e1e7847c90 100644
--- a/library/init/lean/parser/combinators.lean
+++ b/library/init/lean/parser/combinators.lean
@@ -20,18 +20,18 @@ local notation `parser` := m syntax
 variables [monad m] [monad_except (parsec.message syntax) m] [monad_parsec syntax m] [alternative m]
 
 def node' (k : option syntax_node_kind) (rs : list parser) : parser :=
-do (args, _) ← rs.mfoldl (λ (p : list syntax × nat) r, do
-     (args, remaining) ← pure p,
+do args ← rs.mfoldl (λ (p : list syntax) r, do
+     args ← pure p,
      -- on error, append partial syntax tree to previous successful parses and rethrow
      a ← catch r $ λ msg,
        let args := msg.custom :: args in
        throw {msg with custom := syntax.node ⟨k, args.reverse⟩},
-     pure (a::args, remaining - 1)
-   ) ([], rs.length),
+     pure (a::args)
+   ) [],
    pure $ syntax.node ⟨k, args.reverse⟩
 
 @[reducible] def seq : list parser → parser := node' none
-@[reducible] def node (k : syntax_node_kind) : list parser → parser := node' k
+@[reducible] def node (k : syntax_node_kind) : list parser → parser := node' (some k)
 
 instance node'.tokens (k) (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (node' k rs) :=
 ⟨tokens rs⟩
@@ -39,6 +39,8 @@ instance node'.tokens (k) (rs : list parser) [parser.has_tokens rs] : parser.has
 instance node.view (k) (rs : list parser) [i : has_view k α] : parser.has_view (node k rs) α :=
 { view := i.view, review := i.review }
 
+-- Each parser combinator comes equipped with `has_view` and `has_tokens` instances
+
 private def many1_aux (p : parser) : list syntax → nat → parser
 | as 0     := error "unreachable"
 | as (n+1) := do
@@ -66,6 +68,7 @@ instance many.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (man
 ⟨tokens r⟩
 
 instance many.view (r : parser) [has_view r α] : parser.has_view (many r) (list α) :=
+/- Remark: `many1.view` can handle empty list. -/
 {..many1.view r}
 
 private def sep_by_aux (p : m syntax) (sep : parser) (allow_trailing_sep : bool) : bool → list syntax → nat → parser
@@ -164,15 +167,22 @@ instance longest_match.view (rs : list parser) : parser.has_view (longest_match
 /-- Parse a list `[p1, ..., pn]` of parsers as `p1 <|> ... <|> pn`.
     The result will be wrapped in a node with the the index of the successful
     parser as the name. -/
+def choice_aux : list parser → nat → parser
+| []      _ := error "choice: empty list"
+| (r::rs) i :=
+  do { stx ← r,
+       pure $ syntax.node ⟨some ⟨name.mk_numeral name.anonymous i⟩, [stx]⟩ }
+  <|> choice_aux rs (i+1)
+
 def choice (rs : list parser) : parser :=
-rs.enum.foldr
-  (λ ⟨i, r⟩ r', (λ stx, syntax.node ⟨some ⟨name.mk_numeral name.anonymous i⟩, [stx]⟩) <$> r <|> r')
-  -- use `foldr` so that any other error is preferred over this one
-  (error "choice: empty list")
+choice_aux rs 0
 
 instance choice.tokens (rs : list parser) [parser.has_tokens rs] : parser.has_tokens (choice rs) :=
 ⟨tokens rs⟩
 
+/- Remark: `choice` does not have `has_view` instance because we only use it at the pratt combinator
+   which doesn't need the view. -/
+
 instance try.tokens (r : parser) [parser.has_tokens r] : parser.has_tokens (try r) :=
 ⟨tokens r⟩
 instance try.view (r : parser) [i : parser.has_view r α] : parser.has_view (try r) α :=
diff --git a/library/init/lean/parser/parsec.lean b/library/init/lean/parser/parsec.lean
index 6e721f83e2..8ab979b37d 100644
--- a/library/init/lean/parser/parsec.lean
+++ b/library/init/lean/parser/parsec.lean
@@ -256,10 +256,10 @@ namespace monad_parsec
 open parsec_t
 variables {m : Type → Type} [monad m] [monad_parsec μ m] [inhabited μ] {α β : Type}
 
-@[inline] def error {α : Type} (unexpected : string) (expected : dlist string := dlist.empty) (it : option iterator := none) (custom : μ := default _) : m α :=
+def error {α : Type} (unexpected : string) (expected : dlist string := dlist.empty) (it : option iterator := none) (custom : μ := default _) : m α :=
 lift $ λ it', result.error { unexpected := unexpected, expected := expected, it := it.get_or_else it', custom := custom } ff
 
-@[inline] def left_over : m iterator :=
+def left_over : m iterator :=
 lift $ λ it, result.mk_eps it it
 
 /-- Return the number of characters left to be parsed. -/
@@ -310,7 +310,7 @@ do it ← left_over,
        if p c then error (repr c)
        else pure ()
 
-@[inline] def eoi_error (it : iterator) : result μ α :=
+def eoi_error (it : iterator) : result μ α :=
 result.error { it := it, unexpected := "end of input", custom := default _ } ff
 
 def curr : m char :=
@@ -360,7 +360,7 @@ This parser consumes no input if it fails (even if a partial match).
 Note: The behaviour of this parser is different to that the `string` parser in the Parsec_t Μ M Haskell library,
 as this one is all-or-nothing.
 -/
-@[inline] def str (s : string) : m string :=
+def str (s : string) : m string :=
 if s.is_empty then pure ""
 else lift $ λ it, match str_aux s.length s.mk_iterator it with
   | some it' := result.ok s it' none
@@ -377,7 +377,7 @@ def take (n : nat) : m string :=
 if n = 0 then pure ""
 else lift $ take_aux n ""
 
-@[inline] private def mk_string_result (r : string) (it : iterator) : result μ string :=
+private def mk_string_result (r : string) (it : iterator) : result μ string :=
 if r.is_empty then result.mk_eps r it
 else result.ok r it none
 
@@ -415,7 +415,7 @@ take_while (λ c, !p c)
 def take_until1 (p : char → bool) : m string :=
 take_while1 (λ c, !p c)
 
-@[inline] private def mk_consumed_result (consumed : bool) (it : iterator) : result μ unit :=
+private def mk_consumed_result (consumed : bool) (it : iterator) : result μ unit :=
 if consumed then result.ok () it none
 else result.mk_eps () it
 
diff --git a/library/init/lean/parser/syntax.lean b/library/init/lean/parser/syntax.lean
index 92e0c7b63f..dac3ae8538 100644
--- a/library/init/lean/parser/syntax.lean
+++ b/library/init/lean/parser/syntax.lean
@@ -9,10 +9,6 @@ import init.lean.name init.lean.parser.parsec
 namespace lean
 namespace parser
 
-/-- De Bruijn index relative to surrounding 'bind' macros -/
-@[reducible] def var_offset := ℕ
-@[reducible] def macro_scope_id := ℕ
-
 --TODO(Sebastian): move
 structure substring :=
 (start : string.iterator)
@@ -36,11 +32,18 @@ structure syntax_node_kind :=
 
 @[pattern] def choice : syntax_node_kind := ⟨`lean.parser.choice⟩
 
+/-
+Parsers create `syntax_node`'s with the following property:
+- If `args` contains a `syntax.missing`, then all subsequent elements are also `syntax.missing`.
+- We believe the first argument in `args` is not `syntax.missing`. TODO: check this.
+
+Remark: We do create `syntax_node`'s with an empty `args` field.
+-/
 structure syntax_node (syntax : Type) :=
+-- TODO: add `lean.parser.list` kind, and remove option. Then `none` = `lean.parser.seq`
 (kind : option syntax_node_kind) (args : list syntax)
 
 inductive syntax
-/- any non-ident atom -/
 | atom (val : syntax_atom)
 | node (val : syntax_node syntax)
 | missing
@@ -48,9 +51,6 @@ inductive syntax
 instance : inhabited syntax :=
 ⟨syntax.missing⟩
 
-instance coe_string_syntax : has_coe string syntax :=
-⟨λ s, syntax.atom ⟨none, s⟩⟩
-
 def substring.to_string (s : substring) : string :=
 (s.start.extract s.stop).get_or_else ""
 
@@ -61,6 +61,9 @@ def is_of_kind (k : syntax_node_kind) : syntax → bool
 | (syntax.node ⟨some k', _⟩) := k.name = k'.name
 | _ := ff
 
+-- Remark: this function must be updated whenever `ident` parser is modified.
+-- This function was defined before we had the `ident` parser.
+-- TODO: move it to the `ident` parser file and use the view defined there.
 private def ident_to_format : syntax → format
 | stx := option.get_or_else (do
   syntax.node ⟨_, [syntax.node ⟨_, [syntax.node ⟨some ⟨idx⟩, part⟩]⟩, suffix]⟩ ← pure stx | failure,
@@ -82,12 +85,15 @@ with to_format : syntax → format
 | stx@(node {kind := some kind, args := args, ..}) :=
   if kind.name = `lean.parser.ident
   then to_fmt "`" ++ ident_to_format stx
-  else paren $ join_sep (to_fmt (kind.name.replace_prefix `lean.parser name.anonymous) :: to_format_lst args) line
+  else let shorter_name := kind.name.replace_prefix `lean.parser name.anonymous
+       in paren $ join_sep (to_fmt shorter_name :: to_format_lst args) line
 | missing := "<missing>"
 with to_format_lst : list syntax → list format
 | []      := []
 | (s::ss) := to_format s :: to_format_lst ss
 
+/- Remark: the state `string.iterator` is the `source_info.trailing.stop` of the previous token,
+   or the beginning of the string. -/
 private mutual def update_leading_aux, update_leading_lst
 with update_leading_aux : syntax → state string.iterator syntax
 | (atom a@{info := some info, ..}) := do
@@ -103,14 +109,27 @@ with update_leading_lst : list syntax → state string.iterator (list syntax)
 /-- Set `source_info.leading` according to the trailing stop of the preceding token.
     The result is a round-tripping syntax tree IF, in the input syntax tree,
     * all leading stops, atom contents, and trailing starts are correct
-    * trailing stops are between the trailing start and the next leading stop. -/
+    * trailing stops are between the trailing start and the next leading stop.
+
+    Remark: after parsing all `source_info.leading` fields are empty.
+    The syntax argument is the output produced by the parser for `source`.
+    This function "fixes" the `source.leanding` field.
+
+    Note that, the `source_info.trailing` fields are correct.
+    The implementation of this function relies on this property. -/
 def update_leading (source : string) : syntax → syntax :=
 λ stx, prod.fst $ (update_leading_aux stx).run source.mk_iterator
 
-/-- Retrieve the left-most leaf in the syntax tree -/
+def is_empty_node : syntax → bool
+| (node ⟨_, []⟩) := tt
+| _              := ff
+
+/-- Retrieve the left-most leaf in the syntax tree. -/
 def get_head_atom : syntax → option syntax_atom
 | (atom a) := some a
-| (node ⟨_, n::_⟩) := n.get_head_atom
+-- TODO: handle case where `n` is an empty `syntax_node`
+-- We will have to create a mutual recursion here Arghhhh
+| (node ⟨_, n::ns⟩) := n.get_head_atom
 | _ := none
 
 def get_pos (stx : syntax) : option parsec.position :=
@@ -118,13 +137,13 @@ do a ← stx.get_head_atom,
    i ← a.info,
    pure i.pos
 
-def reprint_with_info : option source_info → string → string
-| (some info) inner := info.leading.to_string ++ inner ++ info.trailing.to_string
-| none        inner := inner
+def reprint_atom : syntax_atom → string
+| ⟨some info, s⟩ := info.leading.to_string ++ s ++ info.trailing.to_string
+| ⟨none, s⟩      := s
 
 mutual def reprint, reprint_lst
 with reprint : syntax → option string
-| (atom ⟨info, s⟩) := reprint_with_info info s
+| (atom a) := reprint_atom a
 | (node ⟨some k, ns⟩) :=
   if k.name = choice.name then match ns with
   -- should never happen
diff --git a/library/init/lean/parser/token.lean b/library/init/lean/parser/token.lean
index 36b51cb5b8..1d6500431c 100644
--- a/library/init/lean/parser/token.lean
+++ b/library/init/lean/parser/token.lean
@@ -26,10 +26,12 @@ do cfg ← read,
 
 private def finish_comment_block_aux : nat → nat → basic_parser_m unit
 | nesting (n+1) :=
-  str "/-" *> finish_comment_block_aux (nesting + 1) n <|>
+  str "/-" *> finish_comment_block_aux (nesting + 1) n
+  <|>
   str "-/" *>
-    (if nesting = 1 then pure ()
-     else finish_comment_block_aux (nesting - 1) n) <|>
+  if nesting = 1 then pure ()
+  else finish_comment_block_aux (nesting - 1) n
+  <|>
   any *> finish_comment_block_aux nesting n
 | _ _ := error "unreachable"
 
@@ -40,9 +42,11 @@ do r ← remaining,
 private def whitespace_aux : nat → basic_parser_m unit
 | (n+1) :=
 do whitespace,
-   str "--" *> take_while' (≠ '\n') *> whitespace_aux n <|>
+   str "--" *> take_while' (≠ '\n') *> whitespace_aux n
+   <|>
    -- a "/--" doc comment is an actual token, not whitespace
-   try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n <|>
+   try (str "/-" *> not_followed_by (str "-")) *> finish_comment_block *> whitespace_aux n
+   <|>
    pure ()
 | 0 := error "unreachable"
 
@@ -78,18 +82,23 @@ with update_trailing_lst : substring → list syntax → list syntax
 | trailing [stx] := [update_trailing trailing stx]
 | trailing (stx::stxs) := stx :: update_trailing_lst trailing stxs
 
-@[inline] def with_trailing (stx : syntax) : m syntax :=
+def with_trailing (stx : syntax) : m syntax :=
 do -- TODO(Sebastian): less greedy, more natural whitespace assignment
    -- E.g. only read up to the next line break
    trailing ← lift $ as_substring $ whitespace,
    pure $ update_trailing trailing stx
 
+def mk_raw_res (start stop : string.iterator) : syntax :=
+let ss : substring := ⟨start, stop⟩ in
+syntax.atom ⟨some {leading := ⟨start, start⟩, pos := start.offset, trailing := ⟨stop, stop⟩}, ss.to_string⟩
+
 /-- Match an arbitrary parser and return the consumed string in a `syntax.atom`. -/
 @[inline] def raw {α : Type} (p : m α) (trailing_ws := ff) : parser :=
 try $ do
-  it ← left_over,
-  ss ← as_substring p,
-  let stx := syntax.atom ⟨some {leading := ⟨it, it⟩, pos := it.offset, trailing := ⟨it, it⟩}, ss.to_string⟩,
+  start ← left_over,
+  p,
+  stop ← left_over,
+  let stx := mk_raw_res start stop,
   if trailing_ws then with_trailing stx else pure stx
 
 instance raw.tokens {α} (p : m α) (t) : parser.has_tokens (raw p t : parser) := default _
@@ -100,12 +109,12 @@ instance raw.view {α} (p : m α) (t) : parser.has_view (raw p t : parser) (opti
   review := λ a, (syntax.atom <$> a).get_or_else syntax.missing }
 
 /-- Like `raw (str s)`, but default to `s` in views. -/
-@[inline, derive has_tokens has_view]
+@[derive has_tokens has_view]
 def raw_str (s : string) (trailing_ws := ff) : parser :=
 raw (str s) trailing_ws
 
-instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom)
-  (some {val := s}) := ⟨⟩
+instance raw_str.view_default (s) (t) : parser.has_view_default (raw_str s t : parser) (option syntax_atom) (some {val := s}) :=
+⟨⟩
 
 end
 
@@ -119,14 +128,14 @@ node_choice! ident_part {
     escaped: raw $ take_until1 is_id_end_escape,
     esc_end: raw_str id_end_escape.to_string,
   ],
-  default: lookahead (satisfy is_id_first) *> raw (take_while is_id_rest)
+  default: raw $ satisfy is_id_first *> take_while is_id_rest
 }
 
 @[derive has_tokens has_view]
 def ident_suffix.parser : rec_t unit syntax basic_parser_m syntax :=
 -- consume '.' only when followed by a character starting an ident_part
-try (lookahead (ch '.' *> (ch id_begin_escape *> pure () <|> satisfy is_id_first *> pure ()))) *>
-node! ident_suffix [«.»: raw_str ".", ident: recurse ()]
+try (lookahead (ch '.' *> (ch id_begin_escape <|> satisfy is_id_first)))
+*> node! ident_suffix [«.»: raw_str ".", ident: recurse ()]
 
 def ident'' : rec_t unit syntax basic_parser_m syntax :=
 node! ident [part: monad_lift ident_part.parser, suffix: optional ident_suffix.parser]
@@ -277,8 +286,7 @@ instance symbol_or_ident.view (sym) : parser.has_view (symbol_or_ident sym : par
 def unicode_symbol (unicode ascii : string) (lbp := 0) : parser :=
 lift $ any_of [symbol unicode lbp, symbol ascii lbp]
 -- use unicode variant by default
-instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _
-  (u : syntax) := ⟨⟩
+instance unicode_symbol.view_default (u a lbp) : parser.has_view_default (unicode_symbol u a lbp : parser) _ (syntax.atom ⟨none, u⟩) := ⟨⟩
 
 end «parser»
 end lean
diff --git a/src/library/compiler/csimp.cpp b/src/library/compiler/csimp.cpp
index a3b8b217b5..ea61241e30 100644
--- a/src/library/compiler/csimp.cpp
+++ b/src/library/compiler/csimp.cpp
@@ -29,7 +29,7 @@ csimp_cfg::csimp_cfg() {
     m_inline_threshold                = 1;
     m_float_cases_app                 = true;
     m_float_cases                     = true;
-    m_float_cases_threshold           = 40;
+    m_float_cases_threshold           = 10;
     m_inline_jp_threshold             = 2;
 }