From 2e5ea16e2f7f48209c57cfd56a1e3461e0766c9e Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Thu, 27 Sep 2018 15:27:00 -0700 Subject: [PATCH] perf(library/init/lean/parser): cache consecutive calls to `token` at the same position Parser performance improved by about 33% --- library/init/lean/parser/basic.lean | 33 ++++++++++++++++++++++------ library/init/lean/parser/module.lean | 9 +++++--- library/init/lean/parser/token.lean | 32 ++++++++++++++++++++------- tests/lean/parser1.lean | 8 +++---- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/library/init/lean/parser/basic.lean b/library/init/lean/parser/basic.lean index 05d6628810..363883db25 100644 --- a/library/init/lean/parser/basic.lean +++ b/library/init/lean/parser/basic.lean @@ -37,19 +37,41 @@ structure token_config := by the `token` parser. -/ (suffix_parser : option (parsec' (source_info → syntax)) := none) +-- Backtrackable state structure parser_state := (messages : message_log) +structure token_cache_entry := +(start_it stop_it : string.iterator) +(tk : syntax) + +-- Non-backtrackable state +structure parser_cache := +(token_cache : option token_cache_entry := none) + structure parser_config := (tokens : trie token_config) (filename : string) +@[derive monad alternative monad_state monad_parsec monad_except] +def parser_core_t (m : Type → Type) [monad m] := +state_t parser_state $ parsec_t syntax $ state_t parser_cache $ m + @[derive monad alternative monad_reader monad_state monad_parsec monad_except] -def parser_t (ρ : Type) (m : Type → Type) [monad m] := reader_t ρ $ state_t parser_state $ parsec_t syntax m +def parser_t (ρ : Type) (m : Type → Type) [monad m] := reader_t ρ $ parser_core_t m abbreviation basic_parser_m := parser_t parser_config id abbreviation basic_parser := basic_parser_m syntax abbreviation monad_basic_parser := has_monad_lift_t basic_parser_m +section +local attribute [reducible] parser_t parser_core_t +def get_cache : basic_parser_m parser_cache := +monad_lift (get : state_t parser_cache id _) + +def put_cache : parser_cache → basic_parser_m punit := +λ c, monad_lift (put c : state_t parser_cache id _) +end + -- an arbitrary `parser` type; parsers are usually some monad stack based on `basic_parser_m` returning `syntax` variable {ρ : Type} @@ -91,16 +113,13 @@ def message_of_parsec_message {μ : Type} (cfg : parser_config) (msg : parsec.me -- FIXME: translate position {filename := cfg.filename, pos := ⟨0, 0⟩, text := to_string msg} -section -local attribute [reducible] parser_t /-- Run parser stack, returning a partial syntax tree in case of a fatal error -/ -protected def run {m : Type → Type} {α ρ : Type} [monad m] [has_coe_t ρ parser_config] (cfg : ρ) (st : parser_state) (s : string) (r : parser_t ρ m α) : +protected def run {m : Type → Type} {α ρ : Type} [monad m] [has_coe_t ρ parser_config] (cfg : ρ) (s : string) (r : parser_t ρ m α) : m (sum α syntax × message_log) := -do r ← ((r.run cfg).run st).parse_with_eoi s, +do (r, _) ← (((r.run cfg).run {messages:=message_log.empty}).parse s).run {}, pure $ match r with | except.ok (a, st) := (sum.inl a, st.messages) | except.error msg := (sum.inr msg.custom, message_log.empty.add (message_of_parsec_message cfg msg)) -end open coroutine open monad_parsec @@ -145,7 +164,7 @@ def parse.view : syntax → option parse.view_ty on a recursive call to `command.parser`, i.e. it forgets about locally registered parsers, but that's not an issue for our intended uses of it. -/ @[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec] -def command_parser_m (ρ : Type) := reader_t ρ $ rec_t unit syntax $ state_t parser_state $ parsec_t syntax id +def command_parser_m (ρ : Type) := reader_t ρ $ rec_t unit syntax $ parser_core_t id section local attribute [reducible] parser_t command_parser_m diff --git a/library/init/lean/parser/module.lean b/library/init/lean/parser/module.lean index f74ca74d87..036ca044ae 100644 --- a/library/init/lean/parser/module.lean +++ b/library/init/lean/parser/module.lean @@ -30,17 +30,20 @@ structure module_parser_output := (messages : message_log) (cfg : module_parser_config) +section +local attribute [reducible] parser_core_t /- NOTE: missing the `reader_t` from `parser_t` because the `coroutine` already provides `monad_reader module_parser_config`. -/ @[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_coroutine] -def module_parser_m := state_t parser_state $ parsec_t syntax $ coroutine module_parser_config module_parser_output +def module_parser_m := parser_core_t $ coroutine module_parser_config module_parser_output abbreviation module_parser := module_parser_m syntax +end instance module_parser_m.lift_parser_t (ρ : Type) [has_lift_t module_parser_config ρ] : has_monad_lift (parser_t ρ id) module_parser_m := -{ monad_lift := λ α x st it, do +{ monad_lift := λ α x st it nb_st, do cfg ← read, - pure (((x.run ↑cfg).run st) it) } + pure ((((x.run ↑cfg).run st) it).run nb_st) } namespace module def yield_command (cmd : syntax) : module_parser_m unit := diff --git a/library/init/lean/parser/token.lean b/library/init/lean/parser/token.lean index 6a6d9a6eb1..a13505a948 100644 --- a/library/init/lean/parser/token.lean +++ b/library/init/lean/parser/token.lean @@ -143,14 +143,30 @@ do stx ← (node_choice! number { } : basic_parser), pure $ λ info, update_trailing info.trailing stx -def token : basic_parser_m syntax := -do (r, i) ← with_source_info $ do { - -- NOTE the order: if a token is both a symbol and a valid identifier (i.e. a keyword), - -- we want it to be recognized as a symbol - f::_ ← longest_match [symbol', ident'] <|> list.ret <$> number' | error "token: unreachable", - pure f - }, - pure (r i) +def token : basic_parser := +do it ← left_over, + cache ← get_cache, + -- NOTE: using `catch` instead of `<|>` so that error messages from the second block are preferred + catch (do + -- check token cache + some tkc ← pure cache.token_cache | failure, + guard (it.offset = tkc.start_it.offset), + -- hackishly update parsec position + monad_parsec.lift (λ it, parsec.result.ok () tkc.stop_it none), + pure tkc.tk + ) (λ _, do + -- cache failed, update cache + (r, i) ← with_source_info $ do { + -- NOTE the order: if a token is both a symbol and a valid identifier (i.e. a keyword), + -- we want it to be recognized as a symbol + f::_ ← longest_match [symbol', ident'] <|> list.ret <$> number' | error "token: unreachable", + pure f + }, + let tk := r i, + new_it ← left_over, + put_cache {cache with token_cache := some ⟨it, new_it, tk⟩}, + pure tk + ) variable [monad_basic_parser m] diff --git a/tests/lean/parser1.lean b/tests/lean/parser1.lean index 2ac767ca5e..6a1ba862b8 100644 --- a/tests/lean/parser1.lean +++ b/tests/lean/parser1.lean @@ -31,7 +31,7 @@ do t ← parser.mk_token_trie $ parser.tokens term.builtin_leading_parsers ++ parser.tokens term.builtin_trailing_parsers, pure $ { - filename := "