/- Copyright (c) 2018 Microsoft Corporation. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. Author: Sebastian Ullrich Tokenizer for the Lean language Even though our reader architecture does not statically depend on a tokenizer but works directly on the input string, we still use a "tokenizer" parser in the Lean reader in some circumstances: * to distinguish between identifiers and keywords * for error recovery: advance until next command token * ...? -/ prelude import init.lean.parser.reader.basic init.util namespace lean.parser namespace reader open lean.parser.monad_parsec open string def match_token : read_m (option token_config) := do st ← get, it ← left_over, -- the slowest longest prefix matcher on Earth pure $ st.tokens.foldl (λ acc tk, if tk.prefix.mk_iterator.is_prefix_of_remaining it then match acc with | some tk' := if tk.prefix > tk'.prefix then tk else tk' | none := tk else acc) none private def finish_comment_block_aux : nat → nat → read_m unit | nesting (n+1) := str "/-" *> finish_comment_block_aux (nesting + 1) n <|> str "-/" *> (if nesting = 1 then pure () else finish_comment_block_aux (nesting - 1) n) <|> any *> finish_comment_block_aux nesting n | _ _ := error "unreachable" def finish_comment_block (nesting := 1) : read_m unit := do r ← remaining, finish_comment_block_aux nesting (r+1) "end of comment block" private def whitespace_aux : nat → read_m unit | (n+1) := do tk ← whitespace *> match_token, (match tk with | some ⟨"--", _⟩ := str "--" *> take_while' (= '\n') *> whitespace_aux n | some ⟨"/-", _⟩ := str "/-" *> finish_comment_block *> whitespace_aux n | _ := pure ()) | 0 := error "unreachable" /-- Skip whitespace and comments. -/ def whitespace : read_m substring := -- every `whitespace_aux` loop reads at least one char do start ← left_over, whitespace_aux (start.remaining+1), stop ← left_over, pure ⟨start, stop⟩ def with_source_info {α : Type} (r : read_m α) : read_m (α × source_info) := do token_start ← reader_state.token_start <$> get, whitespace, it ← left_over, a ← r, -- TODO(Sebastian): less greedy, more natural whitespace assignement -- E.g. only read up to the next line break trailing ← whitespace, it2 ← left_over, modify $ λ st, {st with token_start := it2}, pure (a, ⟨⟨token_start, it⟩, it.offset, trailing⟩) /-- Match a string literally without consulting the token table. -/ def raw_symbol (sym : string) : reader := try $ do (_, info) ← with_source_info $ str sym, pure $ syntax.atom ⟨info, atomic_val.string sym⟩ instance raw_symbol.tokens (s) : reader.has_tokens (raw_symbol s) := ⟨[]⟩ instance raw_symbol.view (s) : reader.has_view (raw_symbol s) syntax := default _ --TODO(Sebastian): other bases private def number' : read_m (source_info → syntax) := do num ← take_while1 char.is_digit, pure $ λ i, syntax.node ⟨`base10_lit, [syntax.atom ⟨i, atomic_val.string num⟩]⟩ private def ident' : read_m (source_info → syntax) := do n ← identifier, pure $ λ i, syntax.ident ⟨i, n, none, none⟩ private def symbol' : read_m (source_info → syntax) := do tk ← match_token, match tk with -- constant-length token | some ⟨tk, none⟩ := do str tk, pure $ λ i, syntax.atom ⟨some i, atomic_val.string tk⟩ -- variable-length token | some ⟨tk, some r⟩ := error "not implemented" --str tk *> monad_parsec.lift r | none := error def token : read_m syntax := do (r, i) ← with_source_info $ do { -- NOTE the order: if a token is both a symbol and a valid identifier (i.e. a keyword), -- we want it to be recognized as a symbol f::_ ← longest_match [symbol', ident', number'] | failure, pure f }, pure (r i) --TODO(Sebastian): error messages def symbol (sym : string) : reader := try $ do it ← left_over, stx@(syntax.atom ⟨_, atomic_val.string sym'⟩) ← token | error "" (dlist.singleton (repr sym)) it, when (sym ≠ sym') $ error "" (dlist.singleton (repr sym)) it, pure stx instance symbol.tokens (sym : string) : reader.has_tokens (symbol sym) := ⟨[⟨sym, none⟩]⟩ instance symbol.view (s) : reader.has_view (symbol s) syntax := default _ instance symbol_coe : has_coe string reader := ⟨symbol⟩ def number : reader := try $ do it ← left_over, stx@(syntax.node ⟨`base10_lit, _⟩) ← token | error "" (dlist.singleton "number") it, pure stx instance number.tokens : reader.has_tokens number := ⟨[]⟩ instance number.view : reader.has_view number syntax := default _ def ident : reader := try $ do it ← left_over, stx@(syntax.ident _) ← token | error "" (dlist.singleton "identifier") it, pure stx instance ident.tokens : reader.has_tokens ident := ⟨[]⟩ instance ident.view : reader.has_view ident syntax := default _ end reader end lean.parser