lean4-htt/library/init/lean/parser/basic.lean

/-
Copyright (c) 2018 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Sebastian Ullrich

Parser for the Lean language
-/
prelude
import init.lean.parser.parsec init.lean.parser.syntax init.lean.parser.rec
import init.lean.parser.trie
import init.lean.parser.identifier init.data.rbmap init.lean.message
import init.control.coroutine

namespace lean
namespace parser

/- Maximum standard precedence. This is the precedence of function application.
   In the standard Lean language, only the token `.` has a left-binding power greater
   than `max_prec` (so that field accesses like `g (h x).f` are parsed as `g ((h x).f)`,
   not `(g (h x)).f`). -/
def max_prec : nat := 1024

structure token_config :=
(«prefix» : string)
/- Left-binding power used by the term parser. The term parser operates in the context
   of a right-binding power between 0 (used by parentheses and on the top-level) and
   (usually) `max_prec` (used by function application). After parsing an initial term,
   it continues parsing and expanding that term only when the left-binding power of
   the next token is greater than the current right-binding power. For example, it
   never continues parsing an argument after the initial parse, unless a token with
   lbp > max_prec is encountered. Conversely, the term parser will always continue
   parsing inside parentheses until it finds a token with lbp 0 (such as `)`). -/
(lbp : nat := 0)
-- reading a token should not need any state
/- An optional parser that is activated after matching `prefix`.
   It should return a syntax tree with a "hole" for the
   `source_info` surrounding the token, which will be supplied
   by the `token` parser.

   Remark: `suffix_parser` has many applications for example for parsing
   hexdecimal numbers, `prefix` is `0x` and `suffix_parser` is the parser `digit*`.
   We also use it to parse string literals: here `prefix` is just `"`.
-/
(suffix_parser : option (parsec' (source_info → syntax)) := none)

-- Backtrackable state
structure parser_state :=
(messages : message_log)

structure token_cache_entry :=
(start_it stop_it : string.iterator)
(tk : syntax)

-- Non-backtrackable state
structure parser_cache :=
(token_cache : option token_cache_entry := none)

/- Remark: if we have a node in the trie with `some token_config`, the string induced by the path is equal to the `token_config.prefix`. -/
structure parser_config :=
(tokens : trie token_config)
(filename : string)

@[derive monad alternative monad_state monad_parsec monad_except]
def parser_core_t (m : Type → Type) [monad m] :=
state_t parser_state $ parsec_t syntax $ state_t parser_cache $ m

@[derive monad alternative monad_reader monad_state monad_parsec monad_except]
def parser_t (ρ : Type) (m : Type → Type) [monad m] := reader_t ρ $ parser_core_t m
@[derive monad alternative monad_reader monad_state monad_parsec monad_except]
abbreviation basic_parser_m := parser_t parser_config id
abbreviation basic_parser := basic_parser_m syntax
abbreviation monad_basic_parser := has_monad_lift_t basic_parser_m

section
local attribute [reducible] parser_t parser_core_t
def get_cache : basic_parser_m parser_cache :=
monad_lift (get : state_t parser_cache id _)

def put_cache : parser_cache → basic_parser_m punit :=
λ c, monad_lift (put c : state_t parser_cache id _)
end

 -- an arbitrary `parser` type; parsers are usually some monad stack based on `basic_parser_m` returning `syntax`
variable {ρ : Type}

class has_tokens (r : ρ) := mk {} ::
(tokens : list token_config)

@[noinline] def tokens (r : ρ) [has_tokens r] :=
has_tokens.tokens r

instance has_tokens.inhabited (r : ρ) : inhabited (has_tokens r) :=
⟨⟨[]⟩⟩

@[nospecialize]
instance list.nil.tokens : parser.has_tokens ([] : list ρ) :=
default _

@[nospecialize]
instance list.cons.tokens (r : ρ) (rs : list ρ) [parser.has_tokens r] [parser.has_tokens rs] :
  parser.has_tokens (r::rs) :=
⟨tokens r ++ tokens rs⟩

class has_view (r : ρ) (α : out_param Type) :=
(view : syntax → α)
(review : α → syntax)

export has_view (view review)

def try_view {α : Type} (k : syntax_node_kind) [has_view k α] (stx : syntax) : option α :=
if stx.is_of_kind k then some (has_view.view k stx) else none

instance has_view.default (r : ρ) : inhabited (parser.has_view r syntax) :=
⟨{ view := id, review := id }⟩

class has_view_default (r : ρ) (α : out_param Type) [has_view r α] (default : α) := mk {}

def message_of_parsec_message {μ : Type} (cfg : parser_config) (msg : parsec.message μ) : message :=
-- FIXME: translate position
{filename := cfg.filename, pos := ⟨0, 0⟩, text := to_string msg}

/-- Run parser stack, returning a partial syntax tree in case of a fatal error -/
protected def run {m : Type → Type} {α ρ : Type} [monad m] [has_coe_t ρ parser_config] (cfg : ρ) (s : string) (r : parser_t ρ m α) :
m (sum α syntax × message_log) :=
do (r, _) ← (((r.run cfg).run {messages:=message_log.empty}).parse s).run {},
pure $ match r with
| except.ok (a, st) := (sum.inl a, st.messages)
| except.error msg  := (sum.inr msg.custom, message_log.empty.add (message_of_parsec_message cfg msg))

open coroutine
open monad_parsec
open parser.has_view
variables {α : Type} {m : Type → Type}
local notation `parser` := m syntax

def log_message {μ : Type} [monad m] [monad_reader ρ m] [has_lift_t ρ parser_config] [monad_state parser_state m]
  (msg : parsec.message μ) : m unit :=
do cfg ← read,
   modify (λ st, {st with messages := st.messages.add (message_of_parsec_message ↑cfg msg)})

def mk_token_trie (tokens : list token_config) : except string (trie token_config) :=
do -- the only hardcoded tokens, because they are never directly mentioned by a `parser`
   let builtin_tokens : list token_config := [{«prefix» := "/-"}, {«prefix» := "--"}],
   t ← (builtin_tokens ++ tokens).mfoldl (λ (t : trie token_config) tk,
     match t.find tk.prefix with
     | some tk' := (match tk.lbp, tk'.lbp with
       | l, 0  := pure $ t.insert tk.prefix tk
       | 0, _  := pure t
       | l, l' := if l = l' then pure t else throw $
         "invalid token '" ++ tk.prefix ++ "', has been defined with precedences " ++
         to_string l ++ " and " ++ to_string l')
     | none := pure $ t.insert tk.prefix tk)
     trie.mk,
   pure t

structure parse.view_ty :=
(root : syntax)
(eoi  : syntax)

def parse.view : syntax → option parse.view_ty
| (syntax.node ⟨none, [root, eoi]⟩) := some ⟨root, eoi⟩
| _ := none


/- Monad stacks used in multiple files -/

/- NOTE: We move `rec_t` under `parser_t`'s `reader_t` so that `term_parser`, which does not
   have access to `command_parser`'s ρ (=`command_parser_config`) can still recurse into it
   (for command quotations). This means that the `command_parser_config` will be reset
   on a recursive call to `command.parser`, i.e. it forgets about locally registered parsers,
   but that's not an issue for our intended uses of it. -/
@[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec]
def command_parser_m (ρ : Type) := reader_t ρ $ rec_t unit syntax $ parser_core_t id

section
local attribute [reducible] parser_t command_parser_m
instance command_parser_m.monad_reader_adapter (ρ ρ' : Type) :
  monad_reader_adapter ρ ρ' (command_parser_m ρ) (command_parser_m ρ') :=
infer_instance
instance command_parser_m.basic_parser (ρ : Type) [has_lift_t ρ parser_config] : monad_basic_parser (command_parser_m ρ) :=
⟨λ _ x cfg rec, x.run ↑cfg⟩
end

/- The `nat` at `rec_t` is the lbp` -/
@[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec monad_basic_parser]
def term_parser_m := rec_t nat syntax $ command_parser_m parser_config
abbreviation term_parser := term_parser_m syntax

/-- A term parser for a suffix or infix notation that accepts a preceding term. -/
@[derive monad alternative monad_reader monad_state monad_parsec monad_except monad_rec monad_basic_parser]
def trailing_term_parser_m := reader_t syntax term_parser_m
abbreviation trailing_term_parser := trailing_term_parser_m syntax

instance trailing_term_parser_coe : has_coe term_parser trailing_term_parser :=
⟨λ x _, x⟩

local attribute [instance] name.has_lt_quick
/-- A multimap indexed by tokens. Used for indexing parsers by their leading token. -/
def token_map (α : Type) := rbmap name (list α) (<)

def token_map.insert {α : Type} (map : token_map α) (k : name) (v : α) : token_map α :=
match map.find k with
| none    := map.insert k [v]
| some vs := map.insert k (v::vs)

def token_map.of_list {α : Type} : list (name × α) → token_map α
| []          := mk_rbmap _ _ _
| (⟨k,v⟩::xs) := (token_map.of_list xs).insert k v

@[nospecialize]
instance token_map_nil.tokens : parser.has_tokens $ @token_map.of_list ρ [] :=
default _

@[nospecialize]
instance token_map_cons.tokens (k : name) (r : ρ) (rs : list (name × ρ)) [parser.has_tokens r] [parser.has_tokens $ token_map.of_list rs] :
  parser.has_tokens $ token_map.of_list ((k,r)::rs) :=
⟨tokens r ++ tokens (token_map.of_list rs)⟩

-- This needs to be a separate structure since `term_parser`s cannot contain themselves in their config
structure command_parser_config extends parser_config :=
(leading_term_parsers : token_map term_parser)
(trailing_term_parsers : token_map trailing_term_parser)

instance command_parser_config_coe_parser_config : has_coe command_parser_config parser_config :=
⟨command_parser_config.to_parser_config⟩

abbreviation command_parser := command_parser_m command_parser_config syntax

end «parser»
end lean