lean4-htt/src/frontends/lean/scanner.cpp

/*
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.

Author: Leonardo de Moura
*/
#include <cctype>
#include <string>
#include "util/exception.h"
#include "util/utf8.h"
#include "library/type_context.h"
#include "library/message_builder.h"
#include "frontends/lean/scanner.h"
#include "frontends/lean/tokens.h"
#include "frontends/lean/parser_config.h"

namespace lean {
unsigned scanner::get_utf8_size(unsigned char c) {
    unsigned r = ::lean::get_utf8_size(c);
    if (r == 0)
        throw_exception("invalid utf-8 head character");
    return r;
}

unsigned char to_uchar(char c) { return static_cast<unsigned char>(c); }

unsigned utf8_to_unicode(char const * begin, char const * end) {
    unsigned result = 0;
    if (begin == end)
        return result;
    auto it = begin;
    unsigned c = to_uchar(*it);
    ++it;
    if (c < 128)
        return c;
    unsigned mask     = (1u << 6) -1;
    unsigned hmask    = mask;
    unsigned shift    = 0;
    unsigned num_bits = 0;
    while ((c & 0xC0) == 0xC0) {
        c <<= 1;
        c &= 0xff;
        num_bits += 6;
        hmask >>= 1;
        shift++;
        result <<= 6;
        if (it == end)
            return 0;
        result |= *it & mask;
        ++it;
    }
    result |= ((c >> shift) & hmask) << num_bits;
    return result;
}

bool is_greek_unicode(unsigned u) { return 0x391 <= u && u <= 0x3DD; }
bool is_letter_like_unicode(unsigned u) {
    return
        (0x3b1  <= u && u <= 0x3c9 && u != 0x3bb) || // Lower greek, but lambda
        (0x391  <= u && u <= 0x3A9 && u != 0x3A0 && u != 0x3A3) || // Upper greek, but Pi and Sigma
        (0x3ca  <= u && u <= 0x3fb) ||               // Coptic letters
        (0x1f00 <= u && u <= 0x1ffe) ||              // Polytonic Greek Extended Character Set
        (0x2100 <= u && u <= 0x214f);                // Letter like block
}
bool is_sub_script_alnum_unicode(unsigned u) {
    return
        (0x207f <= u && u <= 0x2089) || // n superscript and numberic subscripts
        (0x2090 <= u && u <= 0x209c);   // letter-like subscripts
}

void scanner::fetch_line() {
    m_curr_line.clear();
    if (std::getline(m_stream, m_curr_line)) {
        m_curr_line.push_back('\n');
        m_sline++;
        m_spos  = 0;
        m_upos  = 0;
        m_curr  = m_curr_line[m_spos];
        m_uskip = get_utf8_size(m_curr);
        m_uskip--;
    } else {
        m_last_line = true;
        m_curr      = EOF;
    }
}

void scanner::next() {
    lean_assert(m_curr != EOF);
    m_spos++;
    while (m_spos >= static_cast<int>(m_curr_line.size())) {
        if (m_last_line) {
            m_curr = EOF;
            return;
        } else {
            return fetch_line();
        }
    }
    m_curr = m_curr_line[m_spos];
    if (m_uskip > 0) {
        if (!is_utf8_next(m_curr))
            throw_exception("invalid utf-8 sequence character");
        m_uskip--;
    } else {
        m_upos++;
        m_uskip = get_utf8_size(m_curr);
        m_uskip--;
    }
}

void scanner::check_not_eof(char const * error_msg) {
    if (curr() == EOF) throw_exception(error_msg);
}

[[ noreturn ]] void scanner::throw_exception(char const * msg) {
    pos_info pos = {m_sline, m_upos};
    while (curr() != EOF && !std::isspace(curr()))
        next();
    throw parser_exception(msg, m_stream_name.c_str(), pos);
}

static char const * g_end_error_str_msg = "unexpected end of string";
static char const * g_end_error_char_msg = "unexpected end of character";

optional<unsigned> scanner::try_hex_to_unsigned(char c) {
    if ('0' <= c && c <= '9')
        return optional<unsigned>(c - '0');
    else if ('a' <= c && c <= 'f')
        return optional<unsigned>(10 + c - 'a');
    else if ('A' <= c && c <= 'F')
        return optional<unsigned>(10 + c - 'A');
    else
        return optional<unsigned>();
}

unsigned scanner::hex_to_unsigned(char c) {
    if (auto r = try_hex_to_unsigned(c))
        return *r;
    else
        throw_exception("invalid hexadecimal digit");
}

char scanner::read_quoted_char(char const * error_msg) {
    lean_assert(curr() == '\\');
    next();
    check_not_eof(error_msg);
    char c = curr();
    if (c != '\\' && c != '\"' && c != 'n' && c != 't' && c != '\'' && c != 'x')
        throw_exception("invalid escape sequence");
    if (c == 'n') {
        return '\n';
    } else if (c == 't') {
        return '\t';
    } else if (c == 'x') {
        next();
        char c = curr();
        unsigned v = hex_to_unsigned(c);
        next();
        c = curr();
        v = 16*v + hex_to_unsigned(c);
        return static_cast<char>(v);
    } else {
        return c;
    }
}

token_kind scanner::read_string() {
    lean_assert(curr() == '\"');
    next();
    m_buffer.clear();
    while (true) {
        check_not_eof(g_end_error_str_msg);
        char c = curr();
        if (c == '\"') {
            next();
            return token_kind::String;
        } else if (c == '\\') {
            c = read_quoted_char(g_end_error_str_msg);
        }
        m_buffer += c;
        next();
    }
}

auto scanner::read_char() -> token_kind {
    lean_assert(curr() == '\"');
    next();
    char c = curr();
    if (c == '\\')
        c = read_quoted_char(g_end_error_char_msg);
    next();
    if (curr() != '"')
        throw_exception("invalid character, \" expected");
    next();
    m_buffer.clear();
    m_buffer += c;
    return token_kind::Char;
}

auto scanner::read_quoted_symbol() -> token_kind {
    lean_assert(curr() == '`');
    next();
    if (std::isdigit(curr()))
        throw_exception("first character of a quoted symbols cannot be a digit");
    m_buffer.clear();
    bool start = true;
    bool trailing_space = false;
    while (true) {
        check_not_eof("unexpected quoted identifier");
        char c = curr();
        next();
        switch (c) {
            case '`':
                if (start)
                    throw_exception("unexpected end of quoted symbol");
                m_name_val = name(m_buffer.c_str());
                return token_kind::QuotedSymbol;
            case '\"':
            case '\n':
            case '\t':
                throw_exception("invalid quoted symbol, invalid character");
            case ' ':
                if (!start)
                    trailing_space = true;
                m_buffer += c;
                break;
            default:
                start = false;
                if (trailing_space)
                    throw_exception("unexpected space inside of quoted symbol");
                m_buffer += c;
        }
    }
}

bool scanner::is_next_digit() {
    lean_assert(curr() != EOF);
    if (m_spos + 1 < static_cast<int>(m_curr_line.size()))
        return std::isdigit(m_curr_line[m_spos+1]);
    else
        return false;
}

auto scanner::read_hex_number() -> token_kind {
    lean_assert(curr() == 'x');
    next();
    m_num_val = 0;
    bool found = false;
    while (true) {
        char c = curr();
        if (auto d = try_hex_to_unsigned(c)) {
            found = true;
            m_num_val = 16 * m_num_val + *d;
            next();
        } else {
            break;
        }
    }
    if (!found)
        throw exception("invalid hexadecimal numeral, hexadecimal digit expected");
    return token_kind::Decimal;
}

optional<unsigned> scanner::try_digit_to_unsigned(int base, char c) {
    lean_assert(base == 2 || base == 8 || base == 10 || base == 16);
    if ('0' <= c && c <= '9') {
        if (base == 2 && c >= '2')
            throw_exception("invalid binary digit");
        if (base == 8 && c >= '8')
            throw_exception("invalid octal digit");
        return optional<unsigned>(c - '0');
    } else if (base == 16 && 'a' <= c && c <= 'f') {
        return optional<unsigned>(10 + c - 'a');
    } else if (base == 16 && 'A' <= c && c <= 'F') {
        return optional<unsigned>(10 + c - 'A');
    }
    return optional<unsigned>();
}

auto scanner::read_number() -> token_kind {
    lean_assert('0' <= curr() && curr() <= '9');
    mpq q(1);
    char c = curr();
    next();
    m_num_val = c - '0';

    // Check for alternate base when first digit is zero.
    int base = 10;
    if (m_num_val == 0) {
        switch (curr()) {
        case 'b':
        case 'B':
            base = 2;
            next();
            break;
        case 'o':
        case 'O':
            base = 8;
            next();
            break;
        case 'x':
        case 'X':
            base = 16;
            next();
            break;
        }
        // Read digit after prefix.
        if (base != 10) {
            if (auto r = try_digit_to_unsigned(base, curr())) {
                next();
                m_num_val = *r;
            } else {
                throw_exception("invalid numeral, expected digit after base prefix");
            }
        }
    }

    bool is_post_decimal = false; // Encountered a decimal point (requires base = 10)

    while (true) {
        c = curr();
        if (auto r = try_digit_to_unsigned(base, c)) {
            m_num_val = base*m_num_val + *r;
            if (is_post_decimal)
                q *= 10;
            next();
        } else if (base == 10 && c == '.') {
            // Num. is not a decimal. It should be at least Num.0
            if (is_next_digit()) {
                if (is_post_decimal)
                    break;
                is_post_decimal = true;
                next();
            } else {
                break;
            }
        } else {
            break;
        }
    }
    if (is_post_decimal)
        m_num_val /= q;
    return is_post_decimal ? token_kind::Decimal : token_kind::Numeral;
}

void scanner::read_single_line_comment() {
    while (true) {
        if (curr() == '\n') {
            next();
            return;
        } else if (curr() == EOF) {
            return;
        } else {
            next();
        }
    }
}

void scanner::read_doc_block_core() {
    m_buffer.clear();
    while (true) {
        check_not_eof("unexpected end of documentation block");
        char c = curr();
        next();
        if (c == '-') {
            if (curr() == '/') {
                next();
                return;
            }
        }
        m_buffer += c;
    }
}

auto scanner::read_doc_block() -> token_kind {
    read_doc_block_core();
    return token_kind::DocBlock;
}

auto scanner::read_mod_doc_block() -> token_kind {
    read_doc_block_core();
    return token_kind::ModDocBlock;
}

void scanner::read_comment_block() {
    unsigned nesting = 1;
    while (true) {
        char c = curr();
        check_not_eof("unexpected end of comment block");
        next();
        if (c == '/') {
            if (curr() == '-') {
                next();
                nesting++;
            }
        } else if (c == '-') {
            if (curr() == '/') {
                next();
                nesting--;
                if (nesting == 0)
                    return;
            }
        }
    }
}

// Read until the end_str is found, store all characters (not including end_str) in m_buffer.
// Throw a parser exception error_msg if end of file is found before end_str.
void scanner::read_until(char const * end_str, char const * error_msg) {
    lean_assert(end_str);
    lean_assert(end_str[0]);
    m_buffer.clear();
    while (true) {
        check_not_eof(error_msg);
        char c = curr_next();
        if (c == end_str[0]) {
            m_aux_buffer.clear();
            m_aux_buffer += c;
            unsigned i = 1;
            while (true) {
                if (!end_str[i])
                    return;
                check_not_eof(error_msg);
                c = curr_next();
                if (c != end_str[i]) {
                    m_buffer += m_aux_buffer;
                    break;
                }
                i++;
            }
        } else {
            m_buffer += c;
        }
    }
}

void scanner::move_back(unsigned offset, unsigned u_offset) {
    lean_assert(m_uskip == 0);
    if (offset != 0) {
        if (curr() == EOF) {
            m_curr = 0;
            m_spos--;
            m_upos--;
            offset--;
            u_offset--;
        }
        if (offset != 0) {
            m_spos -= offset;
            m_upos -= u_offset;
        }
        next();
    }
}

void scanner::next_utf_core(char c, buffer<char> & cs) {
    cs.push_back(c);
    while (m_uskip > 0) {
        next();
        cs.push_back(curr());
    }
}

void scanner::next_utf(buffer<char> & cs) {
    next();
    next_utf_core(curr(), cs);
}

static bool is_id_first(buffer<char> const & cs, unsigned i) {
    if (std::isalpha(cs[i]) || cs[i] == '_' || cs[i] == '\'')
        return true;
    unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
    return is_letter_like_unicode(u);
}

bool is_id_rest(char const * begin, char const * end) {
    if (std::isalnum(*begin) || *begin == '_' || *begin == '\'')
        return true;
    unsigned u = utf8_to_unicode(begin, end);
    return is_letter_like_unicode(u) || is_sub_script_alnum_unicode(u);
}

static char const * g_error_key_msg = "unexpected token";

auto scanner::read_key_cmd_id() -> token_kind {
    buffer<char> cs;
    next_utf_core(curr(), cs);
    unsigned num_utfs  = 1;
    unsigned id_sz     = 0;
    unsigned id_utf_sz = 0;
    if (is_id_first(cs, 0)) {
        id_sz = cs.size();
        while (true) {
            id_sz     = cs.size();
            id_utf_sz = num_utfs;
            unsigned i = id_sz;
            next_utf(cs);
            num_utfs++;
            if (is_id_rest(&cs[i], cs.end())) {
            } else if (cs[i] == '.') {
                next_utf(cs);
                num_utfs++;
                if (!is_id_first(cs, i+1))
                    break;
            } else {
                break;
            }
        }
    }
    unsigned i = 0;
    token_table const * it  = m_tokens;
    token_info const * info = nullptr;
    unsigned key_sz       = 0;
    unsigned key_utf_sz   = 0;
    while (i < id_sz) {
        it = find(*it, cs[i]);
        i++;
        if (!it)
            break;
        if (auto new_info = value_of(*it)) {
            lean_assert(m_uskip == 0);
            info       = new_info;
            key_sz     = i;
            key_utf_sz = id_utf_sz; // this is imprecise if key_sz < id_sz, but this case is irrelevant
        }
    }

    while (it) {
        if (i == cs.size()) {
            next_utf(cs);
            num_utfs++;
        }
        it = find(*it, cs[i]);
        i++;
        if (!it)
            break;
        if (auto new_info = value_of(*it)) {
            lean_assert(m_uskip == 0);
            info       = new_info;
            key_sz     = i;
            key_utf_sz = num_utfs;
            lean_assert(key_sz > id_sz);
        }
    }

    if (id_sz == 0 && key_sz == 0)
        throw_exception(g_error_key_msg);
    if (id_sz > key_sz) {
        move_back(cs.size() - id_sz, num_utfs - id_utf_sz);
        m_name_val = name();
        std::string & id_part = m_buffer;
        id_part.clear();
        for (unsigned i = 0; i < id_sz; i++) {
            if (cs[i] == '.') {
                m_name_val = name(m_name_val, id_part.c_str());
                id_part.clear();
            } else {
                id_part.push_back(cs[i]);
            }
        }
        m_name_val = name(m_name_val, id_part.c_str());
        return token_kind::Identifier;
    } else {
        move_back(cs.size() - key_sz, num_utfs - key_utf_sz);
        m_token_info = *info;
        return m_token_info.is_command() ? token_kind::CommandKeyword : token_kind::Keyword;
    }
}

static name * g_begin_comment_tk        = nullptr;
static name * g_begin_comment_block_tk  = nullptr;
static name * g_begin_doc_block_tk      = nullptr;
static name * g_begin_mod_doc_block_tk  = nullptr;
static name * g_pound_tk                = nullptr;

void initialize_scanner() {
    g_begin_comment_tk       = new name("--");
    g_begin_comment_block_tk = new name("/-");
    g_begin_doc_block_tk     = new name("/--");
    g_begin_mod_doc_block_tk = new name("/-!");
    g_pound_tk               = new name("#");
}

void finalize_scanner() {
    delete g_begin_comment_tk;
    delete g_begin_comment_block_tk;
    delete g_begin_doc_block_tk;
    delete g_begin_mod_doc_block_tk;
    delete g_pound_tk;
}

auto scanner::scan(environment const & env) -> token_kind {
    m_tokens = &get_token_table(env);
    while (true) {
        char c = curr();
        m_pos  = m_upos;
        m_line = m_sline;
        switch (c) {
        case ' ': case '\r': case '\t': case '\n':
            next();
            break;
        case '\"':
            return read_string();
        case '`':
            if (m_in_notation) {
                return read_quoted_symbol();
            } else {
                return read_key_cmd_id();
            }
        case -1:
            return token_kind::Eof;
        default:
            if (std::isdigit(c)) {
                return read_number();
            } else {
                token_kind k = read_key_cmd_id();
                if (k == token_kind::Keyword) {
                    // We treat '/-', '/--', '/-!', '--' as "keyword".
                    name const & n = m_token_info.value();
                    if (n == *g_begin_comment_tk)
                        read_single_line_comment();
                    else if (n == *g_begin_comment_block_tk)
                        read_comment_block();
                    else if (n == *g_begin_doc_block_tk)
                        return read_doc_block();
                    else if (n == *g_begin_mod_doc_block_tk)
                        return read_mod_doc_block();
                    else if (n == *g_pound_tk && curr() == '"')
                        return read_char();
                    else
                        return k;
                } else {
                    return k;
                }
            }
        }
    }
}

scanner::scanner(std::istream & strm, char const * strm_name):
    m_tokens(nullptr), m_stream(strm) {
    m_stream_name = strm_name ? strm_name : "[unknown]";
    m_sline = 0;
    m_spos  = 0;
    m_upos  = 0;
    m_uskip = 0;
    m_in_notation = false;
    m_last_line = false;
    fetch_line();
    m_line = m_sline;
    m_pos = 0;
}

scanner::scanner(std::istream & strm, char const * strm_name, pos_info const & skip_to_pos) :
        scanner(strm, strm_name) {
    for (unsigned line_no = 1; line_no < skip_to_pos.first; line_no++)
        fetch_line();
    m_line = m_sline;
    for (unsigned col_idx = 0; col_idx < skip_to_pos.second; col_idx++)
        next();
}

std::ostream & operator<<(std::ostream & out, token_kind k) {
    out << static_cast<unsigned>(k);
    return out;
}

token::token(pos_info const & p):m_kind(token_kind::Eof), m_pos(p) {}
token::token(token_kind k, pos_info const & p, token_info const & info):
    m_kind(k), m_pos(p), m_info(new token_info(info)) {
    lean_assert(m_kind == token_kind::Keyword || m_kind == token_kind::CommandKeyword);
}
token::token(token_kind k, pos_info const & p, name const & v):
    m_kind(k), m_pos(p), m_name_val(new name(v)) {
    lean_assert(m_kind == token_kind::QuotedSymbol || m_kind == token_kind::Identifier);
}
token::token(token_kind k, pos_info const & p, mpq const & v):
    m_kind(k), m_pos(p), m_num_val(new mpq (v)) {
    lean_assert(m_kind == token_kind::Decimal || m_kind == token_kind::Numeral);
}

token::token(token_kind k, pos_info const & p, std::string const & v):
    m_kind(k), m_pos(p), m_str_val(new std::string(v)) {
    lean_assert(m_kind == token_kind::String || m_kind == token_kind::Char ||
                m_kind == token_kind::DocBlock || m_kind == token_kind::ModDocBlock);
}

void token::dealloc() {
    switch (kind()) {
    case token_kind::Eof:
        return;
    case token_kind::Keyword: case token_kind::CommandKeyword:
        if (m_info != nullptr) delete m_info;
        return;
    case token_kind::Identifier: case token_kind::QuotedSymbol:
        if (m_name_val != nullptr) delete m_name_val;
        return;
    case token_kind::Numeral: case token_kind::Decimal:
        if (m_num_val != nullptr) delete m_num_val;
        return;
    case token_kind::String: case token_kind::Char:
    case token_kind::DocBlock: case token_kind::ModDocBlock:
        if (m_str_val != nullptr) delete m_str_val;
        return;
    }
}

void token::copy(token const & tk) {
    m_kind = tk.m_kind;
    m_pos  = tk.m_pos;
    switch (tk.kind()) {
    case token_kind::Eof:
        return;
    case token_kind::Keyword: case token_kind::CommandKeyword:
        m_info = new token_info(*tk.m_info);
        return;
    case token_kind::Identifier: case token_kind::QuotedSymbol:
        m_name_val = new name(*tk.m_name_val);
        return;
    case token_kind::Numeral: case token_kind::Decimal:
        m_num_val = new mpq(*tk.m_num_val);
        return;
    case token_kind::String: case token_kind::Char:
    case token_kind::DocBlock: case token_kind::ModDocBlock:
        m_str_val = new std::string(*tk.m_str_val);
        return;
    }
}

void token::steal(token && tk) {
    m_kind = tk.m_kind;
    m_pos  = tk.m_pos;
    switch (tk.kind()) {
    case token_kind::Eof:
        return;
    case token_kind::Keyword: case token_kind::CommandKeyword:
        m_info    = tk.m_info;
        tk.m_info = nullptr;
        return;
    case token_kind::Identifier: case token_kind::QuotedSymbol:
        m_name_val    = tk.m_name_val;
        tk.m_name_val = nullptr;
        return;
    case token_kind::Numeral: case token_kind::Decimal:
        m_num_val    = tk.m_num_val;
        tk.m_num_val = nullptr;
        return;
    case token_kind::String: case token_kind::Char:
    case token_kind::DocBlock: case token_kind::ModDocBlock:
        m_str_val    = tk.m_str_val;
        tk.m_str_val = nullptr;
        return;
    }
}

token read_tokens(environment & env, io_state const & ios, scanner & s, buffer<token> & tk, bool use_exceptions) {
    while (true) {
        try {
            token_kind k = s.scan(env);
            switch (k) {
            case token_kind::Eof:
                return token(s.get_pos_info());
            case token_kind::Keyword:
                tk.push_back(token(k, s.get_pos_info(), s.get_token_info()));
                break;
            case token_kind::CommandKeyword:
                if (s.get_token_info().value() == get_end_tk())
                    tk.push_back(token(k, s.get_pos_info(), s.get_token_info()));
                else
                    return token(k, s.get_pos_info(), s.get_token_info());
                break;
            case token_kind::Identifier: case token_kind::QuotedSymbol:
                tk.push_back(token(k, s.get_pos_info(), s.get_name_val()));
                break;
            case token_kind::Numeral: case token_kind::Decimal:
                tk.push_back(token(k, s.get_pos_info(), s.get_num_val()));
                break;
            case token_kind::String: case token_kind::Char:
            case token_kind::DocBlock: case token_kind::ModDocBlock:
                tk.push_back(token(k, s.get_pos_info(), s.get_str_val()));
                break;
            }
        } catch (throwable & ex) {
            if (use_exceptions) {
                throw;
            } else {
                message_builder builder(env, ios, s.get_stream_name(), s.get_pos_info(), ERROR);
                builder.set_exception(ex);
                builder.report();
            }
        }
    }
}
}