lean4-htt/src/frontends/lean/scanner.cpp
Leonardo de Moura a0a8103804 chore(frontends/lean): go back to 'c' as notation for characters
This suggestion has been discussed at Slack.
We have decided to use #"c" as notation because we wanted to allow `'`
in the beginning of identifiers like in SML and F*. In particular,
we wanted to allow users to use 'a 'b 'c for naming type parameters
like in SML. However, nobody used this notation. In the Lean standard
library, we are using greek letters for naming type parameters.
So, there is no real motivation for the ugly #"c" syntax.
2017-05-02 13:00:51 -07:00

844 lines
25 KiB
C++

/*
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Leonardo de Moura
*/
#include <cctype>
#include <string>
#include "util/exception.h"
#include "util/utf8.h"
#include "library/type_context.h"
#include "library/message_builder.h"
#include "frontends/lean/scanner.h"
#include "frontends/lean/tokens.h"
#include "frontends/lean/parser_config.h"
namespace lean {
unsigned scanner::get_utf8_size(unsigned char c) {
unsigned r = ::lean::get_utf8_size(c);
if (r == 0)
throw_exception("invalid utf-8 head character");
return r;
}
unsigned char to_uchar(char c) { return static_cast<unsigned char>(c); }
unsigned utf8_to_unicode(char const * begin, char const * end) {
unsigned result = 0;
if (begin == end)
return result;
auto it = begin;
unsigned c = to_uchar(*it);
++it;
if (c < 128)
return c;
unsigned mask = (1u << 6) -1;
unsigned hmask = mask;
unsigned shift = 0;
unsigned num_bits = 0;
while ((c & 0xC0) == 0xC0) {
c <<= 1;
c &= 0xff;
num_bits += 6;
hmask >>= 1;
shift++;
result <<= 6;
if (it == end)
return 0;
result |= *it & mask;
++it;
}
result |= ((c >> shift) & hmask) << num_bits;
return result;
}
bool is_greek_unicode(unsigned u) { return 0x391 <= u && u <= 0x3DD; }
bool is_letter_like_unicode(unsigned u) {
return
(0x3b1 <= u && u <= 0x3c9 && u != 0x3bb) || // Lower greek, but lambda
(0x391 <= u && u <= 0x3A9 && u != 0x3A0 && u != 0x3A3) || // Upper greek, but Pi and Sigma
(0x3ca <= u && u <= 0x3fb) || // Coptic letters
(0x1f00 <= u && u <= 0x1ffe) || // Polytonic Greek Extended Character Set
(0x2100 <= u && u <= 0x214f); // Letter like block
}
bool is_sub_script_alnum_unicode(unsigned u) {
return
(0x207f <= u && u <= 0x2089) || // n superscript and numberic subscripts
(0x2090 <= u && u <= 0x209c); // letter-like subscripts
}
void scanner::fetch_line() {
m_curr_line.clear();
if (std::getline(m_stream, m_curr_line)) {
m_curr_line.push_back('\n');
m_sline++;
m_spos = 0;
m_upos = 0;
m_curr = m_curr_line[m_spos];
m_uskip = get_utf8_size(m_curr);
m_uskip--;
} else {
m_last_line = true;
m_curr = EOF;
}
}
void scanner::next() {
lean_assert(m_curr != EOF);
m_spos++;
while (m_spos >= static_cast<int>(m_curr_line.size())) {
if (m_last_line) {
m_curr = EOF;
return;
} else {
return fetch_line();
}
}
m_curr = m_curr_line[m_spos];
if (m_uskip > 0) {
if (!is_utf8_next(m_curr))
throw_exception("invalid utf-8 sequence character");
m_uskip--;
} else {
m_upos++;
m_uskip = get_utf8_size(m_curr);
m_uskip--;
}
}
void scanner::check_not_eof(char const * error_msg) {
if (curr() == EOF) throw_exception(error_msg);
}
[[ noreturn ]] void scanner::throw_exception(char const * msg) {
pos_info pos = {m_sline, m_upos};
while (curr() != EOF && !std::isspace(curr()))
next();
throw parser_exception(msg, m_stream_name.c_str(), pos);
}
static char const * g_end_error_str_msg = "unexpected end of string";
static char const * g_end_error_char_msg = "unexpected end of character";
optional<unsigned> scanner::try_hex_to_unsigned(char c) {
if ('0' <= c && c <= '9')
return optional<unsigned>(c - '0');
else if ('a' <= c && c <= 'f')
return optional<unsigned>(10 + c - 'a');
else if ('A' <= c && c <= 'F')
return optional<unsigned>(10 + c - 'A');
else
return optional<unsigned>();
}
unsigned scanner::hex_to_unsigned(char c) {
if (auto r = try_hex_to_unsigned(c))
return *r;
else
throw_exception("invalid hexadecimal digit");
}
char scanner::read_quoted_char(char const * error_msg) {
lean_assert(curr() == '\\');
next();
check_not_eof(error_msg);
char c = curr();
if (c != '\\' && c != '\"' && c != 'n' && c != 't' && c != '\'' && c != 'x')
throw_exception("invalid escape sequence");
if (c == 'n') {
return '\n';
} else if (c == 't') {
return '\t';
} else if (c == 'x') {
next();
char c = curr();
unsigned v = hex_to_unsigned(c);
next();
c = curr();
v = 16*v + hex_to_unsigned(c);
return static_cast<char>(v);
} else {
return c;
}
}
token_kind scanner::read_string() {
lean_assert(curr() == '\"');
next();
m_buffer.clear();
while (true) {
check_not_eof(g_end_error_str_msg);
char c = curr();
if (c == '\"') {
next();
return token_kind::String;
} else if (c == '\\') {
c = read_quoted_char(g_end_error_str_msg);
}
m_buffer += c;
next();
}
}
auto scanner::read_char() -> token_kind {
char c = curr();
if (c == '\\')
c = read_quoted_char(g_end_error_char_msg);
next();
if (curr() != '\'')
throw_exception("invalid character, ' expected");
next();
m_buffer.clear();
m_buffer += c;
return token_kind::Char;
}
auto scanner::read_quoted_symbol() -> token_kind {
lean_assert(curr() == '`');
next();
if (std::isdigit(curr()))
throw_exception("first character of a quoted symbols cannot be a digit");
m_buffer.clear();
bool start = true;
bool trailing_space = false;
while (true) {
check_not_eof("unexpected quoted identifier");
char c = curr();
next();
switch (c) {
case '`':
if (start)
throw_exception("unexpected end of quoted symbol");
m_name_val = name(m_buffer.c_str());
return token_kind::QuotedSymbol;
case '\"':
case '\n':
case '\t':
throw_exception("invalid quoted symbol, invalid character");
case ' ':
if (!start)
trailing_space = true;
m_buffer += c;
break;
default:
start = false;
if (trailing_space)
throw_exception("unexpected space inside of quoted symbol");
m_buffer += c;
}
}
}
bool scanner::is_next_digit() {
lean_assert(curr() != EOF);
if (m_spos + 1 < static_cast<int>(m_curr_line.size()))
return std::isdigit(m_curr_line[m_spos+1]);
else
return false;
}
auto scanner::read_hex_number() -> token_kind {
lean_assert(curr() == 'x');
next();
m_num_val = 0;
bool found = false;
while (true) {
char c = curr();
if (auto d = try_hex_to_unsigned(c)) {
found = true;
m_num_val = 16 * m_num_val + *d;
next();
} else {
break;
}
}
if (!found)
throw exception("invalid hexadecimal numeral, hexadecimal digit expected");
return token_kind::Decimal;
}
optional<unsigned> scanner::try_digit_to_unsigned(int base, char c) {
lean_assert(base == 2 || base == 8 || base == 10 || base == 16);
if ('0' <= c && c <= '9') {
if (base == 2 && c >= '2')
throw_exception("invalid binary digit");
if (base == 8 && c >= '8')
throw_exception("invalid octal digit");
return optional<unsigned>(c - '0');
} else if (base == 16 && 'a' <= c && c <= 'f') {
return optional<unsigned>(10 + c - 'a');
} else if (base == 16 && 'A' <= c && c <= 'F') {
return optional<unsigned>(10 + c - 'A');
}
return optional<unsigned>();
}
auto scanner::read_number() -> token_kind {
lean_assert('0' <= curr() && curr() <= '9');
mpq q(1);
char c = curr();
next();
m_num_val = c - '0';
// Check for alternate base when first digit is zero.
int base = 10;
if (m_num_val == 0) {
switch (curr()) {
case 'b':
case 'B':
base = 2;
next();
break;
case 'o':
case 'O':
base = 8;
next();
break;
case 'x':
case 'X':
base = 16;
next();
break;
}
// Read digit after prefix.
if (base != 10) {
if (auto r = try_digit_to_unsigned(base, curr())) {
next();
m_num_val = *r;
} else {
throw_exception("invalid numeral, expected digit after base prefix");
}
}
}
bool is_post_decimal = false; // Encountered a decimal point (requires base = 10)
while (true) {
c = curr();
if (auto r = try_digit_to_unsigned(base, c)) {
m_num_val = base*m_num_val + *r;
if (is_post_decimal)
q *= 10;
next();
} else if (base == 10 && c == '.') {
// Num. is not a decimal. It should be at least Num.0
if (is_next_digit()) {
if (is_post_decimal)
break;
is_post_decimal = true;
next();
} else {
break;
}
} else {
break;
}
}
if (is_post_decimal)
m_num_val /= q;
return is_post_decimal ? token_kind::Decimal : token_kind::Numeral;
}
void scanner::read_single_line_comment() {
while (true) {
if (curr() == '\n') {
next();
return;
} else if (curr() == EOF) {
return;
} else {
next();
}
}
}
void scanner::read_doc_block_core() {
m_buffer.clear();
while (true) {
check_not_eof("unexpected end of documentation block");
char c = curr();
next();
if (c == '-') {
if (curr() == '/') {
next();
return;
}
}
m_buffer += c;
}
}
auto scanner::read_doc_block() -> token_kind {
read_doc_block_core();
return token_kind::DocBlock;
}
auto scanner::read_mod_doc_block() -> token_kind {
read_doc_block_core();
return token_kind::ModDocBlock;
}
void scanner::read_comment_block() {
unsigned nesting = 1;
while (true) {
char c = curr();
check_not_eof("unexpected end of comment block");
next();
if (c == '/') {
if (curr() == '-') {
next();
nesting++;
}
} else if (c == '-') {
if (curr() == '/') {
next();
nesting--;
if (nesting == 0)
return;
}
}
}
}
// Read until the end_str is found, store all characters (not including end_str) in m_buffer.
// Throw a parser exception error_msg if end of file is found before end_str.
void scanner::read_until(char const * end_str, char const * error_msg) {
lean_assert(end_str);
lean_assert(end_str[0]);
m_buffer.clear();
while (true) {
check_not_eof(error_msg);
char c = curr_next();
if (c == end_str[0]) {
m_aux_buffer.clear();
m_aux_buffer += c;
unsigned i = 1;
while (true) {
if (!end_str[i])
return;
check_not_eof(error_msg);
c = curr_next();
if (c != end_str[i]) {
m_buffer += m_aux_buffer;
break;
}
i++;
}
} else {
m_buffer += c;
}
}
}
void scanner::move_back(unsigned offset, unsigned u_offset) {
lean_assert(m_uskip == 0);
if (offset != 0) {
if (curr() == EOF) {
m_curr = 0;
m_spos--;
m_upos--;
offset--;
u_offset--;
}
if (offset != 0) {
m_spos -= offset;
m_upos -= u_offset;
}
next();
}
}
void scanner::next_utf_core(char c, buffer<char> & cs) {
cs.push_back(c);
while (m_uskip > 0) {
next();
cs.push_back(curr());
}
}
void scanner::next_utf(buffer<char> & cs) {
next();
next_utf_core(curr(), cs);
}
static bool is_id_first(buffer<char> const & cs, unsigned i) {
if (std::isalpha(cs[i]) || cs[i] == '_')
return true;
unsigned u = utf8_to_unicode(cs.begin() + i, cs.end());
return is_letter_like_unicode(u);
}
bool is_id_rest(char const * begin, char const * end) {
if (std::isalnum(*begin) || *begin == '_' || *begin == '\'')
return true;
unsigned u = utf8_to_unicode(begin, end);
return is_letter_like_unicode(u) || is_sub_script_alnum_unicode(u);
}
static char const * g_error_key_msg = "unexpected token";
void scanner::read_field_idx() {
lean_assert('0' <= curr() && curr() <= '9');
mpz q(1);
char c = curr();
next();
m_num_val = c - '0';
while (true) {
c = curr();
if (auto r = try_digit_to_unsigned(10, c)) {
m_num_val = 10*m_num_val + *r;
next();
} else {
break;
}
}
}
auto scanner::read_key_cmd_id() -> token_kind {
buffer<char> cs;
next_utf_core(curr(), cs);
unsigned num_utfs = 1;
unsigned id_sz = 0;
unsigned id_utf_sz = 0;
if (m_field_notation && cs[0] == '.') {
next_utf(cs);
num_utfs++;
if (std::isdigit(curr())) {
/* field notation `.` <number> */
read_field_idx();
return token_kind::FieldNum;
} else {
if (is_id_first(cs, 1) && cs[1] != '_') {
/* field notation `.` <atomic_id> */
num_utfs--;
cs[0] = cs[1];
cs.pop_back();
while (true) {
id_sz = cs.size();
unsigned i = id_sz;
next_utf(cs);
num_utfs++;
if (!is_id_rest(&cs[i], cs.end())) {
break;
}
}
move_back(cs.size() - id_sz, num_utfs - 1);
cs.shrink(id_sz);
cs.push_back(0);
m_name_val = name(cs.data());
return token_kind::FieldName;
}
}
} else if (is_id_first(cs, 0)) {
id_sz = cs.size();
while (true) {
id_sz = cs.size();
id_utf_sz = num_utfs;
unsigned i = id_sz;
next_utf(cs);
num_utfs++;
if (is_id_rest(&cs[i], cs.end())) {
} else if (cs[i] == '.') {
next_utf(cs);
num_utfs++;
if (!is_id_first(cs, i+1))
break;
} else {
break;
}
}
}
unsigned i = 0;
token_table const * it = m_tokens;
token_info const * info = nullptr;
unsigned key_sz = 0;
unsigned key_utf_sz = 0;
while (i < id_sz) {
it = find(*it, cs[i]);
i++;
if (!it)
break;
if (auto new_info = value_of(*it)) {
lean_assert(m_uskip == 0);
info = new_info;
key_sz = i;
key_utf_sz = id_utf_sz; // this is imprecise if key_sz < id_sz, but this case is irrelevant
}
}
while (it) {
if (i == cs.size()) {
next_utf(cs);
num_utfs++;
}
it = find(*it, cs[i]);
i++;
if (!it)
break;
if (auto new_info = value_of(*it)) {
lean_assert(m_uskip == 0);
info = new_info;
key_sz = i;
key_utf_sz = num_utfs;
lean_assert(key_sz > id_sz);
}
}
if (id_sz == 0 && key_sz == 0)
throw_exception(g_error_key_msg);
if (id_sz > key_sz) {
move_back(cs.size() - id_sz, num_utfs - id_utf_sz);
m_name_val = name();
std::string & id_part = m_buffer;
id_part.clear();
for (unsigned i = 0; i < id_sz; i++) {
if (cs[i] == '.') {
m_name_val = name(m_name_val, id_part.c_str());
id_part.clear();
} else {
id_part.push_back(cs[i]);
}
}
m_name_val = name(m_name_val, id_part.c_str());
return token_kind::Identifier;
} else {
move_back(cs.size() - key_sz, num_utfs - key_utf_sz);
m_token_info = *info;
return m_token_info.is_command() ? token_kind::CommandKeyword : token_kind::Keyword;
}
}
static name * g_begin_comment_tk = nullptr;
static name * g_begin_comment_block_tk = nullptr;
static name * g_begin_doc_block_tk = nullptr;
static name * g_begin_mod_doc_block_tk = nullptr;
static name * g_tick_tk = nullptr;
void initialize_scanner() {
g_begin_comment_tk = new name("--");
g_begin_comment_block_tk = new name("/-");
g_begin_doc_block_tk = new name("/--");
g_begin_mod_doc_block_tk = new name("/-!");
g_tick_tk = new name("'");
}
void finalize_scanner() {
delete g_begin_comment_tk;
delete g_begin_comment_block_tk;
delete g_begin_doc_block_tk;
delete g_begin_mod_doc_block_tk;
delete g_tick_tk;
}
auto scanner::scan(environment const & env) -> token_kind {
m_tokens = &get_token_table(env);
while (true) {
char c = curr();
m_pos = m_upos;
m_line = m_sline;
switch (c) {
case ' ': case '\r': case '\t': case '\n':
next();
break;
case '\"':
return read_string();
case '`':
if (m_in_notation) {
return read_quoted_symbol();
} else {
return read_key_cmd_id();
}
case -1:
return token_kind::Eof;
default:
if (std::isdigit(c)) {
return read_number();
} else {
token_kind k = read_key_cmd_id();
if (k == token_kind::Keyword) {
// We treat '/-', '/--', '/-!', '--' as "keyword".
name const & n = m_token_info.value();
if (n == *g_begin_comment_tk)
read_single_line_comment();
else if (n == *g_begin_comment_block_tk)
read_comment_block();
else if (n == *g_begin_doc_block_tk)
return read_doc_block();
else if (n == *g_begin_mod_doc_block_tk)
return read_mod_doc_block();
else if (n == *g_tick_tk)
return read_char();
else
return k;
} else {
return k;
}
}
}
}
}
scanner::scanner(std::istream & strm, char const * strm_name):
m_tokens(nullptr), m_stream(strm) {
m_stream_name = strm_name ? strm_name : "[unknown]";
m_sline = 0;
m_spos = 0;
m_upos = 0;
m_uskip = 0;
m_in_notation = false;
m_last_line = false;
fetch_line();
if (m_sline == 0) m_sline = 1;
m_line = m_sline;
m_pos = 0;
lean_assert(pos_info(get_line(), get_pos()) == pos_info(1, 0));
}
scanner::scanner(std::istream & strm, char const * strm_name, pos_info const & pos) :
scanner(strm, strm_name) {
skip_to_pos(pos);
}
void scanner::skip_to_pos(pos_info const & pos) {
for (unsigned line_no = 1; line_no < pos.first; line_no++)
fetch_line();
m_line = m_sline;
for (unsigned col_idx = 0; col_idx < pos.second; col_idx++)
next();
m_pos = m_upos; // we assume that the argument is the start of a token
lean_assert(pos == pos_info(get_line(), get_pos()));
}
std::ostream & operator<<(std::ostream & out, token_kind k) {
out << static_cast<unsigned>(k);
return out;
}
token::token(pos_info const & p):m_kind(token_kind::Eof), m_pos(p) {}
token::token(token_kind k, pos_info const & p, token_info const & info):
m_kind(k), m_pos(p), m_info(new token_info(info)) {
lean_assert(m_kind == token_kind::Keyword || m_kind == token_kind::CommandKeyword);
}
token::token(token_kind k, pos_info const & p, name const & v):
m_kind(k), m_pos(p), m_name_val(new name(v)) {
lean_assert(m_kind == token_kind::QuotedSymbol || m_kind == token_kind::Identifier || m_kind == token_kind::FieldName);
}
token::token(token_kind k, pos_info const & p, mpq const & v):
m_kind(k), m_pos(p), m_num_val(new mpq (v)) {
lean_assert(m_kind == token_kind::Decimal || m_kind == token_kind::Numeral || m_kind == token_kind::FieldNum);
}
token::token(token_kind k, pos_info const & p, std::string const & v):
m_kind(k), m_pos(p), m_str_val(new std::string(v)) {
lean_assert(m_kind == token_kind::String || m_kind == token_kind::Char ||
m_kind == token_kind::DocBlock || m_kind == token_kind::ModDocBlock);
}
void token::dealloc() {
switch (kind()) {
case token_kind::Eof:
return;
case token_kind::Keyword: case token_kind::CommandKeyword:
if (m_info != nullptr) delete m_info;
return;
case token_kind::Identifier: case token_kind::QuotedSymbol: case token_kind::FieldName:
if (m_name_val != nullptr) delete m_name_val;
return;
case token_kind::Numeral: case token_kind::Decimal: case token_kind::FieldNum:
if (m_num_val != nullptr) delete m_num_val;
return;
case token_kind::String: case token_kind::Char:
case token_kind::DocBlock: case token_kind::ModDocBlock:
if (m_str_val != nullptr) delete m_str_val;
return;
}
}
void token::copy(token const & tk) {
m_kind = tk.m_kind;
m_pos = tk.m_pos;
switch (tk.kind()) {
case token_kind::Eof:
return;
case token_kind::Keyword: case token_kind::CommandKeyword:
m_info = new token_info(*tk.m_info);
return;
case token_kind::Identifier: case token_kind::QuotedSymbol: case token_kind::FieldName:
m_name_val = new name(*tk.m_name_val);
return;
case token_kind::Numeral: case token_kind::Decimal: case token_kind::FieldNum:
m_num_val = new mpq(*tk.m_num_val);
return;
case token_kind::String: case token_kind::Char:
case token_kind::DocBlock: case token_kind::ModDocBlock:
m_str_val = new std::string(*tk.m_str_val);
return;
}
}
void token::steal(token && tk) {
m_kind = tk.m_kind;
m_pos = tk.m_pos;
switch (tk.kind()) {
case token_kind::Eof:
return;
case token_kind::Keyword: case token_kind::CommandKeyword:
m_info = tk.m_info;
tk.m_info = nullptr;
return;
case token_kind::Identifier: case token_kind::QuotedSymbol: case token_kind::FieldName:
m_name_val = tk.m_name_val;
tk.m_name_val = nullptr;
return;
case token_kind::Numeral: case token_kind::Decimal: case token_kind::FieldNum:
m_num_val = tk.m_num_val;
tk.m_num_val = nullptr;
return;
case token_kind::String: case token_kind::Char:
case token_kind::DocBlock: case token_kind::ModDocBlock:
m_str_val = tk.m_str_val;
tk.m_str_val = nullptr;
return;
}
}
token read_tokens(environment & env, io_state const & ios, scanner & s, buffer<token> & tk, bool use_exceptions) {
while (true) {
try {
token_kind k = s.scan(env);
switch (k) {
case token_kind::Eof:
return token(s.get_pos_info());
case token_kind::Keyword:
tk.push_back(token(k, s.get_pos_info(), s.get_token_info()));
break;
case token_kind::CommandKeyword:
if (s.get_token_info().value() == get_end_tk())
tk.push_back(token(k, s.get_pos_info(), s.get_token_info()));
else
return token(k, s.get_pos_info(), s.get_token_info());
break;
case token_kind::Identifier: case token_kind::QuotedSymbol: case token_kind::FieldName:
tk.push_back(token(k, s.get_pos_info(), s.get_name_val()));
break;
case token_kind::Numeral: case token_kind::Decimal: case token_kind::FieldNum:
tk.push_back(token(k, s.get_pos_info(), s.get_num_val()));
break;
case token_kind::String: case token_kind::Char:
case token_kind::DocBlock: case token_kind::ModDocBlock:
tk.push_back(token(k, s.get_pos_info(), s.get_str_val()));
break;
}
} catch (throwable & ex) {
if (use_exceptions) {
throw;
} else {
message_builder builder(env, ios, s.get_stream_name(), s.get_pos_info(), ERROR);
builder.set_exception(ex);
builder.report();
}
}
}
}
}