From f53fa97c4ae38ac4a0afcb72c40859bf34efcc04 Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Fri, 26 Aug 2016 17:26:11 -0400 Subject: [PATCH] feat(frontends/lean): escape identifiers when pretty-printing --- src/frontends/lean/elaborator.cpp | 2 +- src/frontends/lean/parser.cpp | 4 +- src/frontends/lean/pp.cpp | 18 +++--- src/frontends/lean/scanner.cpp | 62 +------------------ src/frontends/lean/scanner.h | 8 +-- src/util/name.cpp | 82 +++++++++++++++++++++----- src/util/name.h | 20 ++++++- src/util/utf8.cpp | 30 ++++++++++ src/util/utf8.h | 7 +++ tests/lean/escape_id.lean | 17 ++++++ tests/lean/escape_id.lean.expected.out | 12 ++++ 11 files changed, 168 insertions(+), 94 deletions(-) create mode 100644 tests/lean/escape_id.lean create mode 100644 tests/lean/escape_id.lean.expected.out diff --git a/src/frontends/lean/elaborator.cpp b/src/frontends/lean/elaborator.cpp index fbe7ec9a15..660eefa8bc 100644 --- a/src/frontends/lean/elaborator.cpp +++ b/src/frontends/lean/elaborator.cpp @@ -3853,7 +3853,7 @@ static expr resolve_local_name(environment const & env, local_context const & lc } catch (exception &) {} } if (!r) - throw elaborator_exception(src, format("unknown identifier '") + format(id) + format("'")); + throw elaborator_exception(src, format("unknown identifier '") + format(id.escape()) + format("'")); return *r; } diff --git a/src/frontends/lean/parser.cpp b/src/frontends/lean/parser.cpp index 0aaf14ea27..b309a0a73f 100644 --- a/src/frontends/lean/parser.cpp +++ b/src/frontends/lean/parser.cpp @@ -1988,7 +1988,7 @@ expr parser::id_to_expr(name const & id, pos_info const & p, bool resolve_only, } catch (exception &) {} } if (!r) { - return parser_error_or_expr({sstream() << "unknown identifier '" << id << "'", p}); + return parser_error_or_expr({sstream() << "unknown identifier '" << id.escape() << "'", p}); } return *r; } @@ -2043,7 +2043,7 @@ list parser::to_constants(name const & id, char const * msg, pos_info cons } if (rs.empty()) { - throw parser_error(sstream() << "unknown identifier '" << id << "'", p); + throw parser_error(sstream() << "unknown identifier '" << id.escape() << "'", p); } return to_list(rs); diff --git a/src/frontends/lean/pp.cpp b/src/frontends/lean/pp.cpp index 907246f563..768a0d79f1 100644 --- a/src/frontends/lean/pp.cpp +++ b/src/frontends/lean/pp.cpp @@ -669,6 +669,10 @@ optional pretty_fn::is_aliased(name const & n) const { } } +static format escape(name const & n) { + return format(n.escape()); +} + auto pretty_fn::pp_const(expr const & e, optional const & num_ref_univ_params) -> result { if (is_neutral_expr(e) && m_unicode) return format("◾"); @@ -720,11 +724,11 @@ auto pretty_fn::pp_const(expr const & e, optional const & num_ref_univ to_buffer(const_levels(e), ls); if (num_ref_univ_params) { if (ls.size() <= *num_ref_univ_params) - return result(format(n)); + return result(escape(n)); else first_idx = *num_ref_univ_params; } - format r = compose(format(n), format(".{")); + format r = compose(escape(n), format(".{")); bool first = true; for (unsigned i = first_idx; i < ls.size(); i++) { level const & l = ls[i]; @@ -740,7 +744,7 @@ auto pretty_fn::pp_const(expr const & e, optional const & num_ref_univ r += format("}"); return result(group(r)); } else { - return result(format(n)); + return result(escape(n)); } } @@ -765,7 +769,7 @@ auto pretty_fn::pp_local(expr const & e) -> result { if (m_locals_full_names) return result(format("<") + format(n + mlocal_name(e)) + format(">")); else - return format(n); + return format(escape(n)); } bool pretty_fn::has_implicit_args(expr const & f) { @@ -893,7 +897,7 @@ format pretty_fn::pp_binder(expr const & local) { auto bi = local_info(local); if (bi != binder_info()) r += format(open_binder_string(bi, m_unicode)); - r += format(local_pp_name(local)); + r += escape(local_pp_name(local)); if (m_binder_types) { r += space(); r += compose(colon(), nest(m_indent, compose(line(), pp_child(mlocal_type(local), 0).fmt()))); @@ -908,7 +912,7 @@ format pretty_fn::pp_binder_block(buffer const & names, expr const & type, if (m_binder_types || bi != binder_info()) r += format(open_binder_string(bi, m_unicode)); for (name const & n : names) { - r += format(n); + r += escape(n); } if (m_binder_types) { r += space(); @@ -1009,7 +1013,7 @@ auto pretty_fn::pp_have(expr const & e) -> result { format proof_fmt = pp_child(proof, 0).fmt(); format body_fmt = pp_child(body, 0).fmt(); format head_fmt = *g_have_fmt; - format r = head_fmt + space() + format(n) + space(); + format r = head_fmt + space() + escape(n) + space(); r += colon() + nest(m_indent, line() + type_fmt + comma() + space() + *g_from_fmt); r = group(r); r += nest(m_indent, line() + proof_fmt + comma()); diff --git a/src/frontends/lean/scanner.cpp b/src/frontends/lean/scanner.cpp index f18d3f767a..7ec1b8a606 100644 --- a/src/frontends/lean/scanner.cpp +++ b/src/frontends/lean/scanner.cpp @@ -7,6 +7,7 @@ Author: Leonardo de Moura #include #include #include "util/exception.h" +#include "util/name.h" #include "util/utf8.h" #include "library/type_context.h" #include "library/message_builder.h" @@ -25,50 +26,6 @@ unsigned scanner::get_utf8_size(uchar c) { return r; } -static unsigned utf8_to_unicode(uchar const * begin, uchar const * end) { - unsigned result = 0; - if (begin == end) - return result; - auto it = begin; - unsigned c = *it; - ++it; - if (c < 128) - return c; - unsigned mask = (1u << 6) -1; - unsigned hmask = mask; - unsigned shift = 0; - unsigned num_bits = 0; - while ((c & 0xC0) == 0xC0) { - c <<= 1; - c &= 0xff; - num_bits += 6; - hmask >>= 1; - shift++; - result <<= 6; - if (it == end) - return 0; - result |= *it & mask; - ++it; - } - result |= ((c >> shift) & hmask) << num_bits; - return result; -} - -bool is_greek_unicode(unsigned u) { return 0x391 <= u && u <= 0x3DD; } -bool is_letter_like_unicode(unsigned u) { - return - (0x3b1 <= u && u <= 0x3c9 && u != 0x3bb) || // Lower greek, but lambda - (0x391 <= u && u <= 0x3A9 && u != 0x3A0 && u != 0x3A3) || // Upper greek, but Pi and Sigma - (0x3ca <= u && u <= 0x3fb) || // Coptic letters - (0x1f00 <= u && u <= 0x1ffe) || // Polytonic Greek Extended Character Set - (0x2100 <= u && u <= 0x214f); // Letter like block -} -bool is_sub_script_alnum_unicode(unsigned u) { - return - (0x207f <= u && u <= 0x2089) || // n superscript and numberic subscripts - (0x2090 <= u && u <= 0x209c); // letter-like subscripts -} - void scanner::fetch_line() { m_curr_line.clear(); if (std::getline(*m_stream, m_curr_line)) { @@ -465,23 +422,6 @@ void scanner::next_utf(buffer & cs) { next_utf_core(curr(), cs); } -constexpr char16_t id_begin_escape = u'«'; -constexpr char16_t id_end_escape = u'»'; - -static bool is_id_first(uchar const * begin, uchar const * end) { - if (std::isalpha(*begin) || *begin == '_') - return true; - unsigned u = utf8_to_unicode(begin, end); - return u == id_begin_escape || is_letter_like_unicode(u); -} - -bool is_id_rest(uchar const * begin, uchar const * end) { - if (std::isalnum(*begin) || *begin == '_' || *begin == '\'') - return true; - unsigned u = utf8_to_unicode(begin, end); - return is_letter_like_unicode(u) || is_sub_script_alnum_unicode(u); -} - static char const * g_error_key_msg = "unexpected token"; void scanner::read_field_idx() { diff --git a/src/frontends/lean/scanner.h b/src/frontends/lean/scanner.h index b2593ccd12..7cdec208a0 100644 --- a/src/frontends/lean/scanner.h +++ b/src/frontends/lean/scanner.h @@ -10,6 +10,7 @@ Author: Leonardo de Moura #include "kernel/pos_info_provider.h" #include "util/name.h" #include "util/flet.h" +#include "util/utf8.h" #include "util/numerics/mpq.h" #include "kernel/environment.h" #include "library/io_state.h" @@ -20,8 +21,6 @@ enum class token_kind {Keyword, CommandKeyword, Identifier, Numeral, Decimal, String, Char, QuotedSymbol, DocBlock, ModDocBlock, FieldNum, FieldName, Eof}; -using uchar = unsigned char; - /** \brief Scanner. The behavior of the scanner is controlled using a token set. @@ -119,11 +118,6 @@ public: }; }; std::ostream & operator<<(std::ostream & out, token_kind k); -bool is_id_rest(uchar const * begin, uchar const * end); -inline bool is_id_rest(char const * begin, char const * end) { - return is_id_rest(reinterpret_cast(begin), - reinterpret_cast(end)); -} class token { token_kind m_kind; diff --git a/src/util/name.cpp b/src/util/name.cpp index 339c254bea..ec3bedecf6 100644 --- a/src/util/name.cpp +++ b/src/util/name.cpp @@ -23,23 +23,71 @@ Author: Leonardo de Moura namespace lean { constexpr char const * anonymous_str = "[anonymous]"; -void name::imp::display_core(std::ostream & out, imp * p, char const * sep) { - lean_assert(p != nullptr); - if (p->m_prefix) { - display_core(out, p->m_prefix, sep); - out << sep; - } - if (p->m_is_string) - out << p->m_str; - else - out << p->m_k; +bool is_greek_unicode(unsigned u) { return 0x391 <= u && u <= 0x3DD; } +bool is_letter_like_unicode(unsigned u) { + return + (0x3b1 <= u && u <= 0x3c9 && u != 0x3bb) || // Lower greek, but lambda + (0x391 <= u && u <= 0x3A9 && u != 0x3A0 && u != 0x3A3) || // Upper greek, but Pi and Sigma + (0x3ca <= u && u <= 0x3fb) || // Coptic letters + (0x1f00 <= u && u <= 0x1ffe) || // Polytonic Greek Extended Character Set + (0x2100 <= u && u <= 0x214f); // Letter like block +} +bool is_sub_script_alnum_unicode(unsigned u) { + return + (0x207f <= u && u <= 0x2089) || // n superscript and numberic subscripts + (0x2090 <= u && u <= 0x209c); // letter-like subscripts } -void name::imp::display(std::ostream & out, imp * p, char const * sep) { +bool is_id_first(char const * begin, char const * end) { + if (std::isalpha(*begin) || *begin == '_') + return true; + unsigned u = utf8_to_unicode(begin, end); + return u == id_begin_escape || is_letter_like_unicode(u); +} + +bool is_id_rest(char const * begin, char const * end) { + if (std::isalnum(*begin) || *begin == '_' || *begin == '\'') + return true; + unsigned u = utf8_to_unicode(begin, end); + return is_letter_like_unicode(u) || is_sub_script_alnum_unicode(u); +} + +void name::imp::display_core(std::ostream & out, imp * p, bool escape, char const * sep) { + lean_assert(p != nullptr); + if (p->m_prefix) { + display_core(out, p->m_prefix, escape, sep); + out << sep; + } + if (p->m_is_string) { + size_t sz = strlen(p->m_str); + bool must_escape = false; + if (escape && *p->m_str) { + if (!is_id_first(p->m_str, p->m_str + sz)) + must_escape = true; + // don't escape names produced by server::display_decl + if (must_escape && p->m_str[0] == '?') + must_escape = false; + if (escape) { + for (char * s = p->m_str + get_utf8_size(p->m_str[0]); !must_escape && *s; s += get_utf8_size(*s)) { + if (!is_id_rest(s, p->m_str + sz)) + must_escape = true; + } + } + } + if (must_escape) + out << "«" << p->m_str << "»"; + else + out << p->m_str; + } else { + out << p->m_k; + } +} + +void name::imp::display(std::ostream & out, imp * p, bool escape, char const * sep) { if (p == nullptr) out << anonymous_str; else - display_core(out, p, sep); + display_core(out, p, escape, sep); } void copy_limbs(name::imp * p, buffer & limbs) { @@ -297,12 +345,18 @@ name name::get_root() const { std::string name::to_string(char const * sep) const { std::ostringstream s; - imp::display(s, m_ptr, sep); + imp::display(s, m_ptr, false, sep); + return s.str(); +} + +std::string name::escape(char const * sep) const { + std::ostringstream s; + imp::display(s, m_ptr, true, sep); return s.str(); } std::ostream & operator<<(std::ostream & out, name const & n) { - name::imp::display(out, n.m_ptr); + name::imp::display(out, n.m_ptr, false); return out; } diff --git a/src/util/name.h b/src/util/name.h index c41e7f6987..7ca7730e10 100644 --- a/src/util/name.h +++ b/src/util/name.h @@ -18,6 +18,21 @@ Author: Leonardo de Moura namespace lean { constexpr char const * lean_name_separator = "."; +constexpr char16_t id_begin_escape = u'«'; +constexpr char16_t id_end_escape = u'»'; + +bool is_id_first(char const * begin, char const * end); +inline bool is_id_first(unsigned char const * begin, unsigned char const * end) { + return is_id_first(reinterpret_cast(begin), + reinterpret_cast(end)); +} + +bool is_id_rest(char const * begin, char const * end); +inline bool is_id_rest(unsigned char const * begin, unsigned char const * end) { + return is_id_rest(reinterpret_cast(begin), + reinterpret_cast(end)); +} + enum class name_kind { ANONYMOUS, STRING, NUMERAL }; /** \brief Hierarchical names. @@ -36,8 +51,8 @@ public: }; void dealloc(); imp(bool s, imp * p):m_rc(1), m_is_string(s), m_hash(0), m_prefix(p) { if (p) p->inc_ref(); } - static void display_core(std::ostream & out, imp * p, char const * sep); - static void display(std::ostream & out, imp * p, char const * sep = lean_name_separator); + static void display_core(std::ostream & out, imp * p, bool escape, char const * sep); + static void display(std::ostream & out, imp * p, bool escape, char const * sep = lean_name_separator); friend void copy_limbs(imp * p, buffer & limbs); }; private: @@ -125,6 +140,7 @@ public: name get_root() const; /** \brief Convert this hierarchical name into a string. */ std::string to_string(char const * sep = lean_name_separator) const; + std::string escape(char const * sep = lean_name_separator) const; /** \brief Size of the this name (in characters). */ size_t size() const; /** \brief Size of the this name in unicode. */ diff --git a/src/util/utf8.cpp b/src/util/utf8.cpp index 23fe994988..872583cc53 100644 --- a/src/util/utf8.cpp +++ b/src/util/utf8.cpp @@ -8,6 +8,7 @@ Author: Leonardo de Moura #include #include "util/debug.h" #include "util/optional.h" +#include "util/utf8.h" namespace lean { bool is_utf8_next(unsigned char c) { return (c & 0xC0) == 0x80; } @@ -81,4 +82,33 @@ std::string utf8_trim(std::string const & s) { stop = s.size(); return s.substr(start, stop - start); } + +unsigned utf8_to_unicode(uchar const * begin, uchar const * end) { + unsigned result = 0; + if (begin == end) + return result; + auto it = begin; + unsigned c = *it; + ++it; + if (c < 128) + return c; + unsigned mask = (1u << 6) -1; + unsigned hmask = mask; + unsigned shift = 0; + unsigned num_bits = 0; + while ((c & 0xC0) == 0xC0) { + c <<= 1; + c &= 0xff; + num_bits += 6; + hmask >>= 1; + shift++; + result <<= 6; + if (it == end) + return 0; + result |= *it & mask; + ++it; + } + result |= ((c >> shift) & hmask) << num_bits; + return result; +} } diff --git a/src/util/utf8.h b/src/util/utf8.h index 1f88315955..206dcc387a 100644 --- a/src/util/utf8.h +++ b/src/util/utf8.h @@ -9,10 +9,17 @@ Author: Leonardo de Moura #include "util/optional.h" namespace lean { +using uchar = unsigned char; + bool is_utf8_next(unsigned char c); unsigned get_utf8_size(unsigned char c); size_t utf8_strlen(char const * str); optional utf8_char_pos(char const * str, size_t char_idx); char const * get_utf8_last_char(char const * str); std::string utf8_trim(std::string const & s); +unsigned utf8_to_unicode(uchar const * begin, uchar const * end); +inline unsigned utf8_to_unicode(char const * begin, char const * end) { + return utf8_to_unicode(reinterpret_cast(begin), + reinterpret_cast(end)); +} } diff --git a/tests/lean/escape_id.lean b/tests/lean/escape_id.lean new file mode 100644 index 0000000000..0fa2968827 --- /dev/null +++ b/tests/lean/escape_id.lean @@ -0,0 +1,17 @@ +def «def» := 1 +#check «def» + +def «[ ]» := 1 +#check «[ ]» +#check «[» + +def a.«b.c» := 1 +#check a.«b.c» +#check a.b.c +#check «a.b.c» + +#check [1].«length» + +#check ««» +#check « +» diff --git a/tests/lean/escape_id.lean.expected.out b/tests/lean/escape_id.lean.expected.out new file mode 100644 index 0000000000..e915a68742 --- /dev/null +++ b/tests/lean/escape_id.lean.expected.out @@ -0,0 +1,12 @@ +def : ℕ +«[ ]» : ℕ +escape_id.lean:6:7: error: unknown identifier '«[»' +a.«b.c» : ℕ +escape_id.lean:10:7: error: unknown identifier 'a.b.c' +escape_id.lean:11:7: error: unknown identifier '«a.b.c»' +list.length [1] : ℕ +escape_id.lean:15:8: error: illegal character in escaped identifier +escape_id.lean:16:0: error: invalid expression, unexpected token +escape_id.lean:16:8: error: illegal character in escaped identifier +escape_id.lean:17:0: error: unexpected token +escape_id.lean:17:1: error: invalid expression, unexpected token