diff --git a/library/init/data/string/basic.lean b/library/init/data/string/basic.lean index 38287dac23..e571537d9b 100644 --- a/library/init/data/string/basic.lean +++ b/library/init/data/string/basic.lean @@ -58,6 +58,50 @@ def append : string → (@& string) → string def to_list (s : string) : list char := s.data +private def csize (c : char) : usize := +usize.of_uint32 c.utf8_size + +private def utf8_byte_size_aux : list char → usize → usize +| [] r := r +| (c::cs) r := utf8_byte_size_aux cs (r + csize c) + +@[extern cpp "lean::string_utf8_byte_size"] +def utf8_byte_size : (@& string) → usize +| ⟨s⟩ := utf8_byte_size_aux s 0 + +abbreviation utf8_pos := usize + +def utf8_begin : utf8_pos := 0 + +private def utf8_get_aux : list char → usize → usize → char +| [] i p := default char +| (c::cs) i p := if i = p then c else utf8_get_aux cs (i + csize c) p + +@[extern cpp "lean::string_utf8_get"] +def utf8_get : (@& string) → utf8_pos → char +| ⟨s⟩ p := utf8_get_aux s 0 p + +@[extern cpp "lean::string_utf8_next"] +def utf8_next (s : @& string) (p : utf8_pos) : utf8_pos := +let c := utf8_get s p in +p + csize c + +@[extern cpp "lean::string_utf8_at_end"] +def utf8_at_end : (@& string) → utf8_pos → bool +| s p := p ≥ utf8_byte_size s + +private def utf8_extract_aux₂ : list char → usize → usize → list char +| [] _ _ := [] +| (c::cs) i e := if i = e then [] else c :: utf8_extract_aux₂ cs (i + csize c) e + +private def utf8_extract_aux₁ : list char → usize → usize → usize → list char +| [] _ _ _ := [] +| s@(c::cs) i b e := if i = b then utf8_extract_aux₂ s i e else utf8_extract_aux₁ cs (i + csize c) b e + +@[extern cpp "lean::string_utf8_extract"] +def utf8_extract : (@& string) → utf8_pos → utf8_pos → string +| ⟨s⟩ b e := if b ≥ e then ⟨[]⟩ else ⟨utf8_extract_aux₁ s 0 b e⟩ + /- In the VM, the string iterator is implemented as a pointer to the string being iterated + index. TODO: mark it opaque. -/ structure iterator := diff --git a/src/frontends/lean/scanner.cpp b/src/frontends/lean/scanner.cpp index c7cab0478d..65f4b8914b 100644 --- a/src/frontends/lean/scanner.cpp +++ b/src/frontends/lean/scanner.cpp @@ -177,7 +177,7 @@ auto scanner::read_char() -> token_kind { next(); return token_kind::Char; } - if (optional sz = is_utf8_first_byte(c)) { + if (optional sz = get_utf8_first_byte_opt(c)) { m_buffer.clear(); for (unsigned i = 0; i < *sz; i++) { m_buffer += c; diff --git a/src/library/vm/vm_string.cpp b/src/library/vm/vm_string.cpp index 32d27ac46a..a0053b92c1 100644 --- a/src/library/vm/vm_string.cpp +++ b/src/library/vm/vm_string.cpp @@ -179,7 +179,7 @@ static bool is_unshared_it_vm_string(vm_obj const & it) { } static unsigned get_utf8_char_size_at(std::string const & s, unsigned i) { - if (auto sz = is_utf8_first_byte(s[i])) { + if (auto sz = get_utf8_first_byte_opt(s[i])) { return *sz; } else { return 1; @@ -247,7 +247,7 @@ vm_obj string_iterator_prev(vm_obj const & it) { /* we have to walk at most 4 steps backwards */ for (unsigned j = 0; j < 4; j++) { --new_i; - if (is_utf8_first_byte(s.m_value[new_i])) { + if (get_utf8_first_byte_opt(s.m_value[new_i])) { return update_vm_constructor(update_vm_constructor(it, 1, mk_vm_nat(new_i)), 2, mk_vm_nat(r + 1));; } } diff --git a/src/runtime/object.cpp b/src/runtime/object.cpp index 2267bf1fd9..e95f9efaa1 100644 --- a/src/runtime/object.cpp +++ b/src/runtime/object.cpp @@ -1523,6 +1523,90 @@ obj_res string_data(obj_arg s) { return string_to_list_core(tmp); } +uint32 string_utf8_get(b_obj_arg s, usize i) { + char const * str = string_cstr(s); + usize size = string_size(s) - 1; + if (i >= string_size(s) - 1) + return char_default_value(); + unsigned c = static_cast(str[i]); + /* zero continuation (0 to 127) */ + if ((c & 0x80) == 0) { + i++; + return c; + } + + /* one continuation (128 to 2047) */ + if ((c & 0xe0) == 0xc0 && i + 1 < size) { + unsigned c1 = static_cast(str[i+1]); + unsigned r = ((c & 0x1f) << 6) | (c1 & 0x3f); + if (r >= 128) { + i += 2; + return r; + } + } + + /* two continuations (2048 to 55295 and 57344 to 65535) */ + if ((c & 0xf0) == 0xe0 && i + 2 < size) { + unsigned c1 = static_cast(str[i+1]); + unsigned c2 = static_cast(str[i+2]); + unsigned r = ((c & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f); + if (r >= 2048 && (r < 55296 || r > 57343)) { + i += 3; + return r; + } + } + + /* three continuations (65536 to 1114111) */ + if ((c & 0xf8) == 0xf0 && i + 3 < size) { + unsigned c1 = static_cast(str[i+1]); + unsigned c2 = static_cast(str[i+2]); + unsigned c3 = static_cast(str[i+3]); + unsigned r = ((c & 0x07) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f); + if (r >= 65536 && r <= 1114111) { + i += 4; + return r; + } + } + + /* invalid UTF-8 encoded string */ + return char_default_value(); +} + +/* The reference implementation is: + ``` + def utf8_next (s : @& string) (p : utf8_pos) : utf8_pos := + let c := utf8_get s p in + p + csize c + ``` +*/ +usize string_utf8_next(b_obj_arg s, usize i) { + char const * str = string_cstr(s); + usize size = string_size(s) - 1; + /* `csize c` is 1 when `i` is not a valid position in the reference implementation. */ + if (i >= size) return i+1; + unsigned c = static_cast(str[i]); + if ((c & 0x80) == 0) return i+1; + if ((c & 0xe0) == 0xc0) return i + 2; + if ((c & 0xf0) == 0xe0) return i + 3; + if ((c & 0xf8) == 0xf0) return i + 4; + /* invalid UTF-8 encoded string */ + return i + 1; +} + +obj_res string_utf8_extract(b_obj_arg s, usize b, usize e) { + usize sz = string_size(s) - 1; + if (b >= e || b >= sz) return mk_string(""); + if (e > sz) e = sz; + lean_assert(b < e); + usize new_sz = e - b; + lean_assert(new_sz > 0); + obj_res r = alloc_string(new_sz+1, new_sz+1, 0); + memcpy(w_string_cstr(r), string_cstr(s) + b, new_sz); + w_string_cstr(r)[new_sz] = 0; + to_string(r)->m_length = utf8_strlen(w_string_cstr(r), new_sz); + return r; +} + /* `pos` is in bytes, and `remaining` is in characters */ static obj_res mk_iterator(obj_arg s, size_t pos, size_t remaining) { obj_res r = alloc_cnstr(0, 1, sizeof(size_t)*2); @@ -1543,7 +1627,7 @@ static uint32 mk_default_char() { return 65; } static bool is_unshared_it_string(b_obj_arg it) { return is_exclusive(it) && !is_shared(cnstr_get(it, 0)); } static unsigned get_utf8_char_size_at(std::string const & s, unsigned i) { - if (auto sz = is_utf8_first_byte(s[i])) { + if (auto sz = get_utf8_first_byte_opt(s[i])) { return *sz; } else { return 1; @@ -1622,7 +1706,7 @@ obj_res string_iterator_prev(obj_arg it) { /* we have to walk at most 4 steps backwards */ for (unsigned j = 0; j < 4; j++) { --new_i; - if (is_utf8_first_byte(string_cstr(s)[new_i])) { + if (get_utf8_first_byte_opt(string_cstr(s)[new_i])) { if (is_exclusive(it)) { it_set_pos(it, new_i); it_set_remaining(it, r+1); diff --git a/src/runtime/object.h b/src/runtime/object.h index c88da86967..793edef298 100644 --- a/src/runtime/object.h +++ b/src/runtime/object.h @@ -1121,6 +1121,8 @@ inline obj_res mk_option_some(obj_arg v) { obj_res r = alloc_cnstr(1, 1, 0); cns // ======================================= // String +/* instance : inhabited char := ⟨'A'⟩ */ +inline uint32 char_default_value() { return 'A'; } inline obj_res alloc_string(size_t size, size_t capacity, size_t len) { LEAN_RUNTIME_STAT_CODE(g_num_string++); return new (alloc_heap_object(string_byte_size(capacity))) string_object(size, capacity, len); // NOLINT @@ -1136,6 +1138,11 @@ obj_res string_append(obj_arg s1, b_obj_arg s2); inline obj_res string_length(b_obj_arg s) { return nat_of_size_t(string_len(s)); } obj_res string_mk(obj_arg cs); obj_res string_data(obj_arg s); +inline usize string_utf8_byte_size(b_obj_arg s) { return string_size(s) - 1; } +uint32 string_utf8_get(b_obj_arg s, usize i); +usize string_utf8_next(b_obj_arg s, usize i); +inline uint8 string_utf8_at_end(b_obj_arg s, usize i) { return i >= string_size(s) - 1; } +obj_res string_utf8_extract(b_obj_arg s, usize b, usize e); obj_res string_mk_iterator(obj_arg s); uint32 string_iterator_curr(b_obj_arg it); obj_res string_iterator_set_curr(obj_arg it, uint32 c); diff --git a/src/runtime/utf8.cpp b/src/runtime/utf8.cpp index b0a6eec356..c8cff31369 100644 --- a/src/runtime/utf8.cpp +++ b/src/runtime/utf8.cpp @@ -42,10 +42,10 @@ size_t utf8_strlen(char const * str) { return r; } -size_t utf8_strlen(std::string const & str) { +size_t utf8_strlen(char const * str, size_t sz) { size_t r = 0; size_t i = 0; - while (i < str.size()) { + while (i < sz) { unsigned d = get_utf8_size(str[i]); r++; i += d; @@ -53,6 +53,10 @@ size_t utf8_strlen(std::string const & str) { return r; } +size_t utf8_strlen(std::string const & str) { + return utf8_strlen(str.data(), str.size()); +} + optional utf8_char_pos(char const * str, size_t char_idx) { size_t r = 0; while (*str != 0) { @@ -132,7 +136,7 @@ zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx */ -optional is_utf8_first_byte(unsigned char c) { +optional get_utf8_first_byte_opt(unsigned char c) { if ((c & 0x80) == 0) { /* 0xxxxxxx */ return optional(1); diff --git a/src/runtime/utf8.h b/src/runtime/utf8.h index 66003b5b7a..2949931b3e 100644 --- a/src/runtime/utf8.h +++ b/src/runtime/utf8.h @@ -19,6 +19,9 @@ size_t utf8_strlen(char const * str); /* Return the length of the string `str` encoded using UTF8. `str` may contain null characters. */ size_t utf8_strlen(std::string const & str); +/* Return the length of the string `str` encoded using UTF8. + `str` may contain null characters. */ +size_t utf8_strlen(char const * str, size_t sz); optional utf8_char_pos(char const * str, size_t char_idx); char const * get_utf8_last_char(char const * str); std::string utf8_trim(std::string const & s); @@ -31,7 +34,7 @@ inline unsigned utf8_to_unicode(char const * begin, char const * end) { /* If `c` is the first byte of an utf-8 encoded unicode scalar value, then return `some(n)` where `n` is the number of bytes needed to encode the unicode scalar value. Otherwise, return `none` */ -optional is_utf8_first_byte(unsigned char c); +optional get_utf8_first_byte_opt(unsigned char c); /* "Read" next unicode character starting at position i in a string using UTF-8 encoding. Return the unicode character and update i. */