feat(runtime, library/init/data/string/basic): add utf8_pos

`utf8_pos` is a low level alternative for `string.iterator`. TODO: implement `string.iterator` using it.
2019-03-09 12:23:21 -08:00 · 2019-03-09 12:23:21 -08:00 · c862ce4a75
commit c862ce4a75
parent 2c3cfcf1dd
7 changed files with 151 additions and 9 deletions
--- a/library/init/data/string/basic.lean
+++ b/library/init/data/string/basic.lean
@ -58,6 +58,50 @@ def append : string → (@& string) → string
 def to_list (s : string) : list char :=
 s.data

+private def csize (c : char) : usize :=
+usize.of_uint32 c.utf8_size
+
+private def utf8_byte_size_aux : list char → usize → usize
+| []      r := r
+| (c::cs) r := utf8_byte_size_aux cs (r + csize c)
+
+@[extern cpp "lean::string_utf8_byte_size"]
+def utf8_byte_size : (@& string) → usize
+| ⟨s⟩ := utf8_byte_size_aux s 0
+
+abbreviation utf8_pos := usize
+
+def utf8_begin : utf8_pos := 0
+
+private def utf8_get_aux : list char → usize → usize → char
+| []      i p := default char
+| (c::cs) i p := if i = p then c else utf8_get_aux cs (i + csize c) p
+
+@[extern cpp "lean::string_utf8_get"]
+def utf8_get : (@& string) → utf8_pos → char
+| ⟨s⟩ p := utf8_get_aux s 0 p
+
+@[extern cpp "lean::string_utf8_next"]
+def utf8_next (s : @& string) (p : utf8_pos) : utf8_pos :=
+let c := utf8_get s p in
+p + csize c
+
+@[extern cpp "lean::string_utf8_at_end"]
+def utf8_at_end : (@& string) → utf8_pos → bool
+| s p := p ≥ utf8_byte_size s
+
+private def utf8_extract_aux₂ : list char → usize → usize → list char
+| []      _ _ := []
+| (c::cs) i e := if i = e then [] else c :: utf8_extract_aux₂ cs (i + csize c) e
+
+private def utf8_extract_aux₁ : list char → usize → usize → usize → list char
+| []        _ _ _ := []
+| s@(c::cs) i b e := if i = b then utf8_extract_aux₂ s i e else utf8_extract_aux₁ cs (i + csize c) b e
+
+@[extern cpp "lean::string_utf8_extract"]
+def utf8_extract : (@& string) → utf8_pos → utf8_pos → string
+| ⟨s⟩ b e := if b ≥ e then ⟨[]⟩ else ⟨utf8_extract_aux₁ s 0 b e⟩
+
 /- In the VM, the string iterator is implemented as a pointer to the string being iterated + index.
   TODO: mark it opaque. -/
 structure iterator :=
--- a/src/frontends/lean/scanner.cpp
+++ b/src/frontends/lean/scanner.cpp
@ -177,7 +177,7 @@ auto scanner::read_char() -> token_kind {
        next();
        return token_kind::Char;
    }
-    if (optional<unsigned> sz = is_utf8_first_byte(c)) {
+    if (optional<unsigned> sz = get_utf8_first_byte_opt(c)) {
        m_buffer.clear();
        for (unsigned i = 0; i < *sz; i++) {
            m_buffer += c;
--- a/src/library/vm/vm_string.cpp
+++ b/src/library/vm/vm_string.cpp
@ -179,7 +179,7 @@ static bool is_unshared_it_vm_string(vm_obj const & it) {
 }

 static unsigned get_utf8_char_size_at(std::string const & s, unsigned i) {
-    if (auto sz = is_utf8_first_byte(s[i])) {
+    if (auto sz = get_utf8_first_byte_opt(s[i])) {
        return *sz;
    } else {
        return 1;
@ -247,7 +247,7 @@ vm_obj string_iterator_prev(vm_obj const & it) {
        /* we have to walk at most 4 steps backwards */
        for (unsigned j = 0; j < 4; j++) {
            --new_i;
-            if (is_utf8_first_byte(s.m_value[new_i])) {
+            if (get_utf8_first_byte_opt(s.m_value[new_i])) {
                return update_vm_constructor(update_vm_constructor(it, 1, mk_vm_nat(new_i)), 2, mk_vm_nat(r + 1));;
            }
        }
--- a/src/runtime/object.cpp
+++ b/src/runtime/object.cpp
@ -1523,6 +1523,90 @@ obj_res string_data(obj_arg s) {
    return string_to_list_core(tmp);
 }

+uint32 string_utf8_get(b_obj_arg s, usize i) {
+    char const * str = string_cstr(s);
+    usize size = string_size(s) - 1;
+    if (i >= string_size(s) - 1)
+        return char_default_value();
+    unsigned c = static_cast<unsigned char>(str[i]);
+    /* zero continuation (0 to 127) */
+    if ((c & 0x80) == 0) {
+        i++;
+        return c;
+    }
+
+    /* one continuation (128 to 2047) */
+    if ((c & 0xe0) == 0xc0 && i + 1 < size) {
+        unsigned c1 = static_cast<unsigned char>(str[i+1]);
+        unsigned r = ((c & 0x1f) << 6) | (c1 & 0x3f);
+        if (r >= 128) {
+            i += 2;
+            return r;
+        }
+    }
+
+    /* two continuations (2048 to 55295 and 57344 to 65535) */
+    if ((c & 0xf0) == 0xe0 && i + 2 < size) {
+        unsigned c1 = static_cast<unsigned char>(str[i+1]);
+        unsigned c2 = static_cast<unsigned char>(str[i+2]);
+        unsigned r = ((c & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
+        if (r >= 2048 && (r < 55296 || r > 57343)) {
+            i += 3;
+            return r;
+        }
+    }
+
+    /* three continuations (65536 to 1114111) */
+    if ((c & 0xf8) == 0xf0 && i + 3 < size) {
+        unsigned c1 = static_cast<unsigned char>(str[i+1]);
+        unsigned c2 = static_cast<unsigned char>(str[i+2]);
+        unsigned c3 = static_cast<unsigned char>(str[i+3]);
+        unsigned r  = ((c & 0x07) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f);
+        if (r >= 65536 && r <= 1114111) {
+            i += 4;
+            return r;
+        }
+    }
+
+    /* invalid UTF-8 encoded string */
+    return char_default_value();
+}
+
+/* The reference implementation is:
+   ```
+   def utf8_next (s : @& string) (p : utf8_pos) : utf8_pos :=
+   let c := utf8_get s p in
+   p + csize c
+   ```
+*/
+usize string_utf8_next(b_obj_arg s, usize i) {
+    char const * str = string_cstr(s);
+    usize size       = string_size(s) - 1;
+    /* `csize c` is 1 when `i` is not a valid position in the reference implementation. */
+    if (i >= size) return i+1;
+    unsigned c = static_cast<unsigned char>(str[i]);
+    if ((c & 0x80) == 0)    return i+1;
+    if ((c & 0xe0) == 0xc0) return i + 2;
+    if ((c & 0xf0) == 0xe0) return i + 3;
+    if ((c & 0xf8) == 0xf0) return i + 4;
+    /* invalid UTF-8 encoded string */
+    return i + 1;
+}
+
+obj_res string_utf8_extract(b_obj_arg s, usize b, usize e) {
+    usize sz = string_size(s) - 1;
+    if (b >= e || b >= sz) return mk_string("");
+    if (e > sz) e = sz;
+    lean_assert(b < e);
+    usize new_sz = e - b;
+    lean_assert(new_sz > 0);
+    obj_res r = alloc_string(new_sz+1, new_sz+1, 0);
+    memcpy(w_string_cstr(r), string_cstr(s) + b, new_sz);
+    w_string_cstr(r)[new_sz] = 0;
+    to_string(r)->m_length = utf8_strlen(w_string_cstr(r), new_sz);
+    return r;
+}
+
 /* `pos` is in bytes, and `remaining` is in characters */
 static obj_res mk_iterator(obj_arg s, size_t pos, size_t remaining) {
    obj_res r = alloc_cnstr(0, 1, sizeof(size_t)*2);
@ -1543,7 +1627,7 @@ static uint32 mk_default_char() { return 65; }
 static bool is_unshared_it_string(b_obj_arg it) { return is_exclusive(it) && !is_shared(cnstr_get(it, 0)); }

 static unsigned get_utf8_char_size_at(std::string const & s, unsigned i) {
-    if (auto sz = is_utf8_first_byte(s[i])) {
+    if (auto sz = get_utf8_first_byte_opt(s[i])) {
        return *sz;
    } else {
        return 1;
@ -1622,7 +1706,7 @@ obj_res string_iterator_prev(obj_arg it) {
        /* we have to walk at most 4 steps backwards */
        for (unsigned j = 0; j < 4; j++) {
            --new_i;
-            if (is_utf8_first_byte(string_cstr(s)[new_i])) {
+            if (get_utf8_first_byte_opt(string_cstr(s)[new_i])) {
                if (is_exclusive(it)) {
                    it_set_pos(it, new_i);
                    it_set_remaining(it, r+1);
--- a/src/runtime/object.h
+++ b/src/runtime/object.h
@ -1121,6 +1121,8 @@ inline obj_res mk_option_some(obj_arg v) { obj_res r = alloc_cnstr(1, 1, 0); cns
 // =======================================
 // String

+/* instance : inhabited char := ⟨'A'⟩ */
+inline uint32 char_default_value() { return 'A'; }
 inline obj_res alloc_string(size_t size, size_t capacity, size_t len) {
    LEAN_RUNTIME_STAT_CODE(g_num_string++);
    return new (alloc_heap_object(string_byte_size(capacity))) string_object(size, capacity, len); // NOLINT
@ -1136,6 +1138,11 @@ obj_res string_append(obj_arg s1, b_obj_arg s2);
 inline obj_res string_length(b_obj_arg s) { return nat_of_size_t(string_len(s)); }
 obj_res string_mk(obj_arg cs);
 obj_res string_data(obj_arg s);
+inline usize string_utf8_byte_size(b_obj_arg s) { return string_size(s) - 1; }
+uint32 string_utf8_get(b_obj_arg s, usize i);
+usize string_utf8_next(b_obj_arg s, usize i);
+inline uint8 string_utf8_at_end(b_obj_arg s, usize i) { return i >= string_size(s) - 1; }
+obj_res string_utf8_extract(b_obj_arg s, usize b, usize e);
 obj_res string_mk_iterator(obj_arg s);
 uint32 string_iterator_curr(b_obj_arg it);
 obj_res string_iterator_set_curr(obj_arg it, uint32 c);
--- a/src/runtime/utf8.cpp
+++ b/src/runtime/utf8.cpp
@ -42,10 +42,10 @@ size_t utf8_strlen(char const * str) {
    return r;
 }

-size_t utf8_strlen(std::string const & str) {
+size_t utf8_strlen(char const * str, size_t sz) {
    size_t r = 0;
    size_t i = 0;
-    while (i < str.size()) {
+    while (i < sz) {
        unsigned d = get_utf8_size(str[i]);
        r++;
        i += d;
@ -53,6 +53,10 @@ size_t utf8_strlen(std::string const & str) {
    return r;
 }

+size_t utf8_strlen(std::string const & str) {
+    return utf8_strlen(str.data(), str.size());
+}
+
 optional<size_t> utf8_char_pos(char const * str, size_t char_idx) {
    size_t r = 0;
    while (*str != 0) {
@ -132,7 +136,7 @@ zzzzyyyy yyxxxxxx          | 1110zzzz   | 10yyyyyy    | 10xxxxxx   |
 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu   | 10uuzzzz    | 10yyyyyy   | 10xxxxxx
 */

-optional<unsigned> is_utf8_first_byte(unsigned char c) {
+optional<unsigned> get_utf8_first_byte_opt(unsigned char c) {
    if ((c & 0x80) == 0) {
        /* 0xxxxxxx */
        return optional<unsigned>(1);
--- a/src/runtime/utf8.h
+++ b/src/runtime/utf8.h
@ -19,6 +19,9 @@ size_t utf8_strlen(char const * str);
 /* Return the length of the string `str` encoded using UTF8.
   `str` may contain null characters. */
 size_t utf8_strlen(std::string const & str);
+/* Return the length of the string `str` encoded using UTF8.
+   `str` may contain null characters. */
+size_t utf8_strlen(char const * str, size_t sz);
 optional<size_t> utf8_char_pos(char const * str, size_t char_idx);
 char const * get_utf8_last_char(char const * str);
 std::string utf8_trim(std::string const & s);
@ -31,7 +34,7 @@ inline unsigned utf8_to_unicode(char const * begin, char const * end) {
 /* If `c` is the first byte of an utf-8 encoded unicode scalar value,
   then return `some(n)` where `n` is the number of bytes needed to encode
   the unicode scalar value. Otherwise, return `none` */
-optional<unsigned> is_utf8_first_byte(unsigned char c);
+optional<unsigned> get_utf8_first_byte_opt(unsigned char c);

 /* "Read" next unicode character starting at position i in a string using UTF-8 encoding.
   Return the unicode character and update i. */