diff --git a/library/init/data/string/basic.lean b/library/init/data/string/basic.lean
index 38287dac23..e571537d9b 100644
--- a/library/init/data/string/basic.lean
+++ b/library/init/data/string/basic.lean
@@ -58,6 +58,50 @@ def append : string → (@& string) → string
 def to_list (s : string) : list char :=
 s.data
 
+private def csize (c : char) : usize :=
+usize.of_uint32 c.utf8_size
+
+private def utf8_byte_size_aux : list char → usize → usize
+| []      r := r
+| (c::cs) r := utf8_byte_size_aux cs (r + csize c)
+
+@[extern cpp "lean::string_utf8_byte_size"]
+def utf8_byte_size : (@& string) → usize
+| ⟨s⟩ := utf8_byte_size_aux s 0
+
+abbreviation utf8_pos := usize
+
+def utf8_begin : utf8_pos := 0
+
+private def utf8_get_aux : list char → usize → usize → char
+| []      i p := default char
+| (c::cs) i p := if i = p then c else utf8_get_aux cs (i + csize c) p
+
+@[extern cpp "lean::string_utf8_get"]
+def utf8_get : (@& string) → utf8_pos → char
+| ⟨s⟩ p := utf8_get_aux s 0 p
+
+@[extern cpp "lean::string_utf8_next"]
+def utf8_next (s : @& string) (p : utf8_pos) : utf8_pos :=
+let c := utf8_get s p in
+p + csize c
+
+@[extern cpp "lean::string_utf8_at_end"]
+def utf8_at_end : (@& string) → utf8_pos → bool
+| s p := p ≥ utf8_byte_size s
+
+private def utf8_extract_aux₂ : list char → usize → usize → list char
+| []      _ _ := []
+| (c::cs) i e := if i = e then [] else c :: utf8_extract_aux₂ cs (i + csize c) e
+
+private def utf8_extract_aux₁ : list char → usize → usize → usize → list char
+| []        _ _ _ := []
+| s@(c::cs) i b e := if i = b then utf8_extract_aux₂ s i e else utf8_extract_aux₁ cs (i + csize c) b e
+
+@[extern cpp "lean::string_utf8_extract"]
+def utf8_extract : (@& string) → utf8_pos → utf8_pos → string
+| ⟨s⟩ b e := if b ≥ e then ⟨[]⟩ else ⟨utf8_extract_aux₁ s 0 b e⟩
+
 /- In the VM, the string iterator is implemented as a pointer to the string being iterated + index.
    TODO: mark it opaque. -/
 structure iterator :=
diff --git a/src/frontends/lean/scanner.cpp b/src/frontends/lean/scanner.cpp
index c7cab0478d..65f4b8914b 100644
--- a/src/frontends/lean/scanner.cpp
+++ b/src/frontends/lean/scanner.cpp
@@ -177,7 +177,7 @@ auto scanner::read_char() -> token_kind {
         next();
         return token_kind::Char;
     }
-    if (optional<unsigned> sz = is_utf8_first_byte(c)) {
+    if (optional<unsigned> sz = get_utf8_first_byte_opt(c)) {
         m_buffer.clear();
         for (unsigned i = 0; i < *sz; i++) {
             m_buffer += c;
diff --git a/src/library/vm/vm_string.cpp b/src/library/vm/vm_string.cpp
index 32d27ac46a..a0053b92c1 100644
--- a/src/library/vm/vm_string.cpp
+++ b/src/library/vm/vm_string.cpp
@@ -179,7 +179,7 @@ static bool is_unshared_it_vm_string(vm_obj const & it) {
 }
 
 static unsigned get_utf8_char_size_at(std::string const & s, unsigned i) {
-    if (auto sz = is_utf8_first_byte(s[i])) {
+    if (auto sz = get_utf8_first_byte_opt(s[i])) {
         return *sz;
     } else {
         return 1;
@@ -247,7 +247,7 @@ vm_obj string_iterator_prev(vm_obj const & it) {
         /* we have to walk at most 4 steps backwards */
         for (unsigned j = 0; j < 4; j++) {
             --new_i;
-            if (is_utf8_first_byte(s.m_value[new_i])) {
+            if (get_utf8_first_byte_opt(s.m_value[new_i])) {
                 return update_vm_constructor(update_vm_constructor(it, 1, mk_vm_nat(new_i)), 2, mk_vm_nat(r + 1));;
             }
         }
diff --git a/src/runtime/object.cpp b/src/runtime/object.cpp
index 2267bf1fd9..e95f9efaa1 100644
--- a/src/runtime/object.cpp
+++ b/src/runtime/object.cpp
@@ -1523,6 +1523,90 @@ obj_res string_data(obj_arg s) {
     return string_to_list_core(tmp);
 }
 
+uint32 string_utf8_get(b_obj_arg s, usize i) {
+    char const * str = string_cstr(s);
+    usize size = string_size(s) - 1;
+    if (i >= string_size(s) - 1)
+        return char_default_value();
+    unsigned c = static_cast<unsigned char>(str[i]);
+    /* zero continuation (0 to 127) */
+    if ((c & 0x80) == 0) {
+        i++;
+        return c;
+    }
+
+    /* one continuation (128 to 2047) */
+    if ((c & 0xe0) == 0xc0 && i + 1 < size) {
+        unsigned c1 = static_cast<unsigned char>(str[i+1]);
+        unsigned r = ((c & 0x1f) << 6) | (c1 & 0x3f);
+        if (r >= 128) {
+            i += 2;
+            return r;
+        }
+    }
+
+    /* two continuations (2048 to 55295 and 57344 to 65535) */
+    if ((c & 0xf0) == 0xe0 && i + 2 < size) {
+        unsigned c1 = static_cast<unsigned char>(str[i+1]);
+        unsigned c2 = static_cast<unsigned char>(str[i+2]);
+        unsigned r = ((c & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
+        if (r >= 2048 && (r < 55296 || r > 57343)) {
+            i += 3;
+            return r;
+        }
+    }
+
+    /* three continuations (65536 to 1114111) */
+    if ((c & 0xf8) == 0xf0 && i + 3 < size) {
+        unsigned c1 = static_cast<unsigned char>(str[i+1]);
+        unsigned c2 = static_cast<unsigned char>(str[i+2]);
+        unsigned c3 = static_cast<unsigned char>(str[i+3]);
+        unsigned r  = ((c & 0x07) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f);
+        if (r >= 65536 && r <= 1114111) {
+            i += 4;
+            return r;
+        }
+    }
+
+    /* invalid UTF-8 encoded string */
+    return char_default_value();
+}
+
+/* The reference implementation is:
+   ```
+   def utf8_next (s : @& string) (p : utf8_pos) : utf8_pos :=
+   let c := utf8_get s p in
+   p + csize c
+   ```
+*/
+usize string_utf8_next(b_obj_arg s, usize i) {
+    char const * str = string_cstr(s);
+    usize size       = string_size(s) - 1;
+    /* `csize c` is 1 when `i` is not a valid position in the reference implementation. */
+    if (i >= size) return i+1;
+    unsigned c = static_cast<unsigned char>(str[i]);
+    if ((c & 0x80) == 0)    return i+1;
+    if ((c & 0xe0) == 0xc0) return i + 2;
+    if ((c & 0xf0) == 0xe0) return i + 3;
+    if ((c & 0xf8) == 0xf0) return i + 4;
+    /* invalid UTF-8 encoded string */
+    return i + 1;
+}
+
+obj_res string_utf8_extract(b_obj_arg s, usize b, usize e) {
+    usize sz = string_size(s) - 1;
+    if (b >= e || b >= sz) return mk_string("");
+    if (e > sz) e = sz;
+    lean_assert(b < e);
+    usize new_sz = e - b;
+    lean_assert(new_sz > 0);
+    obj_res r = alloc_string(new_sz+1, new_sz+1, 0);
+    memcpy(w_string_cstr(r), string_cstr(s) + b, new_sz);
+    w_string_cstr(r)[new_sz] = 0;
+    to_string(r)->m_length = utf8_strlen(w_string_cstr(r), new_sz);
+    return r;
+}
+
 /* `pos` is in bytes, and `remaining` is in characters */
 static obj_res mk_iterator(obj_arg s, size_t pos, size_t remaining) {
     obj_res r = alloc_cnstr(0, 1, sizeof(size_t)*2);
@@ -1543,7 +1627,7 @@ static uint32 mk_default_char() { return 65; }
 static bool is_unshared_it_string(b_obj_arg it) { return is_exclusive(it) && !is_shared(cnstr_get(it, 0)); }
 
 static unsigned get_utf8_char_size_at(std::string const & s, unsigned i) {
-    if (auto sz = is_utf8_first_byte(s[i])) {
+    if (auto sz = get_utf8_first_byte_opt(s[i])) {
         return *sz;
     } else {
         return 1;
@@ -1622,7 +1706,7 @@ obj_res string_iterator_prev(obj_arg it) {
         /* we have to walk at most 4 steps backwards */
         for (unsigned j = 0; j < 4; j++) {
             --new_i;
-            if (is_utf8_first_byte(string_cstr(s)[new_i])) {
+            if (get_utf8_first_byte_opt(string_cstr(s)[new_i])) {
                 if (is_exclusive(it)) {
                     it_set_pos(it, new_i);
                     it_set_remaining(it, r+1);
diff --git a/src/runtime/object.h b/src/runtime/object.h
index c88da86967..793edef298 100644
--- a/src/runtime/object.h
+++ b/src/runtime/object.h
@@ -1121,6 +1121,8 @@ inline obj_res mk_option_some(obj_arg v) { obj_res r = alloc_cnstr(1, 1, 0); cns
 // =======================================
 // String
 
+/* instance : inhabited char := ⟨'A'⟩ */
+inline uint32 char_default_value() { return 'A'; }
 inline obj_res alloc_string(size_t size, size_t capacity, size_t len) {
     LEAN_RUNTIME_STAT_CODE(g_num_string++);
     return new (alloc_heap_object(string_byte_size(capacity))) string_object(size, capacity, len); // NOLINT
@@ -1136,6 +1138,11 @@ obj_res string_append(obj_arg s1, b_obj_arg s2);
 inline obj_res string_length(b_obj_arg s) { return nat_of_size_t(string_len(s)); }
 obj_res string_mk(obj_arg cs);
 obj_res string_data(obj_arg s);
+inline usize string_utf8_byte_size(b_obj_arg s) { return string_size(s) - 1; }
+uint32 string_utf8_get(b_obj_arg s, usize i);
+usize string_utf8_next(b_obj_arg s, usize i);
+inline uint8 string_utf8_at_end(b_obj_arg s, usize i) { return i >= string_size(s) - 1; }
+obj_res string_utf8_extract(b_obj_arg s, usize b, usize e);
 obj_res string_mk_iterator(obj_arg s);
 uint32 string_iterator_curr(b_obj_arg it);
 obj_res string_iterator_set_curr(obj_arg it, uint32 c);
diff --git a/src/runtime/utf8.cpp b/src/runtime/utf8.cpp
index b0a6eec356..c8cff31369 100644
--- a/src/runtime/utf8.cpp
+++ b/src/runtime/utf8.cpp
@@ -42,10 +42,10 @@ size_t utf8_strlen(char const * str) {
     return r;
 }
 
-size_t utf8_strlen(std::string const & str) {
+size_t utf8_strlen(char const * str, size_t sz) {
     size_t r = 0;
     size_t i = 0;
-    while (i < str.size()) {
+    while (i < sz) {
         unsigned d = get_utf8_size(str[i]);
         r++;
         i += d;
@@ -53,6 +53,10 @@ size_t utf8_strlen(std::string const & str) {
     return r;
 }
 
+size_t utf8_strlen(std::string const & str) {
+    return utf8_strlen(str.data(), str.size());
+}
+
 optional<size_t> utf8_char_pos(char const * str, size_t char_idx) {
     size_t r = 0;
     while (*str != 0) {
@@ -132,7 +136,7 @@ zzzzyyyy yyxxxxxx          | 1110zzzz   | 10yyyyyy    | 10xxxxxx   |
 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu   | 10uuzzzz    | 10yyyyyy   | 10xxxxxx
 */
 
-optional<unsigned> is_utf8_first_byte(unsigned char c) {
+optional<unsigned> get_utf8_first_byte_opt(unsigned char c) {
     if ((c & 0x80) == 0) {
         /* 0xxxxxxx */
         return optional<unsigned>(1);
diff --git a/src/runtime/utf8.h b/src/runtime/utf8.h
index 66003b5b7a..2949931b3e 100644
--- a/src/runtime/utf8.h
+++ b/src/runtime/utf8.h
@@ -19,6 +19,9 @@ size_t utf8_strlen(char const * str);
 /* Return the length of the string `str` encoded using UTF8.
    `str` may contain null characters. */
 size_t utf8_strlen(std::string const & str);
+/* Return the length of the string `str` encoded using UTF8.
+   `str` may contain null characters. */
+size_t utf8_strlen(char const * str, size_t sz);
 optional<size_t> utf8_char_pos(char const * str, size_t char_idx);
 char const * get_utf8_last_char(char const * str);
 std::string utf8_trim(std::string const & s);
@@ -31,7 +34,7 @@ inline unsigned utf8_to_unicode(char const * begin, char const * end) {
 /* If `c` is the first byte of an utf-8 encoded unicode scalar value,
    then return `some(n)` where `n` is the number of bytes needed to encode
    the unicode scalar value. Otherwise, return `none` */
-optional<unsigned> is_utf8_first_byte(unsigned char c);
+optional<unsigned> get_utf8_first_byte_opt(unsigned char c);
 
 /* "Read" next unicode character starting at position i in a string using UTF-8 encoding.
    Return the unicode character and update i. */