feat(runtime, library/init/data/string/basic): add utf8_pos
`utf8_pos` is a low level alternative for `string.iterator`. TODO: implement `string.iterator` using it.
This commit is contained in:
parent
2c3cfcf1dd
commit
c862ce4a75
7 changed files with 151 additions and 9 deletions
|
|
@ -58,6 +58,50 @@ def append : string → (@& string) → string
|
|||
def to_list (s : string) : list char :=
|
||||
s.data
|
||||
|
||||
private def csize (c : char) : usize :=
|
||||
usize.of_uint32 c.utf8_size
|
||||
|
||||
private def utf8_byte_size_aux : list char → usize → usize
|
||||
| [] r := r
|
||||
| (c::cs) r := utf8_byte_size_aux cs (r + csize c)
|
||||
|
||||
@[extern cpp "lean::string_utf8_byte_size"]
|
||||
def utf8_byte_size : (@& string) → usize
|
||||
| ⟨s⟩ := utf8_byte_size_aux s 0
|
||||
|
||||
abbreviation utf8_pos := usize
|
||||
|
||||
def utf8_begin : utf8_pos := 0
|
||||
|
||||
private def utf8_get_aux : list char → usize → usize → char
|
||||
| [] i p := default char
|
||||
| (c::cs) i p := if i = p then c else utf8_get_aux cs (i + csize c) p
|
||||
|
||||
@[extern cpp "lean::string_utf8_get"]
|
||||
def utf8_get : (@& string) → utf8_pos → char
|
||||
| ⟨s⟩ p := utf8_get_aux s 0 p
|
||||
|
||||
@[extern cpp "lean::string_utf8_next"]
|
||||
def utf8_next (s : @& string) (p : utf8_pos) : utf8_pos :=
|
||||
let c := utf8_get s p in
|
||||
p + csize c
|
||||
|
||||
@[extern cpp "lean::string_utf8_at_end"]
|
||||
def utf8_at_end : (@& string) → utf8_pos → bool
|
||||
| s p := p ≥ utf8_byte_size s
|
||||
|
||||
private def utf8_extract_aux₂ : list char → usize → usize → list char
|
||||
| [] _ _ := []
|
||||
| (c::cs) i e := if i = e then [] else c :: utf8_extract_aux₂ cs (i + csize c) e
|
||||
|
||||
private def utf8_extract_aux₁ : list char → usize → usize → usize → list char
|
||||
| [] _ _ _ := []
|
||||
| s@(c::cs) i b e := if i = b then utf8_extract_aux₂ s i e else utf8_extract_aux₁ cs (i + csize c) b e
|
||||
|
||||
@[extern cpp "lean::string_utf8_extract"]
|
||||
def utf8_extract : (@& string) → utf8_pos → utf8_pos → string
|
||||
| ⟨s⟩ b e := if b ≥ e then ⟨[]⟩ else ⟨utf8_extract_aux₁ s 0 b e⟩
|
||||
|
||||
/- In the VM, the string iterator is implemented as a pointer to the string being iterated + index.
|
||||
TODO: mark it opaque. -/
|
||||
structure iterator :=
|
||||
|
|
|
|||
|
|
@ -177,7 +177,7 @@ auto scanner::read_char() -> token_kind {
|
|||
next();
|
||||
return token_kind::Char;
|
||||
}
|
||||
if (optional<unsigned> sz = is_utf8_first_byte(c)) {
|
||||
if (optional<unsigned> sz = get_utf8_first_byte_opt(c)) {
|
||||
m_buffer.clear();
|
||||
for (unsigned i = 0; i < *sz; i++) {
|
||||
m_buffer += c;
|
||||
|
|
|
|||
|
|
@ -179,7 +179,7 @@ static bool is_unshared_it_vm_string(vm_obj const & it) {
|
|||
}
|
||||
|
||||
static unsigned get_utf8_char_size_at(std::string const & s, unsigned i) {
|
||||
if (auto sz = is_utf8_first_byte(s[i])) {
|
||||
if (auto sz = get_utf8_first_byte_opt(s[i])) {
|
||||
return *sz;
|
||||
} else {
|
||||
return 1;
|
||||
|
|
@ -247,7 +247,7 @@ vm_obj string_iterator_prev(vm_obj const & it) {
|
|||
/* we have to walk at most 4 steps backwards */
|
||||
for (unsigned j = 0; j < 4; j++) {
|
||||
--new_i;
|
||||
if (is_utf8_first_byte(s.m_value[new_i])) {
|
||||
if (get_utf8_first_byte_opt(s.m_value[new_i])) {
|
||||
return update_vm_constructor(update_vm_constructor(it, 1, mk_vm_nat(new_i)), 2, mk_vm_nat(r + 1));;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1523,6 +1523,90 @@ obj_res string_data(obj_arg s) {
|
|||
return string_to_list_core(tmp);
|
||||
}
|
||||
|
||||
uint32 string_utf8_get(b_obj_arg s, usize i) {
|
||||
char const * str = string_cstr(s);
|
||||
usize size = string_size(s) - 1;
|
||||
if (i >= string_size(s) - 1)
|
||||
return char_default_value();
|
||||
unsigned c = static_cast<unsigned char>(str[i]);
|
||||
/* zero continuation (0 to 127) */
|
||||
if ((c & 0x80) == 0) {
|
||||
i++;
|
||||
return c;
|
||||
}
|
||||
|
||||
/* one continuation (128 to 2047) */
|
||||
if ((c & 0xe0) == 0xc0 && i + 1 < size) {
|
||||
unsigned c1 = static_cast<unsigned char>(str[i+1]);
|
||||
unsigned r = ((c & 0x1f) << 6) | (c1 & 0x3f);
|
||||
if (r >= 128) {
|
||||
i += 2;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
/* two continuations (2048 to 55295 and 57344 to 65535) */
|
||||
if ((c & 0xf0) == 0xe0 && i + 2 < size) {
|
||||
unsigned c1 = static_cast<unsigned char>(str[i+1]);
|
||||
unsigned c2 = static_cast<unsigned char>(str[i+2]);
|
||||
unsigned r = ((c & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
|
||||
if (r >= 2048 && (r < 55296 || r > 57343)) {
|
||||
i += 3;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
/* three continuations (65536 to 1114111) */
|
||||
if ((c & 0xf8) == 0xf0 && i + 3 < size) {
|
||||
unsigned c1 = static_cast<unsigned char>(str[i+1]);
|
||||
unsigned c2 = static_cast<unsigned char>(str[i+2]);
|
||||
unsigned c3 = static_cast<unsigned char>(str[i+3]);
|
||||
unsigned r = ((c & 0x07) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f);
|
||||
if (r >= 65536 && r <= 1114111) {
|
||||
i += 4;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
/* invalid UTF-8 encoded string */
|
||||
return char_default_value();
|
||||
}
|
||||
|
||||
/* The reference implementation is:
|
||||
```
|
||||
def utf8_next (s : @& string) (p : utf8_pos) : utf8_pos :=
|
||||
let c := utf8_get s p in
|
||||
p + csize c
|
||||
```
|
||||
*/
|
||||
usize string_utf8_next(b_obj_arg s, usize i) {
|
||||
char const * str = string_cstr(s);
|
||||
usize size = string_size(s) - 1;
|
||||
/* `csize c` is 1 when `i` is not a valid position in the reference implementation. */
|
||||
if (i >= size) return i+1;
|
||||
unsigned c = static_cast<unsigned char>(str[i]);
|
||||
if ((c & 0x80) == 0) return i+1;
|
||||
if ((c & 0xe0) == 0xc0) return i + 2;
|
||||
if ((c & 0xf0) == 0xe0) return i + 3;
|
||||
if ((c & 0xf8) == 0xf0) return i + 4;
|
||||
/* invalid UTF-8 encoded string */
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
obj_res string_utf8_extract(b_obj_arg s, usize b, usize e) {
|
||||
usize sz = string_size(s) - 1;
|
||||
if (b >= e || b >= sz) return mk_string("");
|
||||
if (e > sz) e = sz;
|
||||
lean_assert(b < e);
|
||||
usize new_sz = e - b;
|
||||
lean_assert(new_sz > 0);
|
||||
obj_res r = alloc_string(new_sz+1, new_sz+1, 0);
|
||||
memcpy(w_string_cstr(r), string_cstr(s) + b, new_sz);
|
||||
w_string_cstr(r)[new_sz] = 0;
|
||||
to_string(r)->m_length = utf8_strlen(w_string_cstr(r), new_sz);
|
||||
return r;
|
||||
}
|
||||
|
||||
/* `pos` is in bytes, and `remaining` is in characters */
|
||||
static obj_res mk_iterator(obj_arg s, size_t pos, size_t remaining) {
|
||||
obj_res r = alloc_cnstr(0, 1, sizeof(size_t)*2);
|
||||
|
|
@ -1543,7 +1627,7 @@ static uint32 mk_default_char() { return 65; }
|
|||
static bool is_unshared_it_string(b_obj_arg it) { return is_exclusive(it) && !is_shared(cnstr_get(it, 0)); }
|
||||
|
||||
static unsigned get_utf8_char_size_at(std::string const & s, unsigned i) {
|
||||
if (auto sz = is_utf8_first_byte(s[i])) {
|
||||
if (auto sz = get_utf8_first_byte_opt(s[i])) {
|
||||
return *sz;
|
||||
} else {
|
||||
return 1;
|
||||
|
|
@ -1622,7 +1706,7 @@ obj_res string_iterator_prev(obj_arg it) {
|
|||
/* we have to walk at most 4 steps backwards */
|
||||
for (unsigned j = 0; j < 4; j++) {
|
||||
--new_i;
|
||||
if (is_utf8_first_byte(string_cstr(s)[new_i])) {
|
||||
if (get_utf8_first_byte_opt(string_cstr(s)[new_i])) {
|
||||
if (is_exclusive(it)) {
|
||||
it_set_pos(it, new_i);
|
||||
it_set_remaining(it, r+1);
|
||||
|
|
|
|||
|
|
@ -1121,6 +1121,8 @@ inline obj_res mk_option_some(obj_arg v) { obj_res r = alloc_cnstr(1, 1, 0); cns
|
|||
// =======================================
|
||||
// String
|
||||
|
||||
/* instance : inhabited char := ⟨'A'⟩ */
|
||||
inline uint32 char_default_value() { return 'A'; }
|
||||
inline obj_res alloc_string(size_t size, size_t capacity, size_t len) {
|
||||
LEAN_RUNTIME_STAT_CODE(g_num_string++);
|
||||
return new (alloc_heap_object(string_byte_size(capacity))) string_object(size, capacity, len); // NOLINT
|
||||
|
|
@ -1136,6 +1138,11 @@ obj_res string_append(obj_arg s1, b_obj_arg s2);
|
|||
inline obj_res string_length(b_obj_arg s) { return nat_of_size_t(string_len(s)); }
|
||||
obj_res string_mk(obj_arg cs);
|
||||
obj_res string_data(obj_arg s);
|
||||
inline usize string_utf8_byte_size(b_obj_arg s) { return string_size(s) - 1; }
|
||||
uint32 string_utf8_get(b_obj_arg s, usize i);
|
||||
usize string_utf8_next(b_obj_arg s, usize i);
|
||||
inline uint8 string_utf8_at_end(b_obj_arg s, usize i) { return i >= string_size(s) - 1; }
|
||||
obj_res string_utf8_extract(b_obj_arg s, usize b, usize e);
|
||||
obj_res string_mk_iterator(obj_arg s);
|
||||
uint32 string_iterator_curr(b_obj_arg it);
|
||||
obj_res string_iterator_set_curr(obj_arg it, uint32 c);
|
||||
|
|
|
|||
|
|
@ -42,10 +42,10 @@ size_t utf8_strlen(char const * str) {
|
|||
return r;
|
||||
}
|
||||
|
||||
size_t utf8_strlen(std::string const & str) {
|
||||
size_t utf8_strlen(char const * str, size_t sz) {
|
||||
size_t r = 0;
|
||||
size_t i = 0;
|
||||
while (i < str.size()) {
|
||||
while (i < sz) {
|
||||
unsigned d = get_utf8_size(str[i]);
|
||||
r++;
|
||||
i += d;
|
||||
|
|
@ -53,6 +53,10 @@ size_t utf8_strlen(std::string const & str) {
|
|||
return r;
|
||||
}
|
||||
|
||||
size_t utf8_strlen(std::string const & str) {
|
||||
return utf8_strlen(str.data(), str.size());
|
||||
}
|
||||
|
||||
optional<size_t> utf8_char_pos(char const * str, size_t char_idx) {
|
||||
size_t r = 0;
|
||||
while (*str != 0) {
|
||||
|
|
@ -132,7 +136,7 @@ zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx |
|
|||
000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx
|
||||
*/
|
||||
|
||||
optional<unsigned> is_utf8_first_byte(unsigned char c) {
|
||||
optional<unsigned> get_utf8_first_byte_opt(unsigned char c) {
|
||||
if ((c & 0x80) == 0) {
|
||||
/* 0xxxxxxx */
|
||||
return optional<unsigned>(1);
|
||||
|
|
|
|||
|
|
@ -19,6 +19,9 @@ size_t utf8_strlen(char const * str);
|
|||
/* Return the length of the string `str` encoded using UTF8.
|
||||
`str` may contain null characters. */
|
||||
size_t utf8_strlen(std::string const & str);
|
||||
/* Return the length of the string `str` encoded using UTF8.
|
||||
`str` may contain null characters. */
|
||||
size_t utf8_strlen(char const * str, size_t sz);
|
||||
optional<size_t> utf8_char_pos(char const * str, size_t char_idx);
|
||||
char const * get_utf8_last_char(char const * str);
|
||||
std::string utf8_trim(std::string const & s);
|
||||
|
|
@ -31,7 +34,7 @@ inline unsigned utf8_to_unicode(char const * begin, char const * end) {
|
|||
/* If `c` is the first byte of an utf-8 encoded unicode scalar value,
|
||||
then return `some(n)` where `n` is the number of bytes needed to encode
|
||||
the unicode scalar value. Otherwise, return `none` */
|
||||
optional<unsigned> is_utf8_first_byte(unsigned char c);
|
||||
optional<unsigned> get_utf8_first_byte_opt(unsigned char c);
|
||||
|
||||
/* "Read" next unicode character starting at position i in a string using UTF-8 encoding.
|
||||
Return the unicode character and update i. */
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue