fix(runtime/object): string_utf8_extract

This commit is contained in:
Leonardo de Moura 2019-03-09 12:57:51 -08:00
parent 67944a6c81
commit 01b4983fa2
4 changed files with 31 additions and 5 deletions

View file

@ -1593,11 +1593,23 @@ usize string_utf8_next(b_obj_arg s, usize i) {
return i + 1;
}
static inline bool is_utf8_first_byte(unsigned char c) {
return (c & 0x80) == 0 || (c & 0xe0) == 0xc0 || (c & 0xf0) == 0xe0 || (c & 0xf8) == 0xf0;
}
obj_res string_utf8_extract(b_obj_arg s, usize b, usize e) {
char const * str = string_cstr(s);
usize sz = string_size(s) - 1;
if (b >= e || b >= sz) return mk_string("");
/* In the reference implementation if `b` is not pointing to a valid UTF8
character start position, the result is the empty string. */
if (!is_utf8_first_byte(str[b])) return mk_string("");
if (e > sz) e = sz;
lean_assert(b < e);
lean_assert(e > 0);
/* In the reference implementation if `e` is not pointing to a valid UTF8
character start position, it is assumed to be at the end. */
if (e < sz && !is_utf8_first_byte(str[e])) e = sz;
usize new_sz = e - b;
lean_assert(new_sz > 0);
obj_res r = alloc_string(new_sz+1, new_sz+1, 0);

View file

@ -29,7 +29,7 @@ unsigned get_utf8_size(unsigned char c) {
else if (c == 0xFF)
return 1;
else
return 0;
return 1; /* invalid */
}
size_t utf8_strlen(char const * str) {

View file

@ -6,7 +6,15 @@ def show_chars : nat → string → string.utf8_pos → io unit
show_chars n s (s.utf8_next idx)
def main : io uint32 :=
let s₁ := "hello α world β" in
io.println' (to_string s₁.utf8_byte_size) *>
let s₁ := "hello α_world_β" in
let b := string.utf8_begin in
let e := s₁.utf8_byte_size in
io.println' (s₁.utf8_extract b e) *>
io.println' (s₁.utf8_extract (b+2) e) *>
io.println' (s₁.utf8_extract (b+2) (e-1)) *>
io.println' (s₁.utf8_extract (b+2) (e-2)) *>
io.println' (s₁.utf8_extract (b+7) e) *>
io.println' (s₁.utf8_extract (b+8) e) *>
io.println' (to_string e) *>
show_chars s₁.utf8_byte_size.to_nat s₁ 0 *>
pure 0

View file

@ -1,3 +1,9 @@
hello α_world_β
llo α_world_β
llo α_world_β
llo α_world_
_world_β
17
>> h
>> e
@ -6,11 +12,11 @@
>> o
>>
>> α
>>
>> _
>> w
>> o
>> r
>> l
>> d
>>
>> _
>> β