diff --git a/src/runtime/object.cpp b/src/runtime/object.cpp index e95f9efaa1..a9c92ec7da 100644 --- a/src/runtime/object.cpp +++ b/src/runtime/object.cpp @@ -1593,11 +1593,23 @@ usize string_utf8_next(b_obj_arg s, usize i) { return i + 1; } +static inline bool is_utf8_first_byte(unsigned char c) { + return (c & 0x80) == 0 || (c & 0xe0) == 0xc0 || (c & 0xf0) == 0xe0 || (c & 0xf8) == 0xf0; +} + obj_res string_utf8_extract(b_obj_arg s, usize b, usize e) { + char const * str = string_cstr(s); usize sz = string_size(s) - 1; if (b >= e || b >= sz) return mk_string(""); + /* In the reference implementation if `b` is not pointing to a valid UTF8 + character start position, the result is the empty string. */ + if (!is_utf8_first_byte(str[b])) return mk_string(""); if (e > sz) e = sz; lean_assert(b < e); + lean_assert(e > 0); + /* In the reference implementation if `e` is not pointing to a valid UTF8 + character start position, it is assumed to be at the end. */ + if (e < sz && !is_utf8_first_byte(str[e])) e = sz; usize new_sz = e - b; lean_assert(new_sz > 0); obj_res r = alloc_string(new_sz+1, new_sz+1, 0); diff --git a/src/runtime/utf8.cpp b/src/runtime/utf8.cpp index c8cff31369..4729f28f31 100644 --- a/src/runtime/utf8.cpp +++ b/src/runtime/utf8.cpp @@ -29,7 +29,7 @@ unsigned get_utf8_size(unsigned char c) { else if (c == 0xFF) return 1; else - return 0; + return 1; /* invalid */ } size_t utf8_strlen(char const * str) { diff --git a/tests/compiler/str.lean b/tests/compiler/str.lean index c6bbdef34f..26c6efae67 100644 --- a/tests/compiler/str.lean +++ b/tests/compiler/str.lean @@ -6,7 +6,15 @@ def show_chars : nat → string → string.utf8_pos → io unit show_chars n s (s.utf8_next idx) def main : io uint32 := -let s₁ := "hello α world β" in -io.println' (to_string s₁.utf8_byte_size) *> +let s₁ := "hello α_world_β" in +let b := string.utf8_begin in +let e := s₁.utf8_byte_size in +io.println' (s₁.utf8_extract b e) *> +io.println' (s₁.utf8_extract (b+2) e) *> +io.println' (s₁.utf8_extract (b+2) (e-1)) *> +io.println' (s₁.utf8_extract (b+2) (e-2)) *> +io.println' (s₁.utf8_extract (b+7) e) *> +io.println' (s₁.utf8_extract (b+8) e) *> +io.println' (to_string e) *> show_chars s₁.utf8_byte_size.to_nat s₁ 0 *> pure 0 diff --git a/tests/compiler/str.lean.expected.out b/tests/compiler/str.lean.expected.out index 9059a5ca03..f02fe3977e 100644 --- a/tests/compiler/str.lean.expected.out +++ b/tests/compiler/str.lean.expected.out @@ -1,3 +1,9 @@ +hello α_world_β +llo α_world_β +llo α_world_β +llo α_world_ + +_world_β 17 >> h >> e @@ -6,11 +12,11 @@ >> o >> >> α ->> +>> _ >> w >> o >> r >> l >> d ->> +>> _ >> β