feat: String.Pos.isValid (#3959)

This adds a function that can be used to check whether a position is on
a UTF-8 byte boundary.
This commit is contained in:
Mario Carneiro 2024-04-20 10:57:35 -04:00 committed by GitHub
parent 291bb84c97
commit aeacb7b69e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 23 additions and 0 deletions

View file

@ -44,6 +44,16 @@ def append : String → (@& String) → String
def toList (s : String) : List Char :=
s.data
/-- Returns true if `p` is a valid UTF-8 position in the string `s`, meaning that `p ≤ s.endPos`
and `p` lies on a UTF-8 character boundary. This has an O(1) implementation in the runtime. -/
@[extern "lean_string_is_valid_pos"]
def Pos.isValid (s : @&String) (p : @& Pos) : Bool :=
go s.data 0
where
go : List Char → Pos → Bool
| [], i => i = p
| c::cs, i => if i = p then true else go cs (i + c)
def utf8GetAux : List Char → Pos → Pos → Char
| [], _, _ => default
| c::cs, i, p => if i = p then c else utf8GetAux cs (i + c) p

View file

@ -1922,6 +1922,19 @@ static inline bool is_utf8_first_byte(unsigned char c) {
return (c & 0x80) == 0 || (c & 0xe0) == 0xc0 || (c & 0xf0) == 0xe0 || (c & 0xf8) == 0xf0;
}
extern "C" LEAN_EXPORT uint8 lean_string_is_valid_pos(b_obj_arg s, b_obj_arg i0) {
if (!lean_is_scalar(i0)) {
/* See comment at string_utf8_get */
return false;
}
usize i = lean_unbox(i0);
usize sz = lean_string_size(s) - 1;
if (i > sz) return false;
if (i == sz) return true;
char const * str = lean_string_cstr(s);
return is_utf8_first_byte(str[i]);
}
extern "C" LEAN_EXPORT obj_res lean_string_utf8_extract(b_obj_arg s, b_obj_arg b0, b_obj_arg e0) {
if (!lean_is_scalar(b0) || !lean_is_scalar(e0)) {
/* See comment at string_utf8_get */