From aeacb7b69e71c26f77a5fe42455465918dc0ee88 Mon Sep 17 00:00:00 2001 From: Mario Carneiro Date: Sat, 20 Apr 2024 10:57:35 -0400 Subject: [PATCH] feat: String.Pos.isValid (#3959) This adds a function that can be used to check whether a position is on a UTF-8 byte boundary. --- src/Init/Data/String/Basic.lean | 10 ++++++++++ src/runtime/object.cpp | 13 +++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/Init/Data/String/Basic.lean b/src/Init/Data/String/Basic.lean index 96b950ea81..1dbe48e753 100644 --- a/src/Init/Data/String/Basic.lean +++ b/src/Init/Data/String/Basic.lean @@ -44,6 +44,16 @@ def append : String → (@& String) → String def toList (s : String) : List Char := s.data +/-- Returns true if `p` is a valid UTF-8 position in the string `s`, meaning that `p ≤ s.endPos` +and `p` lies on a UTF-8 character boundary. This has an O(1) implementation in the runtime. -/ +@[extern "lean_string_is_valid_pos"] +def Pos.isValid (s : @&String) (p : @& Pos) : Bool := + go s.data 0 +where + go : List Char → Pos → Bool + | [], i => i = p + | c::cs, i => if i = p then true else go cs (i + c) + def utf8GetAux : List Char → Pos → Pos → Char | [], _, _ => default | c::cs, i, p => if i = p then c else utf8GetAux cs (i + c) p diff --git a/src/runtime/object.cpp b/src/runtime/object.cpp index 9fce155fc8..8d9c04b72b 100644 --- a/src/runtime/object.cpp +++ b/src/runtime/object.cpp @@ -1922,6 +1922,19 @@ static inline bool is_utf8_first_byte(unsigned char c) { return (c & 0x80) == 0 || (c & 0xe0) == 0xc0 || (c & 0xf0) == 0xe0 || (c & 0xf8) == 0xf0; } +extern "C" LEAN_EXPORT uint8 lean_string_is_valid_pos(b_obj_arg s, b_obj_arg i0) { + if (!lean_is_scalar(i0)) { + /* See comment at string_utf8_get */ + return false; + } + usize i = lean_unbox(i0); + usize sz = lean_string_size(s) - 1; + if (i > sz) return false; + if (i == sz) return true; + char const * str = lean_string_cstr(s); + return is_utf8_first_byte(str[i]); +} + extern "C" LEAN_EXPORT obj_res lean_string_utf8_extract(b_obj_arg s, b_obj_arg b0, b_obj_arg e0) { if (!lean_is_scalar(b0) || !lean_is_scalar(e0)) { /* See comment at string_utf8_get */