fix(util/utf8): mask continuation bytes with 0x3f when decoding UTF-8

This commit is contained in:
Sebastian Ullrich 2017-10-27 16:03:36 +02:00 committed by Leonardo de Moura
parent 734ee66514
commit 1a80ea9c8e
3 changed files with 7 additions and 3 deletions

View file

@ -150,7 +150,7 @@ unsigned next_utf8(std::string const & str, size_t & i) {
/* one continuation (128 to 2047) */
if ((c & 0xe0) == 0xc0 && i + 1 < str.size()) {
unsigned c1 = static_cast<unsigned char>(str[i+1]);
-unsigned r = ((c & 0x1f) << 6) | c1;
+unsigned r = ((c & 0x1f) << 6) | (c1 & 0x3f);
if (r >= 128) {
i += 2;
return r;
@ -161,7 +161,7 @@ unsigned next_utf8(std::string const & str, size_t & i) {
if ((c & 0xf0) == 0xe0 && i + 2 < str.size()) {
unsigned c1 = static_cast<unsigned char>(str[i+1]);
unsigned c2 = static_cast<unsigned char>(str[i+2]);
-unsigned r = ((c & 0x0f) << 12) | (c1 << 6) | c2;
+unsigned r = ((c & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
if (r >= 2048 && (r < 55296 || r > 57343)) {
i += 3;
return r;
@ -173,7 +173,7 @@ unsigned next_utf8(std::string const & str, size_t & i) {
unsigned c1 = static_cast<unsigned char>(str[i+1]);
unsigned c2 = static_cast<unsigned char>(str[i+2]);
unsigned c3 = static_cast<unsigned char>(str[i+3]);
-unsigned r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
+unsigned r = ((c & 0x07) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f);
if (r >= 65536 && r <= 1114111) {
i += 4;
return r;

View file

@ -0,0 +1,2 @@
#eval "≠∀Π∃⁻ʰℵ⬝"
#eval "≠∀Π∃⁻ʰℵ⬝".to_list.map char.to_nat

View file

@ -0,0 +1,2 @@
"≠∀Π∃⁻ʰℵ⬝"
[8800, 8704, 928, 8707, 8315, 688, 8501, 11037]