Browse Source

Refactor string::UTF8NextCodepoint() with a static UTF8CodepointSize() function.

tags/v2.6.1
Andrew Belt 4 months ago
parent
commit
67cc5011b1
1 changed file with 21 additions and 16 deletions
  1. +21
    -16
      src/string.cpp

+ 21
- 16
src/string.cpp View File

@@ -322,7 +322,7 @@ std::u32string UTF8toUTF32(const std::string& s8) {
// Ignore overlong sequence
if (c32 < 0x800)
continue;
// Validate surrogate pairs range
// Ignore surrogate pairs
if (c32 >= 0xD800 && c32 <= 0xDFFF)
continue;
}
@@ -336,10 +336,10 @@ std::u32string UTF8toUTF32(const std::string& s8) {
(char32_t(s8[i + 3]) & 0x3F);
size = 4;

// Validate minimum value for 4-byte sequence
// Ignore overlong sequence
if (c32 < 0x10000)
continue;
// Validate maximum Unicode code point
// Ignore codepoints beyond Unicode maximum
if (c32 > 0x10FFFF)
continue;
}
@@ -357,27 +357,32 @@ std::u32string UTF8toUTF32(const std::string& s8) {
}


/** Returns the number of bytes in the UTF-8 sequence announced by lead byte `c`.
Returns 0 if `c` is a null byte or is not a valid UTF-8 lead byte
(i.e. a continuation byte 0b10xxxxxx or anything matching 0b11111xxx).
*/
static size_t UTF8CodepointSize(char c) {
	// Work on the raw byte value so the shifts below are independent of `char` signedness.
	const unsigned char lead = static_cast<unsigned char>(c);
	// Null terminator carries no codepoint
	if (lead == 0)
		return 0;
	// The high bits of the lead byte encode the sequence length.
	// 0b0xxxxxxx
	if ((lead >> 7) == 0x00)
		return 1;
	// 0b110xxxxx
	if ((lead >> 5) == 0x06)
		return 2;
	// 0b1110xxxx
	if ((lead >> 4) == 0x0e)
		return 3;
	// 0b11110xxx
	if ((lead >> 3) == 0x1e)
		return 4;
	// Not a valid first byte of a UTF-8 sequence
	return 0;
}


/** Returns the byte index just past the UTF-8 sequence that starts at byte `i` of `s8`.

- If `i` is at or beyond the end of the string, returns `s8.size()`.
- If the byte at `i` is a null byte or not a valid UTF-8 lead byte, returns `i` unchanged,
  so callers iterating with this function must handle the no-progress case.
- The result is clamped to `s8.size()`, so a sequence truncated by the end of the string
  never produces an out-of-range index.

Only the lead byte is inspected; continuation bytes (0b10xxxxxx) are not validated.
*/
size_t UTF8NextCodepoint(const std::string& s8, size_t i) {
	// Check out of bounds
	if (i >= s8.size())
		return s8.size();
	// The lead byte alone determines the sequence length.
	// UTF8CodepointSize() yields 0 for a null or invalid lead byte, which leaves `i` unchanged below.
	const size_t size = UTF8CodepointSize(s8[i]);
	// Clamp in case the string ends in the middle of a multi-byte sequence.
	return std::min(i + size, s8.size());
}




Loading…
Cancel
Save