From 67cc5011b12b9cd7de8a942a0f0aac685afaeacd Mon Sep 17 00:00:00 2001 From: Andrew Belt Date: Mon, 23 Dec 2024 20:05:03 -0500 Subject: [PATCH] Refactor string::UTF8NextCodepoint() with a static UTF8CodepointSize() function. --- src/string.cpp | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/string.cpp b/src/string.cpp index 8cccd933..bdd0f437 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -322,7 +322,7 @@ std::u32string UTF8toUTF32(const std::string& s8) { // Ignore overlong sequence if (c32 < 0x800) continue; - // Validate surrogate pairs range + // Ignore surrogate pairs if (c32 >= 0xD800 && c32 <= 0xDFFF) continue; } @@ -336,10 +336,10 @@ std::u32string UTF8toUTF32(const std::string& s8) { (char32_t(s8[i + 3]) & 0x3F); size = 4; - // Validate minimum value for 4-byte sequence + // Ignore overlong sequence if (c32 < 0x10000) continue; - // Validate maximum Unicode code point + // Ignore codepoints beyond Unicode maximum if (c32 > 0x10FFFF) continue; } @@ -357,27 +357,32 @@ std::u32string UTF8toUTF32(const std::string& s8) { } +static size_t UTF8CodepointSize(char c) { + if (!c) return 0; + // First byte signals size + // 0b0xxxxxxx + if ((c & 0x80) == 0x00) return 1; + // 0b110xxxxx + if ((c & 0xe0) == 0xc0) return 2; + // 0b1110xxxx + if ((c & 0xf0) == 0xe0) return 3; + // 0b11110xxx + if ((c & 0xf8) == 0xf0) return 4; + // Invalid first UTF-8 byte + return 0; +} + + size_t UTF8NextCodepoint(const std::string& s8, size_t i) { // Check out of bounds if (i >= s8.size()) return s8.size(); - // Check if null terminator - if (!s8[i]) return i; - // First byte signals size - // 0b0xxxxxxx - if ((s8[i] & 0x80) == 0x00) return std::min(i + 1, s8.size()); + size_t size = UTF8CodepointSize(s8[i]); // Check for continuation byte 0b10xxxxxx // if ((s8[1] & 0xc0) != 0x80) return 0; - // 0b110xxxxx - if ((s8[i] & 0xe0) == 0xc0) return std::min(i + 2, s8.size()); // if ((s8[2] & 0xc0) != 0x80) return 0; - // 0b1110xxxx - if ((s8[i] & 0xf0) == 0xe0) return std::min(i + 3, s8.size()); // if ((s8[3] & 0xc0) != 0x80) return 0; - // 0b11110xxx - if ((s8[i] & 0xf8) == 0xf0) return std::min(i + 4, s8.size()); - // Invalid first UTF-8 byte - return i; + return std::min(i + size, s8.size()); }