From 12430710ba95383ed7bec336de02ef9ed1b9f30c Mon Sep 17 00:00:00 2001 From: Andrew Belt Date: Mon, 10 Feb 2025 01:07:27 -0500 Subject: [PATCH] Add string::UTF8Length(), UTF8CodepointIndex(), and UTF8CodepointPos(). Revise string::UTF8* docs. --- include/string.hpp | 29 +++++++++++++++++----- src/string.cpp | 62 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 19 deletions(-) diff --git a/include/string.hpp b/include/string.hpp index 7817e7bb..d68e1596 100644 --- a/include/string.hpp +++ b/include/string.hpp @@ -16,6 +16,7 @@ The wrapper template function below automatically converts all arguments (includ __attribute__((format(printf, 1, 2))) std::string f(const char* format, ...); std::string fV(const char* format, va_list args); + // Converts std::string arguments of f() to `const char*` template T convertFArg(const T& t) {return t;} @@ -101,14 +102,30 @@ std::string UTF32toUTF8(const std::u32string& s32); Skips invalid, overlong, and surrogate pair UTF-8 sequences. */ std::u32string UTF8toUTF32(const std::string& s8); -/** Finds the byte index of the next codepoint in a valid UTF-8 string. -i must be the start of a codepoint. +/** Finds the byte position of the next codepoint in a valid UTF-8 string. +pos is the byte position of the start of a codepoint. +Returns s8.size() if given codepoint is the last. +*/ +size_t UTF8NextCodepoint(const std::string& s8, size_t pos); +/** Finds the byte position of the previous codepoint in a valid UTF-8 string. +pos is the byte position of the start of a codepoint. +Returns 0 if given codepoint is the first. +*/ +size_t UTF8PrevCodepoint(const std::string& s8, size_t pos); +/** Returns the number of codepoints in a valid UTF-8 string. +O(len) time +*/ +size_t UTF8Length(const std::string& s8); +/** Returns a codepoint's index in a valid UTF-8 string. +pos is the byte position of the start of a codepoint. +O(pos) time */ -size_t UTF8NextCodepoint(const std::string& s8, size_t i); -/** Finds the byte index of the previous codepoint in a valid UTF-8 string. -i must be the start of a codepoint. +size_t UTF8CodepointIndex(const std::string& s8, size_t pos); +/** Returns a codepoint's byte position in a valid UTF-8 string. +Returns s8.size() if index is beyond the last codepoint. +O(index) time */ -size_t UTF8PrevCodepoint(const std::string& s8, size_t i); +size_t UTF8CodepointPos(const std::string& s8, size_t index); #if defined ARCH_WIN /** Performs a Unicode string conversion from UTF-16 to UTF-8. diff --git a/src/string.cpp b/src/string.cpp index bbb73138..d0c1712f 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -421,38 +421,74 @@ static size_t UTF8CodepointSize(char c) { } -size_t UTF8NextCodepoint(const std::string& s8, size_t i) { +size_t UTF8NextCodepoint(const std::string& s8, size_t pos) { // Check out of bounds - if (i >= s8.size()) + if (pos >= s8.size()) return s8.size(); - size_t size = UTF8CodepointSize(s8[i]); + size_t size = UTF8CodepointSize(s8[pos]); // Check for continuation byte 0b10xxxxxx // if ((s8[1] & 0xc0) != 0x80) return 0; // if ((s8[2] & 0xc0) != 0x80) return 0; // if ((s8[3] & 0xc0) != 0x80) return 0; - return std::min(i + size, s8.size()); + return std::min(pos + size, s8.size()); } /** Finds the byte index of the front of a codepoint by reversing until a non-continuation byte is found. */ -static size_t UTF8StartCodepoint(const std::string& s8, size_t i) { +static size_t UTF8StartCodepoint(const std::string& s8, size_t pos) { // Check out of bounds - if (i >= s8.size()) + if (pos >= s8.size()) return s8.size(); - while (i > 0) { + while (pos > 0) { // Check for continuation byte 0b10xxxxxx - if ((s8[i] & 0xc0) != 0x80) + if ((s8[pos] & 0xc0) != 0x80) break; - i--; + pos--; } - return i; + return pos; } -size_t UTF8PrevCodepoint(const std::string& s8, size_t i) { - if (i == 0) +size_t UTF8PrevCodepoint(const std::string& s8, size_t pos) { + if (pos == 0) return 0; - return UTF8StartCodepoint(s8, i - 1); + return UTF8StartCodepoint(s8, pos - 1); +} + + +size_t UTF8Length(const std::string& s8) { + return UTF8CodepointIndex(s8, s8.size()); +} + + +size_t UTF8CodepointIndex(const std::string& s8, size_t endPos) { + size_t pos = 0; + size_t index = 0; + endPos = std::min(endPos, s8.size()); + while (pos < endPos) { + size_t newPos = UTF8NextCodepoint(s8, pos); + // Check if codepoint is invalid + if (pos == newPos) + return index; + pos = newPos; + index++; + } + return index; +} + + +size_t UTF8CodepointPos(const std::string& s8, size_t endIndex) { + size_t pos = 0; + size_t index = 0; + while (index < endIndex && pos < s8.size()) { + size_t newPos = UTF8NextCodepoint(s8, pos); + // Check if codepoint is invalid + if (pos == newPos) + return pos; + pos = newPos; + index++; + } + return pos; }