Add string::UTF8Length(), UTF8CodepointIndex(), and UTF8CodepointPos(). Revise string::UTF8* docs.

3 months ago · 12430710ba
--- a/include/string.hpp
+++ b/include/string.hpp
@@ -16,6 +16,7 @@ The wrapper template function below automatically converts all arguments (includ
 __attribute__((format(printf, 1, 2)))
 std::string f(const char* format, ...);
 std::string fV(const char* format, va_list args);

 // Converts std::string arguments of f() to `const char*`
 template<typename T>
 T convertFArg(const T& t) {
 	// Generic case: the argument needs no conversion, so hand back a copy of it as-is.
 	return t;
 }
@@ -101,14 +102,30 @@ std::string UTF32toUTF8(const std::u32string& s32);
 Skips invalid, overlong, and surrogate pair UTF-8 sequences.
 */
 std::u32string UTF8toUTF32(const std::string& s8);
 /** Finds the byte index of the next codepoint in a valid UTF-8 string.
 i must be the start of a codepoint.
 /** Finds the byte position of the next codepoint in a valid UTF-8 string.
 pos is the byte position of the start of a codepoint.
 Returns s8.size() if given codepoint is the last.
 */
 size_t UTF8NextCodepoint(const std::string& s8, size_t pos);
 /** Finds the byte position of the previous codepoint in a valid UTF-8 string.
 pos is the byte position of the start of a codepoint.
 Returns 0 if given codepoint is the first.
 */
 size_t UTF8PrevCodepoint(const std::string& s8, size_t pos);
 /** Returns the number of codepoints in a valid UTF-8 string.
 O(len) time
 */
 size_t UTF8Length(const std::string& s8);
 /** Returns a codepoint's index in a valid UTF-8 string.
 pos is the byte position of the start of a codepoint.
 O(pos) time
 */
 size_t UTF8NextCodepoint(const std::string& s8, size_t i);
 /** Finds the byte index of the previous codepoint in a valid UTF-8 string.
 i must be the start of a codepoint.
 size_t UTF8CodepointIndex(const std::string& s8, size_t pos);
 /** Returns a codepoint's byte position in a valid UTF-8 string.
 Returns s8.size() if index is beyond the last codepoint.
 O(index) time
 */
 size_t UTF8PrevCodepoint(const std::string& s8, size_t i);
 size_t UTF8CodepointPos(const std::string& s8, size_t index);

 #if defined ARCH_WIN
 /** Performs a Unicode string conversion from UTF-16 to UTF-8.
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -421,38 +421,74 @@ static size_t UTF8CodepointSize(char c) {
 }


 // Returns the byte position that follows the codepoint starting at the given position, never exceeding s8.size().
 size_t UTF8NextCodepoint(const std::string& s8, size_t i) {
 size_t UTF8NextCodepoint(const std::string& s8, size_t pos) {
 	// Positions at or past the end have no next codepoint; report the end of the string
 	if (i >= s8.size())
 	if (pos >= s8.size())
 		return s8.size();
 	// Size reported by UTF8CodepointSize() for the leading byte (presumably the sequence's byte length)
 	size_t size = UTF8CodepointSize(s8[i]);
 	size_t size = UTF8CodepointSize(s8[pos]);
 	// Continuation bytes (0b10xxxxxx) are not validated; the checks below are disabled
 	// if ((s8[1] & 0xc0) != 0x80) return 0;
 	// if ((s8[2] & 0xc0) != 0x80) return 0;
 	// if ((s8[3] & 0xc0) != 0x80) return 0;
 	// Clamp so a sequence cut short by the end of the string cannot yield a position past s8.size()
 	return std::min(i + size, s8.size());
 	return std::min(pos + size, s8.size());
 }


 /** Finds the byte position of the start of a codepoint by stepping backward until a non-continuation byte is found.
 Returns s8.size() if the given position is at or past the end of the string.
 Returns 0 if no non-continuation byte is found before reaching the front of the string.
 */
 static size_t UTF8StartCodepoint(const std::string& s8, size_t i) {
 static size_t UTF8StartCodepoint(const std::string& s8, size_t pos) {
 	// Out-of-bounds positions are reported as the end of the string
 	if (i >= s8.size())
 	if (pos >= s8.size())
 		return s8.size();
 	// Byte 0 is never inspected: reaching it ends the loop and it is returned as the start
 	while (i > 0) {
 	while (pos > 0) {
 		// A byte that is not a continuation byte (0b10xxxxxx) begins the codepoint, so stop here
 		if ((s8[i] & 0xc0) != 0x80)
 		if ((s8[pos] & 0xc0) != 0x80)
 			break;
 		i--;
 		pos--;
 	}
 	return i;
 	return pos;
 }


 // Returns the byte position of the start of the codepoint that precedes the given position.
 size_t UTF8PrevCodepoint(const std::string& s8, size_t i) {
 	if (i == 0)
 size_t UTF8PrevCodepoint(const std::string& s8, size_t pos) {
 	// Nothing precedes the first byte; this also guards the subtraction below against wrapping
 	if (pos == 0)
 		return 0;
 	// Step back one byte, then rewind to the start of the codepoint that contains that byte
 	return UTF8StartCodepoint(s8, i - 1);
 	return UTF8StartCodepoint(s8, pos - 1);
 }


 size_t UTF8Length(const std::string& s8) {
 	// The codepoint count is the codepoint index of the one-past-the-end byte position.
 	const size_t endPos = s8.size();
 	return UTF8CodepointIndex(s8, endPos);
 }


 size_t UTF8CodepointIndex(const std::string& s8, size_t endPos) {
 	// Never scan beyond the end of the string, whatever position was requested
 	const size_t limit = (endPos < s8.size()) ? endPos : s8.size();
 	size_t count = 0;
 	// Hop from codepoint to codepoint, counting one per successful hop
 	for (size_t cursor = 0; cursor < limit; count++) {
 		const size_t next = UTF8NextCodepoint(s8, cursor);
 		// No forward progress is treated as an invalid codepoint: stop without counting it
 		if (next == cursor)
 			break;
 		cursor = next;
 	}
 	return count;
 }


 size_t UTF8CodepointPos(const std::string& s8, size_t endIndex) {
 	size_t cursor = 0;
 	// Advance one codepoint per iteration until endIndex codepoints have been passed
 	for (size_t seen = 0; seen < endIndex; seen++) {
 		// Ran out of string before reaching the requested index
 		if (cursor >= s8.size())
 			break;
 		const size_t next = UTF8NextCodepoint(s8, cursor);
 		// No forward progress is treated as an invalid codepoint: report where the scan stopped
 		if (next == cursor)
 			break;
 		cursor = next;
 	}
 	return cursor;
 }