From 12430710ba95383ed7bec336de02ef9ed1b9f30c Mon Sep 17 00:00:00 2001
From: Andrew Belt <andrewpbelt@gmail.com>
Date: Mon, 10 Feb 2025 01:07:27 -0500
Subject: [PATCH] Add string::UTF8Length(), UTF8CodepointIndex(), and
 UTF8CodepointPos(). Revise string::UTF8* docs.

---
 include/string.hpp | 29 +++++++++++++++++-----
 src/string.cpp     | 62 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 72 insertions(+), 19 deletions(-)
diff --git a/include/string.hpp b/include/string.hpp
index 7817e7bb..d68e1596 100644
--- a/include/string.hpp
+++ b/include/string.hpp
@@ -16,6 +16,7 @@ The wrapper template function below automatically converts all arguments (includ
 __attribute__((format(printf, 1, 2)))
 std::string f(const char* format, ...);
 std::string fV(const char* format, va_list args);
+
 // Converts std::string arguments of f() to `const char*`
 template<typename T>
 T convertFArg(const T& t) {return t;}
@@ -101,14 +102,30 @@ std::string UTF32toUTF8(const std::u32string& s32);
 Skips invalid, overlong, and surrogate pair UTF-8 sequences.
 */
 std::u32string UTF8toUTF32(const std::string& s8);
-/** Finds the byte index of the next codepoint in a valid UTF-8 string.
-i must be the start of a codepoint.
+/** Finds the byte position of the next codepoint in a valid UTF-8 string.
+pos is the byte position of the start of a codepoint.
+Returns s8.size() if the given codepoint is the last.
+*/
+size_t UTF8NextCodepoint(const std::string& s8, size_t pos);
+/** Finds the byte position of the previous codepoint in a valid UTF-8 string.
+pos is the byte position of the start of a codepoint.
+Returns 0 if the given codepoint is the first.
+*/
+size_t UTF8PrevCodepoint(const std::string& s8, size_t pos);
+/** Returns the number of codepoints in a valid UTF-8 string.
+O(s8.size()) time
+*/
+size_t UTF8Length(const std::string& s8);
+/** Returns the index of the codepoint that starts at byte position pos in a valid UTF-8 string.
+pos is the byte position of the start of a codepoint, and is clamped to s8.size().
+O(pos) time
 */
-size_t UTF8NextCodepoint(const std::string& s8, size_t i);
-/** Finds the byte index of the previous codepoint in a valid UTF-8 string.
-i must be the start of a codepoint.
+size_t UTF8CodepointIndex(const std::string& s8, size_t pos);
+/** Returns the byte position of the codepoint at the given index in a valid UTF-8 string.
+Returns s8.size() if index is beyond the last codepoint.
+O(index) time
 */
-size_t UTF8PrevCodepoint(const std::string& s8, size_t i);
+size_t UTF8CodepointPos(const std::string& s8, size_t index);
 
 #if defined ARCH_WIN
 /** Performs a Unicode string conversion from UTF-16 to UTF-8.
diff --git a/src/string.cpp b/src/string.cpp
index bbb73138..d0c1712f 100644
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -421,38 +421,74 @@ static size_t UTF8CodepointSize(char c) {
 }
 
 
-size_t UTF8NextCodepoint(const std::string& s8, size_t i) {
+size_t UTF8NextCodepoint(const std::string& s8, size_t pos) {
 	// Check out of bounds
-	if (i >= s8.size())
+	if (pos >= s8.size())
 		return s8.size();
-	size_t size = UTF8CodepointSize(s8[i]);
+	size_t size = UTF8CodepointSize(s8[pos]);
 	// Check for continuation byte 0b10xxxxxx
 	// if ((s8[1] & 0xc0) != 0x80) return 0;
 	// if ((s8[2] & 0xc0) != 0x80) return 0;
 	// if ((s8[3] & 0xc0) != 0x80) return 0;
-	return std::min(i + size, s8.size());
+	return std::min(pos + size, s8.size());
 }
 
 
 /** Finds the byte index of the front of a codepoint by reversing until a non-continuation byte is found. */
-static size_t UTF8StartCodepoint(const std::string& s8, size_t i) {
+static size_t UTF8StartCodepoint(const std::string& s8, size_t pos) {
 	// Check out of bounds
-	if (i >= s8.size())
+	if (pos >= s8.size())
 		return s8.size();
-	while (i > 0) {
+	while (pos > 0) {
 		// Check for continuation byte 0b10xxxxxx
-		if ((s8[i] & 0xc0) != 0x80)
+		if ((s8[pos] & 0xc0) != 0x80)
 			break;
-		i--;
+		pos--;
 	}
-	return i;
+	return pos;
 }
 
 
-size_t UTF8PrevCodepoint(const std::string& s8, size_t i) {
-	if (i == 0)
+size_t UTF8PrevCodepoint(const std::string& s8, size_t pos) {
+	if (pos == 0)
 		return 0;
-	return UTF8StartCodepoint(s8, i - 1);
+	return UTF8StartCodepoint(s8, pos - 1);
+}
+
+
+size_t UTF8Length(const std::string& s8) {
+	return UTF8CodepointIndex(s8, s8.size()); // codepoints stepped over from the start up to the end of the string
+}
+
+
+size_t UTF8CodepointIndex(const std::string& s8, size_t endPos) {
+	size_t pos = 0; // byte position of the codepoint currently being visited
+	size_t index = 0; // number of codepoints stepped over so far
+	endPos = std::min(endPos, s8.size()); // clamp so the walk never targets a position past the end
+	while (pos < endPos) {
+		size_t newPos = UTF8NextCodepoint(s8, pos);
+		// No forward progress means the codepoint is invalid; stop here, otherwise the loop would never end
+		if (pos == newPos)
+			return index;
+		pos = newPos;
+		index++;
+	}
+	return index;
+}
+
+
+size_t UTF8CodepointPos(const std::string& s8, size_t endIndex) {
+	size_t pos = 0; // byte position of the codepoint currently being visited
+	size_t index = 0; // number of codepoints stepped over so far
+	while (index < endIndex && pos < s8.size()) { // stop at the requested index or at the end of the string
+		size_t newPos = UTF8NextCodepoint(s8, pos);
+		// No forward progress means the codepoint is invalid; stop here, otherwise the loop would never end
+		if (pos == newPos)
+			return pos;
+		pos = newPos;
+		index++;
+	}
+	return pos;
 }