From d8abaa05f9c02733e126afb457fadcf1d297e4da Mon Sep 17 00:00:00 2001
From: Andrew Belt <andrewpbelt@gmail.com>
Date: Sun, 15 Dec 2024 16:13:22 -0500
Subject: [PATCH] Add string::UTF32toUTF8(), UTF8toUTF32(),
 UTF8NextCodepoint(), and UTF8PrevCodepoint().

---
 include/string.hpp |  17 +++++
 src/string.cpp     | 151 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+)

diff --git a/include/string.hpp b/include/string.hpp
index 495828b6..bfd46183 100644
--- a/include/string.hpp
+++ b/include/string.hpp
@@ -88,6 +88,23 @@ std::vector<std::string> split(const std::string& s, const std::string& seperato
 std::string formatTime(const char* format, double timestamp);
 std::string formatTimeISO(double timestamp);
 
+// Unicode functions
+/** Converts a UTF-32 string to a UTF-8 string.
+Codepoints greater than 0x10FFFF are invalid and are left out of the result.
+*/
+std::string UTF32toUTF8(const std::u32string& s32);
+/** Converts a UTF-8 string to a UTF-32 string.
+Invalid, overlong, and surrogate UTF-8 sequences are left out of the result. Conversion stops at a multi-byte sequence that is cut off by the end of the string.
+*/
+std::u32string UTF8toUTF32(const std::string& s8);
+/** Returns the byte index of the codepoint that follows the one starting at byte index i in a valid UTF-8 string.
+i must be the start of a codepoint. The result never exceeds s8.size().
+*/
+size_t UTF8NextCodepoint(const std::string& s8, size_t i);
+/** Returns the byte index of the codepoint that precedes byte index i in a valid UTF-8 string.
+i must be the start of a codepoint. Returns 0 if i is 0.
+*/
+size_t UTF8PrevCodepoint(const std::string& s8, size_t i);
 
 #if defined ARCH_WIN
 /** Performs a Unicode string conversion from UTF-16 to UTF-8.
diff --git a/src/string.cpp b/src/string.cpp
index 1241340f..8cccd933 100644
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -246,6 +246,157 @@ std::string formatTimeISO(double timestamp) {
 }
 
 
+// Encodes each codepoint of s32 as a 1- to 4-byte UTF-8 sequence. Codepoints above 0x10FFFF are dropped.
+std::string UTF32toUTF8(const std::u32string& s32) {
+	std::string out;
+	// Reserve the worst case of 4 bytes per codepoint
+	out.reserve(s32.length() * 4);
+	for (const char32_t cp : s32) {
+		if (cp > 0x10FFFF) {
+			// Not a Unicode codepoint: ignore it
+			continue;
+		}
+		if (cp <= 0x7F) {
+			// 7 bits: 0b0xxxxxxx
+			out.push_back(static_cast<char>(cp));
+			continue;
+		}
+		if (cp <= 0x7FF) {
+			// 11 bits: 0b110xxxxx 0b10xxxxxx
+			out.push_back(static_cast<char>(0xC0 | ((cp >> 6) & 0x1F)));
+			out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
+			continue;
+		}
+		if (cp <= 0xFFFF) {
+			// 16 bits: 0b1110xxxx followed by two 0b10xxxxxx bytes
+			out.push_back(static_cast<char>(0xE0 | ((cp >> 12) & 0x0F)));
+			out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
+			out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
+			continue;
+		}
+		// 21 bits: 0b11110xxx followed by three 0b10xxxxxx bytes
+		out.push_back(static_cast<char>(0xF0 | ((cp >> 18) & 0x07)));
+		out.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
+		out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
+		out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
+	}
+	out.shrink_to_fit();
+	return out;
+}
+
+
+// Decodes s8 one sequence at a time. The index always advances past a rejected
+// sequence, so malformed input cannot stall the loop.
+std::u32string UTF8toUTF32(const std::string& s8) {
+	std::u32string s32;
+	// Every codepoint occupies at least one byte, so this is the maximum possible size
+	s32.reserve(s8.size());
+
+	for (size_t i = 0; i < s8.size();) {
+		const unsigned char first = s8[i];
+		// Number of bytes in the sequence, as announced by the first byte
+		size_t size;
+		// Smallest codepoint that needs this many bytes; anything lower is overlong
+		char32_t minimum;
+		// Codepoint bits collected so far
+		char32_t c32;
+
+		if ((first & 0x80) == 0x00) {
+			// 1-byte sequence 0b0xxxxxxx
+			size = 1;
+			minimum = 0x00;
+			c32 = first;
+		}
+		else if ((first & 0xE0) == 0xC0) {
+			// 2-byte sequence 0b110xxxxx
+			size = 2;
+			minimum = 0x80;
+			c32 = first & 0x1F;
+		}
+		else if ((first & 0xF0) == 0xE0) {
+			// 3-byte sequence 0b1110xxxx
+			size = 3;
+			minimum = 0x800;
+			c32 = first & 0x0F;
+		}
+		else if ((first & 0xF8) == 0xF0) {
+			// 4-byte sequence 0b11110xxx
+			size = 4;
+			minimum = 0x10000;
+			c32 = first & 0x07;
+		}
+		else {
+			// Invalid first byte (stray continuation byte or 0xF8..0xFF): skip it
+			i++;
+			continue;
+		}
+
+		// Stop at a sequence that is cut off by the end of the string
+		if (i + size > s8.size())
+			break;
+
+		// Fold in the continuation bytes, each of which must be 0b10xxxxxx
+		size_t read = 1;
+		for (; read < size; read++) {
+			const unsigned char next = s8[i + read];
+			if ((next & 0xC0) != 0x80)
+				break;
+			c32 = (c32 << 6) | (next & 0x3F);
+		}
+		// Advance before validating. On a bad continuation byte, only the bytes
+		// before it are consumed, so decoding resynchronizes on that byte.
+		i += read;
+		// Drop incomplete, overlong, surrogate, and out-of-range sequences
+		if (read < size || c32 < minimum || (c32 >= 0xD800 && c32 <= 0xDFFF) || c32 > 0x10FFFF)
+			continue;
+		s32.push_back(c32);
+	}
+
+	s32.shrink_to_fit();
+	return s32;
+}
+
+
+// Returns the index just past the codepoint that starts at i, clamped to s8.size().
+size_t UTF8NextCodepoint(const std::string& s8, size_t i) {
+	const size_t end = s8.size();
+	// Out of bounds
+	if (i >= end)
+		return end;
+	const unsigned char first = s8[i];
+	// Sequence length announced by the first byte. 0 means the index is returned unchanged.
+	size_t size = 0;
+	if (first == 0x00)
+		size = 0; // null terminator
+	else if ((first & 0x80) == 0x00)
+		size = 1; // 0b0xxxxxxx
+	else if ((first & 0xe0) == 0xc0)
+		size = 2; // 0b110xxxxx
+	else if ((first & 0xf0) == 0xe0)
+		size = 3; // 0b1110xxxx
+	else if ((first & 0xf8) == 0xf0)
+		size = 4; // 0b11110xxx
+	// Any other value is an invalid first UTF-8 byte and leaves size at 0
+	return std::min(i + size, end);
+}
+
+
+// Steps back from i over up to 3 continuation bytes to the first byte of the previous codepoint.
+size_t UTF8PrevCodepoint(const std::string& s8, size_t i) {
+	// Indices past the end are clamped to the end of the string and not moved back further
+	if (i > s8.size())
+		return s8.size();
+	// A codepoint is at most 4 bytes: one first byte followed by up to 3 continuation bytes 0b10xxxxxx
+	for (size_t j = 1; j <= 3 && i > 0; j++) {
+		i--;
+		if ((s8[i] & 0xc0) != 0x80)
+			return i;
+	}
+	// Only continuation bytes were seen: the byte before them (if any) starts the 4-byte sequence
+	return (i > 0) ? i - 1 : 0;
+}
+
+
 #if defined ARCH_WIN
 std::string UTF16toUTF8(const std::wstring& w) {
 	if (w.empty())