Browse Source

Add string::UTF32toUTF8(), UTF8toUTF32(), UTF8NextCodepoint(), and UTF8PrevCodepoint().

tags/v2.6.1
Andrew Belt 4 months ago
parent
commit
d8abaa05f9
2 changed files with 168 additions and 0 deletions
  1. +17
    -0
      include/string.hpp
  2. +151
    -0
      src/string.cpp

+ 17
- 0
include/string.hpp View File

@@ -88,6 +88,23 @@ std::vector<std::string> split(const std::string& s, const std::string& seperato
std::string formatTime(const char* format, double timestamp);
std::string formatTimeISO(double timestamp);

// Unicode functions
/** Converts a UTF-32 string to a UTF-8 string.
Skips invalid UTF-32 codepoints (greater than 0x10FFFF).
*/
std::string UTF32toUTF8(const std::u32string& s32);
/** Converts a UTF-8 string to a UTF-32 string.
Skips invalid, overlong, and surrogate pair UTF-8 sequences.
*/
std::u32string UTF8toUTF32(const std::string& s8);
/** Finds the byte index of the next codepoint in a valid UTF-8 string.
i must be the start of a codepoint.
*/
size_t UTF8NextCodepoint(const std::string& s8, size_t i);
/** Finds the byte index of the previous codepoint in a valid UTF-8 string.
i must be the start of a codepoint.
*/
size_t UTF8PrevCodepoint(const std::string& s8, size_t i);

#if defined ARCH_WIN
/** Performs a Unicode string conversion from UTF-16 to UTF-8.


+ 151
- 0
src/string.cpp View File

@@ -246,6 +246,157 @@ std::string formatTimeISO(double timestamp) {
}


std::string UTF32toUTF8(const std::u32string& s32) {
std::string s8;
// Pre-allocate maximum possible size
s8.reserve(s32.length() * 4);

for (char32_t c : s32) {
// 7-bit codepoint to 1-byte sequence
if (c <= 0x7F) {
s8.push_back(c);
}
// 11-bit codepoint to 2-byte sequence
else if (c <= 0x7FF) {
s8.push_back(0xC0 | ((c >> 6) & 0x1F));
s8.push_back(0x80 | (c & 0x3F));
}
// 16-bit codepoint to 3-byte sequence
else if (c <= 0xFFFF) {
s8.push_back(0xE0 | ((c >> 12) & 0x0F));
s8.push_back(0x80 | ((c >> 6) & 0x3F));
s8.push_back(0x80 | (c & 0x3F));
}
// 21-bit codepoint to 4-byte sequence
else if (c <= 0x10FFFF) {
s8.push_back(0xF0 | ((c >> 18) & 0x07));
s8.push_back(0x80 | ((c >> 12) & 0x3F));
s8.push_back(0x80 | ((c >> 6) & 0x3F));
s8.push_back(0x80 | (c & 0x3F));
}
// invalid codepoint
else {
// Ignore character
}
}

s8.shrink_to_fit();
return s8;
}


std::u32string UTF8toUTF32(const std::string& s8) {
std::u32string s32;
// Pre-allocate maximum possible size
s32.reserve(s8.size());

for (size_t i = 0; i < s8.size();) {
char32_t c32;
size_t size;

// Determine the number of bytes in the UTF-8 sequence
if ((s8[i] & 0x80) == 0x00) {
// 1-byte sequence
c32 = s8[i];
size = 1;
}
else if ((s8[i] & 0xE0) == 0xC0) {
// 2-byte sequence
if (i + 1 >= s8.size())
break;
c32 = (char32_t(s8[i] & 0x1F) << 6) |
(char32_t(s8[i + 1]) & 0x3F);
size = 2;
// Ignore overlong sequence
if (c32 < 0x80)
continue;
}
else if ((s8[i] & 0xF0) == 0xE0) {
// 3-byte sequence
if (i + 2 >= s8.size())
break;
c32 = (char32_t(s8[i] & 0x0F) << 12) |
((char32_t(s8[i + 1]) & 0x3F) << 6) |
(char32_t(s8[i + 2]) & 0x3F);
size = 3;
// Ignore overlong sequence
if (c32 < 0x800)
continue;
// Validate surrogate pairs range
if (c32 >= 0xD800 && c32 <= 0xDFFF)
continue;
}
else if ((s8[i] & 0xF8) == 0xF0) {
// 4-byte sequence
if (i + 3 >= s8.size())
break;
c32 = (char32_t(s8[i] & 0x07) << 18) |
((char32_t(s8[i + 1]) & 0x3F) << 12) |
((char32_t(s8[i + 2]) & 0x3F) << 6) |
(char32_t(s8[i + 3]) & 0x3F);
size = 4;

// Validate minimum value for 4-byte sequence
if (c32 < 0x10000)
continue;
// Validate maximum Unicode code point
if (c32 > 0x10FFFF)
continue;
}
else {
// Ignore invalid first byte
continue;
}

s32.push_back(c32);
i += size;
}

s32.shrink_to_fit();
return s32;
}


size_t UTF8NextCodepoint(const std::string& s8, size_t i) {
// Check out of bounds
if (i >= s8.size())
return s8.size();
// Check if null terminator
if (!s8[i]) return i;
// First byte signals size
// 0b0xxxxxxx
if ((s8[i] & 0x80) == 0x00) return std::min(i + 1, s8.size());
// Check for continuation byte 0b10xxxxxx
// if ((s8[1] & 0xc0) != 0x80) return 0;
// 0b110xxxxx
if ((s8[i] & 0xe0) == 0xc0) return std::min(i + 2, s8.size());
// if ((s8[2] & 0xc0) != 0x80) return 0;
// 0b1110xxxx
if ((s8[i] & 0xf0) == 0xe0) return std::min(i + 3, s8.size());
// if ((s8[3] & 0xc0) != 0x80) return 0;
// 0b11110xxx
if ((s8[i] & 0xf8) == 0xf0) return std::min(i + 4, s8.size());
// Invalid first UTF-8 byte
return i;
}


size_t UTF8PrevCodepoint(const std::string& s8, size_t i) {
if (i == 0) return 0;
// Check the previous 3 bytes
for (size_t j = 1; j <= 3; j++) {
i--;
if (i == 0) return 0;
// Check out of bounds
if (i >= s8.size())
return s8.size();
// Check for continuation byte 0b10xxxxxx
if ((s8[i] & 0xc0) != 0x80) return i;
}
return i;
}


#if defined ARCH_WIN
std::string UTF16toUTF8(const std::wstring& w) {
if (w.empty())


Loading…
Cancel
Save