@@ -16,6 +16,7 @@ The wrapper template function below automatically converts all arguments (includ | |||||
// The printf format attribute lets the compiler check the variadic arguments against `format`.
__attribute__((format(printf, 1, 2)))
std::string f(const char* format, ...);
// Variant taking a va_list instead of variadic arguments.
std::string fV(const char* format, va_list args);
// Converts std::string arguments of f() to `const char*`.
// Generic case: the argument is returned unchanged (by value).
template <typename T>
T convertFArg(const T& t) {
	return t;
}
@@ -101,14 +102,30 @@ std::string UTF32toUTF8(const std::u32string& s32); | |||||
Skips invalid, overlong, and surrogate pair UTF-8 sequences. | Skips invalid, overlong, and surrogate pair UTF-8 sequences. | ||||
*/ | */ | ||||
std::u32string UTF8toUTF32(const std::string& s8); | std::u32string UTF8toUTF32(const std::string& s8); | ||||
/** Finds the byte position of the next codepoint in a valid UTF-8 string.
pos is the byte position of the start of a codepoint.
Returns s8.size() if given codepoint is the last, or if pos is out of bounds.
*/
size_t UTF8NextCodepoint(const std::string& s8, size_t pos);
/** Finds the byte position of the previous codepoint in a valid UTF-8 string.
pos is the byte position of the start of a codepoint.
Returns 0 if given codepoint is the first.
*/
size_t UTF8PrevCodepoint(const std::string& s8, size_t pos);
/** Returns the number of codepoints in a valid UTF-8 string.
O(len) time
*/
size_t UTF8Length(const std::string& s8);
/** Returns a codepoint's index in a valid UTF-8 string.
pos is the byte position of the start of a codepoint.
Positions beyond the end of the string are treated as s8.size().
O(pos) time
*/
size_t UTF8CodepointIndex(const std::string& s8, size_t pos);
/** Returns a codepoint's byte position in a valid UTF-8 string.
index is the zero-based codepoint index.
Returns s8.size() if index is beyond the last codepoint.
O(index) time
*/
size_t UTF8CodepointPos(const std::string& s8, size_t index);
#if defined ARCH_WIN | #if defined ARCH_WIN | ||||
/** Performs a Unicode string conversion from UTF-16 to UTF-8. | /** Performs a Unicode string conversion from UTF-16 to UTF-8. | ||||
@@ -421,38 +421,74 @@ static size_t UTF8CodepointSize(char c) { | |||||
} | } | ||||
size_t UTF8NextCodepoint(const std::string& s8, size_t i) { | |||||
size_t UTF8NextCodepoint(const std::string& s8, size_t pos) { | |||||
// Check out of bounds | // Check out of bounds | ||||
if (i >= s8.size()) | |||||
if (pos >= s8.size()) | |||||
return s8.size(); | return s8.size(); | ||||
size_t size = UTF8CodepointSize(s8[i]); | |||||
size_t size = UTF8CodepointSize(s8[pos]); | |||||
// Check for continuation byte 0b10xxxxxx | // Check for continuation byte 0b10xxxxxx | ||||
// if ((s8[1] & 0xc0) != 0x80) return 0; | // if ((s8[1] & 0xc0) != 0x80) return 0; | ||||
// if ((s8[2] & 0xc0) != 0x80) return 0; | // if ((s8[2] & 0xc0) != 0x80) return 0; | ||||
// if ((s8[3] & 0xc0) != 0x80) return 0; | // if ((s8[3] & 0xc0) != 0x80) return 0; | ||||
return std::min(i + size, s8.size()); | |||||
return std::min(pos + size, s8.size()); | |||||
} | } | ||||
/** Finds the byte index of the front of a codepoint by reversing until a non-continuation byte is found.
Returns s8.size() if `pos` is out of bounds.
Returns 0 if only continuation bytes are found back to the start of the string.
*/
static size_t UTF8StartCodepoint(const std::string& s8, size_t pos) {
	// Check out of bounds
	if (pos >= s8.size())
		return s8.size();

	while (pos > 0) {
		// Stop at the first byte that is not a continuation byte 0b10xxxxxx
		if ((s8[pos] & 0xc0) != 0x80)
			break;
		pos--;
	}
	return pos;
}
size_t UTF8PrevCodepoint(const std::string& s8, size_t i) { | |||||
if (i == 0) | |||||
size_t UTF8PrevCodepoint(const std::string& s8, size_t pos) { | |||||
if (pos == 0) | |||||
return 0; | return 0; | ||||
return UTF8StartCodepoint(s8, i - 1); | |||||
return UTF8StartCodepoint(s8, pos - 1); | |||||
} | |||||
size_t UTF8Length(const std::string& s8) { | |||||
return UTF8CodepointIndex(s8, s8.size()); | |||||
} | |||||
size_t UTF8CodepointIndex(const std::string& s8, size_t endPos) { | |||||
size_t pos = 0; | |||||
size_t index = 0; | |||||
endPos = std::min(endPos, s8.size()); | |||||
while (pos < endPos) { | |||||
size_t newPos = UTF8NextCodepoint(s8, pos); | |||||
// Check if codepoint is invalid | |||||
if (pos == newPos) | |||||
return index; | |||||
pos = newPos; | |||||
index++; | |||||
} | |||||
return index; | |||||
} | |||||
size_t UTF8CodepointPos(const std::string& s8, size_t endIndex) { | |||||
size_t pos = 0; | |||||
size_t index = 0; | |||||
while (index < endIndex && pos < s8.size()) { | |||||
size_t newPos = UTF8NextCodepoint(s8, pos); | |||||
// Check if codepoint is invalid | |||||
if (pos == newPos) | |||||
return pos; | |||||
pos = newPos; | |||||
index++; | |||||
} | |||||
return pos; | |||||
} | } | ||||