Browse Source

Add string::UTF8Length(), UTF8CodepointIndex(), and UTF8CodepointPos(). Revise string::UTF8* docs.

tags/v2.6.1
Andrew Belt 2 months ago
parent
commit
12430710ba
2 changed files with 72 additions and 19 deletions
  1. +23
    -6
      include/string.hpp
  2. +49
    -13
      src/string.cpp

+ 23
- 6
include/string.hpp View File

@@ -16,6 +16,7 @@ The wrapper template function below automatically converts all arguments (includ
__attribute__((format(printf, 1, 2))) __attribute__((format(printf, 1, 2)))
std::string f(const char* format, ...); std::string f(const char* format, ...);
std::string fV(const char* format, va_list args); std::string fV(const char* format, va_list args);

// Converts std::string arguments of f() to `const char*` // Converts std::string arguments of f() to `const char*`
template<typename T> template<typename T>
T convertFArg(const T& t) {return t;} T convertFArg(const T& t) {return t;}
@@ -101,14 +102,30 @@ std::string UTF32toUTF8(const std::u32string& s32);
Skips invalid, overlong, and surrogate pair UTF-8 sequences. Skips invalid, overlong, and surrogate pair UTF-8 sequences.
*/ */
std::u32string UTF8toUTF32(const std::string& s8); std::u32string UTF8toUTF32(const std::string& s8);
/** Finds the byte index of the next codepoint in a valid UTF-8 string.
i must be the start of a codepoint.
/** Finds the byte position of the next codepoint in a valid UTF-8 string.
`pos` is the byte position of the start of a codepoint.
Returns s8.size() if the given codepoint is the last, or if `pos` is at or beyond the end of the string.
The result never exceeds s8.size().
*/
size_t UTF8NextCodepoint(const std::string& s8, size_t pos);
/** Finds the byte position of the previous codepoint in a valid UTF-8 string.
`pos` is the byte position of the start of a codepoint.
Returns 0 if the given codepoint is the first (i.e. `pos` is 0).
*/
size_t UTF8PrevCodepoint(const std::string& s8, size_t pos);
/** Returns the number of codepoints in a valid UTF-8 string.
Equivalent to UTF8CodepointIndex(s8, s8.size()).
O(len) time
*/
size_t UTF8Length(const std::string& s8);
/** Returns a codepoint's index in a valid UTF-8 string.
pos is the byte position of the start of a codepoint.
O(pos) time
*/ */
size_t UTF8NextCodepoint(const std::string& s8, size_t i);
/** Finds the byte index of the previous codepoint in a valid UTF-8 string.
i must be the start of a codepoint.
size_t UTF8CodepointIndex(const std::string& s8, size_t pos);
/** Returns a codepoint's byte position in a valid UTF-8 string.
Returns s8.size() if index is beyond the last codepoint.
O(index) time
*/ */
size_t UTF8PrevCodepoint(const std::string& s8, size_t i);
size_t UTF8CodepointPos(const std::string& s8, size_t index);


#if defined ARCH_WIN #if defined ARCH_WIN
/** Performs a Unicode string conversion from UTF-16 to UTF-8. /** Performs a Unicode string conversion from UTF-16 to UTF-8.


+ 49
- 13
src/string.cpp View File

@@ -421,38 +421,74 @@ static size_t UTF8CodepointSize(char c) {
} }




size_t UTF8NextCodepoint(const std::string& s8, size_t i) {
size_t UTF8NextCodepoint(const std::string& s8, size_t pos) {
// Check out of bounds // Check out of bounds
if (i >= s8.size())
if (pos >= s8.size())
return s8.size(); return s8.size();
size_t size = UTF8CodepointSize(s8[i]);
size_t size = UTF8CodepointSize(s8[pos]);
// Check for continuation byte 0b10xxxxxx // Check for continuation byte 0b10xxxxxx
// if ((s8[1] & 0xc0) != 0x80) return 0; // if ((s8[1] & 0xc0) != 0x80) return 0;
// if ((s8[2] & 0xc0) != 0x80) return 0; // if ((s8[2] & 0xc0) != 0x80) return 0;
// if ((s8[3] & 0xc0) != 0x80) return 0; // if ((s8[3] & 0xc0) != 0x80) return 0;
return std::min(i + size, s8.size());
return std::min(pos + size, s8.size());
} }




/** Finds the byte index of the front of a codepoint by reversing until a non-continuation byte is found. */ /** Finds the byte index of the front of a codepoint by reversing until a non-continuation byte is found. */
static size_t UTF8StartCodepoint(const std::string& s8, size_t i) {
static size_t UTF8StartCodepoint(const std::string& s8, size_t pos) {
// Check out of bounds // Check out of bounds
if (i >= s8.size())
if (pos >= s8.size())
return s8.size(); return s8.size();
while (i > 0) {
while (pos > 0) {
// Check for continuation byte 0b10xxxxxx // Check for continuation byte 0b10xxxxxx
if ((s8[i] & 0xc0) != 0x80)
if ((s8[pos] & 0xc0) != 0x80)
break; break;
i--;
pos--;
} }
return i;
return pos;
} }




size_t UTF8PrevCodepoint(const std::string& s8, size_t i) {
if (i == 0)
size_t UTF8PrevCodepoint(const std::string& s8, size_t pos) {
if (pos == 0)
return 0; return 0;
return UTF8StartCodepoint(s8, i - 1);
return UTF8StartCodepoint(s8, pos - 1);
}


/** Counts the codepoints in `s8` by asking for the codepoint index of its end byte position. */
size_t UTF8Length(const std::string& s8) {
	// One past the last byte; every codepoint in the string begins before it
	const size_t endPos = s8.size();
	return UTF8CodepointIndex(s8, endPos);
}


/** Counts the codepoints that begin before byte position `endPos`.
`endPos` is clamped to s8.size().
Counting stops early if UTF8NextCodepoint() fails to advance.
*/
size_t UTF8CodepointIndex(const std::string& s8, size_t endPos) {
	// Never scan past the end of the string
	const size_t limit = std::min(endPos, s8.size());
	size_t count = 0;
	for (size_t cursor = 0; cursor < limit; count++) {
		const size_t next = UTF8NextCodepoint(s8, cursor);
		// No forward progress means the codepoint is invalid, so report what was counted so far
		if (next == cursor)
			break;
		cursor = next;
	}
	return count;
}


/** Walks forward `endIndex` codepoints from the start of `s8` and returns the byte position reached.
Stops at s8.size() if the string has fewer codepoints.
Stops early if UTF8NextCodepoint() fails to advance.
*/
size_t UTF8CodepointPos(const std::string& s8, size_t endIndex) {
	size_t cursor = 0;
	// `remaining` is the number of codepoints still to step over
	for (size_t remaining = endIndex; remaining > 0 && cursor < s8.size(); remaining--) {
		const size_t next = UTF8NextCodepoint(s8, cursor);
		// No forward progress means the codepoint is invalid, so stay at the current position
		if (next == cursor)
			break;
		cursor = next;
	}
	return cursor;
}






Loading…
Cancel
Save