|  |  | @@ -322,7 +322,7 @@ std::u32string UTF8toUTF32(const std::string& s8) { | 
		
	
		
			
			|  |  |  | // Ignore overlong sequence | 
		
	
		
			
			|  |  |  | if (c32 < 0x800) | 
		
	
		
			
			|  |  |  | continue; | 
		
	
		
			
			|  |  |  | // Validate surrogate pairs range | 
		
	
		
			
			|  |  |  | // Ignore surrogate pairs | 
		
	
		
			
			|  |  |  | if (c32 >= 0xD800 && c32 <= 0xDFFF) | 
		
	
		
			
			|  |  |  | continue; | 
		
	
		
			
			|  |  |  | } | 
		
	
	
		
			
				|  |  | @@ -336,10 +336,10 @@ std::u32string UTF8toUTF32(const std::string& s8) { | 
		
	
		
			
			|  |  |  | (char32_t(s8[i + 3]) & 0x3F); | 
		
	
		
			
			|  |  |  | size = 4; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | // Validate minimum value for 4-byte sequence | 
		
	
		
			
			|  |  |  | // Ignore overlong sequence | 
		
	
		
			
			|  |  |  | if (c32 < 0x10000) | 
		
	
		
			
			|  |  |  | continue; | 
		
	
		
			
			|  |  |  | // Validate maximum Unicode code point | 
		
	
		
			
			|  |  |  | // Ignore codepoints beyond Unicode maximum | 
		
	
		
			
			|  |  |  | if (c32 > 0x10FFFF) | 
		
	
		
			
			|  |  |  | continue; | 
		
	
		
			
			|  |  |  | } | 
		
	
	
		
			
				|  |  | @@ -357,27 +357,32 @@ std::u32string UTF8toUTF32(const std::string& s8) { | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			// Returns the number of bytes occupied by a UTF-8 encoded code point,
			// judged solely from its first (lead) byte.
			//
			// c: the lead byte of the sequence.
			//
			// Returns 1 to 4 for a recognised lead byte. Returns 0 when the byte is a
			// null terminator or is not a valid lead byte (for example a continuation
			// byte 0b10xxxxxx, or 0b11111xxx).
			static size_t UTF8CodepointSize(char c) {
				// Classify the unsigned value so the masks below do not depend on
				// whether plain char is signed on the target platform.
				const unsigned char lead = static_cast<unsigned char>(c);

				// Null terminator: nothing to advance over
				if (lead == 0) return 0;

				// First byte signals size
				// 0b0xxxxxxx
				if ((lead & 0x80) == 0x00) return 1;
				// 0b110xxxxx
				if ((lead & 0xe0) == 0xc0) return 2;
				// 0b1110xxxx
				if ((lead & 0xf0) == 0xe0) return 3;
				// 0b11110xxx
				if ((lead & 0xf8) == 0xf0) return 4;

				// Invalid first UTF-8 byte
				return 0;
			}
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | size_t UTF8NextCodepoint(const std::string& s8, size_t i) { | 
		
	
		
			
			|  |  |  | // Check out of bounds | 
		
	
		
			
			|  |  |  | if (i >= s8.size()) | 
		
	
		
			
			|  |  |  | return s8.size(); | 
		
	
		
			
			|  |  |  | // Check if null terminator | 
		
	
		
			
			|  |  |  | if (!s8[i]) return i; | 
		
	
		
			
			|  |  |  | // First byte signals size | 
		
	
		
			
			|  |  |  | // 0b0xxxxxxx | 
		
	
		
			
			|  |  |  | if ((s8[i] & 0x80) == 0x00) return std::min(i + 1, s8.size()); | 
		
	
		
			
			|  |  |  | size_t size = UTF8CodepointSize(s8[i]); | 
		
	
		
			
			|  |  |  | // Check for continuation byte 0b10xxxxxx | 
		
	
		
			
			|  |  |  | // if ((s8[1] & 0xc0) != 0x80) return 0; | 
		
	
		
			
			|  |  |  | // 0b110xxxxx | 
		
	
		
			
			|  |  |  | if ((s8[i] & 0xe0) == 0xc0) return std::min(i + 2, s8.size()); | 
		
	
		
			
			|  |  |  | // if ((s8[2] & 0xc0) != 0x80) return 0; | 
		
	
		
			
			|  |  |  | // 0b1110xxxx | 
		
	
		
			
			|  |  |  | if ((s8[i] & 0xf0) == 0xe0) return std::min(i + 3, s8.size()); | 
		
	
		
			
			|  |  |  | // if ((s8[3] & 0xc0) != 0x80) return 0; | 
		
	
		
			
			|  |  |  | // 0b11110xxx | 
		
	
		
			
			|  |  |  | if ((s8[i] & 0xf8) == 0xf0) return std::min(i + 4, s8.size()); | 
		
	
		
			
			|  |  |  | // Invalid first UTF-8 byte | 
		
	
		
			
			|  |  |  | return i; | 
		
	
		
			
			|  |  |  | return std::min(i + size, s8.size()); | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | 
 | 
		
	
	
		
			
				|  |  | 
 |