Audio plugin host https://kx.studio/carla
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

CharPointer_UTF8.h 19KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. /*
  2. ==============================================================================
  3. This file is part of the Water library.
  4. Copyright (c) 2016 ROLI Ltd.
  5. Copyright (C) 2017 Filipe Coelho <falktx@falktx.com>
  6. Permission is granted to use this software under the terms of the ISC license
  7. http://www.isc.org/downloads/software-support-policy/isc-license/
  8. Permission to use, copy, modify, and/or distribute this software for any
  9. purpose with or without fee is hereby granted, provided that the above
  10. copyright notice and this permission notice appear in all copies.
  11. THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH REGARD
  12. TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  13. FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
  14. OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
  15. USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
  16. TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  17. OF THIS SOFTWARE.
  18. ==============================================================================
  19. */
  20. #ifndef WATER_CHARPOINTER_UTF8_H_INCLUDED
  21. #define WATER_CHARPOINTER_UTF8_H_INCLUDED
  22. #include "CharacterFunctions.h"
  23. #include "../memory/Atomic.h"
  24. #include "CarlaUtils.hpp"
  25. namespace water {
  26. class String;
  27. //==============================================================================
  28. /**
  29. Wraps a pointer to a null-terminated UTF-8 character string, and provides
  30. various methods to operate on the data.
  31. @see CharPointer_UTF16, CharPointer_UTF32
  32. */
  33. class CharPointer_UTF8
  34. {
  35. public:
  36. typedef char CharType;
  37. inline explicit CharPointer_UTF8 (const CharType* const rawPointer) noexcept
  38. : data (const_cast<CharType*> (rawPointer))
  39. {
  40. }
  41. inline CharPointer_UTF8 (const CharPointer_UTF8& other) noexcept
  42. : data (other.data)
  43. {
  44. }
  45. inline CharPointer_UTF8& operator= (CharPointer_UTF8 other) noexcept
  46. {
  47. data = other.data;
  48. return *this;
  49. }
  50. inline CharPointer_UTF8& operator= (const CharType* text) noexcept
  51. {
  52. data = const_cast<CharType*> (text);
  53. return *this;
  54. }
  55. /** This is a pointer comparison, it doesn't compare the actual text. */
  56. inline bool operator== (CharPointer_UTF8 other) const noexcept { return data == other.data; }
  57. inline bool operator!= (CharPointer_UTF8 other) const noexcept { return data != other.data; }
  58. inline bool operator<= (CharPointer_UTF8 other) const noexcept { return data <= other.data; }
  59. inline bool operator< (CharPointer_UTF8 other) const noexcept { return data < other.data; }
  60. inline bool operator>= (CharPointer_UTF8 other) const noexcept { return data >= other.data; }
  61. inline bool operator> (CharPointer_UTF8 other) const noexcept { return data > other.data; }
  62. /** Returns the address that this pointer is pointing to. */
  63. inline CharType* getAddress() const noexcept { return data; }
  64. /** Returns the address that this pointer is pointing to. */
  65. inline operator const CharType*() const noexcept { return data; }
  66. /** Returns true if this pointer is pointing to a null character. */
  67. inline bool isEmpty() const noexcept { return *data == 0; }
  68. /** Returns the unicode character that this pointer is pointing to. */
  69. water_uchar operator*() const noexcept
  70. {
  71. const signed char byte = (signed char) *data;
  72. if (byte >= 0)
  73. return (water_uchar) (uint8) byte;
  74. uint32 n = (uint32) (uint8) byte;
  75. uint32 mask = 0x7f;
  76. uint32 bit = 0x40;
  77. int numExtraValues = 0;
  78. while ((n & bit) != 0 && bit > 0x8)
  79. {
  80. mask >>= 1;
  81. ++numExtraValues;
  82. bit >>= 1;
  83. }
  84. n &= mask;
  85. for (int i = 1; i <= numExtraValues; ++i)
  86. {
  87. const uint32 nextByte = (uint32) (uint8) data[i];
  88. if ((nextByte & 0xc0) != 0x80)
  89. break;
  90. n <<= 6;
  91. n |= (nextByte & 0x3f);
  92. }
  93. return (water_uchar) n;
  94. }
  95. /** Moves this pointer along to the next character in the string. */
  96. CharPointer_UTF8& operator++() noexcept
  97. {
  98. wassert (*data != 0); // trying to advance past the end of the string?
  99. const signed char n = (signed char) *data++;
  100. if (n < 0)
  101. {
  102. water_uchar bit = 0x40;
  103. while ((static_cast<unsigned char>(n) & bit) != 0 && bit > 0x8)
  104. {
  105. ++data;
  106. bit >>= 1;
  107. }
  108. }
  109. return *this;
  110. }
  111. /** Moves this pointer back to the previous character in the string. */
  112. CharPointer_UTF8& operator--() noexcept
  113. {
  114. int count = 0;
  115. while ((*--data & 0xc0) == 0x80 && ++count < 4)
  116. {}
  117. return *this;
  118. }
  119. /** Returns the character that this pointer is currently pointing to, and then
  120. advances the pointer to point to the next character. */
  121. water_uchar getAndAdvance() noexcept
  122. {
  123. const signed char byte = (signed char) *data++;
  124. if (byte >= 0)
  125. return (water_uchar) (uint8) byte;
  126. uint32 n = (uint32) (uint8) byte;
  127. uint32 mask = 0x7f;
  128. uint32 bit = 0x40;
  129. int numExtraValues = 0;
  130. while ((n & bit) != 0 && bit > 0x8)
  131. {
  132. mask >>= 1;
  133. ++numExtraValues;
  134. bit >>= 1;
  135. }
  136. n &= mask;
  137. while (--numExtraValues >= 0)
  138. {
  139. const uint32 nextByte = (uint32) (uint8) *data;
  140. if ((nextByte & 0xc0) != 0x80)
  141. break;
  142. ++data;
  143. n <<= 6;
  144. n |= (nextByte & 0x3f);
  145. }
  146. return (water_uchar) n;
  147. }
  148. /** Moves this pointer along to the next character in the string. */
  149. CharPointer_UTF8 operator++ (int) noexcept
  150. {
  151. CharPointer_UTF8 temp (*this);
  152. ++*this;
  153. return temp;
  154. }
  155. /** Moves this pointer forwards by the specified number of characters. */
  156. void operator+= (int numToSkip) noexcept
  157. {
  158. if (numToSkip < 0)
  159. {
  160. while (++numToSkip <= 0)
  161. --*this;
  162. }
  163. else
  164. {
  165. while (--numToSkip >= 0)
  166. ++*this;
  167. }
  168. }
  169. /** Moves this pointer backwards by the specified number of characters. */
  170. void operator-= (int numToSkip) noexcept
  171. {
  172. operator+= (-numToSkip);
  173. }
  174. /** Returns the character at a given character index from the start of the string. */
  175. water_uchar operator[] (int characterIndex) const noexcept
  176. {
  177. CharPointer_UTF8 p (*this);
  178. p += characterIndex;
  179. return *p;
  180. }
  181. /** Returns a pointer which is moved forwards from this one by the specified number of characters. */
  182. CharPointer_UTF8 operator+ (int numToSkip) const noexcept
  183. {
  184. CharPointer_UTF8 p (*this);
  185. p += numToSkip;
  186. return p;
  187. }
  188. /** Returns a pointer which is moved backwards from this one by the specified number of characters. */
  189. CharPointer_UTF8 operator- (int numToSkip) const noexcept
  190. {
  191. CharPointer_UTF8 p (*this);
  192. p += -numToSkip;
  193. return p;
  194. }
  195. /** Returns the number of characters in this string. */
  196. size_t length() const noexcept
  197. {
  198. const CharType* d = data;
  199. size_t count = 0;
  200. for (;;)
  201. {
  202. const uint32 n = (uint32) (uint8) *d++;
  203. if ((n & 0x80) != 0)
  204. {
  205. while ((*d & 0xc0) == 0x80)
  206. ++d;
  207. }
  208. else if (n == 0)
  209. break;
  210. ++count;
  211. }
  212. return count;
  213. }
  214. /** Returns the number of characters in this string, or the given value, whichever is lower. */
  215. size_t lengthUpTo (const size_t maxCharsToCount) const noexcept
  216. {
  217. return CharacterFunctions::lengthUpTo (*this, maxCharsToCount);
  218. }
  219. /** Returns the number of characters in this string, or up to the given end pointer, whichever is lower. */
  220. size_t lengthUpTo (const CharPointer_UTF8 end) const noexcept
  221. {
  222. return CharacterFunctions::lengthUpTo (*this, end);
  223. }
  224. /** Returns the number of bytes that are used to represent this string.
  225. This includes the terminating null character.
  226. */
  227. size_t sizeInBytes() const noexcept
  228. {
  229. wassert (data != nullptr);
  230. return strlen (data) + 1;
  231. }
  232. /** Returns the number of bytes that would be needed to represent the given
  233. unicode character in this encoding format.
  234. */
  235. static size_t getBytesRequiredFor (const water_uchar charToWrite) noexcept
  236. {
  237. size_t num = 1;
  238. const uint32 c = (uint32) charToWrite;
  239. if (c >= 0x80)
  240. {
  241. ++num;
  242. if (c >= 0x800)
  243. {
  244. ++num;
  245. if (c >= 0x10000)
  246. ++num;
  247. }
  248. }
  249. return num;
  250. }
  251. /** Returns the number of bytes that would be needed to represent the given
  252. string in this encoding format.
  253. The value returned does NOT include the terminating null character.
  254. */
  255. template <class CharPointer>
  256. static size_t getBytesRequiredFor (CharPointer text) noexcept
  257. {
  258. size_t count = 0;
  259. while (water_uchar n = text.getAndAdvance())
  260. count += getBytesRequiredFor (n);
  261. return count;
  262. }
  263. /** Returns a pointer to the null character that terminates this string. */
  264. CharPointer_UTF8 findTerminatingNull() const noexcept
  265. {
  266. return CharPointer_UTF8 (data + strlen (data));
  267. }
  268. /** Writes a unicode character to this string, and advances this pointer to point to the next position. */
  269. void write (const water_uchar charToWrite) noexcept
  270. {
  271. const uint32 c = (uint32) charToWrite;
  272. if (c >= 0x80)
  273. {
  274. int numExtraBytes = 1;
  275. if (c >= 0x800)
  276. {
  277. ++numExtraBytes;
  278. if (c >= 0x10000)
  279. ++numExtraBytes;
  280. }
  281. *data++ = (CharType) ((uint32) (0xff << (7 - numExtraBytes)) | (c >> (numExtraBytes * 6)));
  282. while (--numExtraBytes >= 0)
  283. *data++ = (CharType) (0x80 | (0x3f & (c >> (numExtraBytes * 6))));
  284. }
  285. else
  286. {
  287. *data++ = (CharType) c;
  288. }
  289. }
  290. /** Writes a null character to this string (leaving the pointer's position unchanged). */
  291. inline void writeNull() const noexcept
  292. {
  293. *data = 0;
  294. }
  295. /** Copies a source string to this pointer, advancing this pointer as it goes. */
  296. template <typename CharPointer>
  297. void writeAll (const CharPointer src) noexcept
  298. {
  299. CharacterFunctions::copyAll (*this, src);
  300. }
  301. /** Copies a source string to this pointer, advancing this pointer as it goes. */
  302. void writeAll (const CharPointer_UTF8 src) noexcept
  303. {
  304. const CharType* s = src.data;
  305. while ((*data = *s) != 0)
  306. {
  307. ++data;
  308. ++s;
  309. }
  310. }
  311. /** Copies a source string to this pointer, advancing this pointer as it goes.
  312. The maxDestBytes parameter specifies the maximum number of bytes that can be written
  313. to the destination buffer before stopping.
  314. */
  315. template <typename CharPointer>
  316. size_t writeWithDestByteLimit (const CharPointer src, const size_t maxDestBytes) noexcept
  317. {
  318. return CharacterFunctions::copyWithDestByteLimit (*this, src, maxDestBytes);
  319. }
  320. /** Copies a source string to this pointer, advancing this pointer as it goes.
  321. The maxChars parameter specifies the maximum number of characters that can be
  322. written to the destination buffer before stopping (including the terminating null).
  323. */
  324. template <typename CharPointer>
  325. void writeWithCharLimit (const CharPointer src, const int maxChars) noexcept
  326. {
  327. CharacterFunctions::copyWithCharLimit (*this, src, maxChars);
  328. }
  329. /** Compares this string with another one. */
  330. template <typename CharPointer>
  331. int compare (const CharPointer other) const noexcept
  332. {
  333. return CharacterFunctions::compare (*this, other);
  334. }
  335. /** Compares this string with another one, up to a specified number of characters. */
  336. template <typename CharPointer>
  337. int compareUpTo (const CharPointer other, const int maxChars) const noexcept
  338. {
  339. return CharacterFunctions::compareUpTo (*this, other, maxChars);
  340. }
  341. /** Compares this string with another one. */
  342. template <typename CharPointer>
  343. int compareIgnoreCase (const CharPointer other) const noexcept
  344. {
  345. return CharacterFunctions::compareIgnoreCase (*this, other);
  346. }
  347. /** Compares this string with another one. */
  348. int compareIgnoreCase (const CharPointer_UTF8 other) const noexcept
  349. {
  350. return CharacterFunctions::compareIgnoreCase (*this, other);
  351. }
  352. /** Compares this string with another one, up to a specified number of characters. */
  353. template <typename CharPointer>
  354. int compareIgnoreCaseUpTo (const CharPointer other, const int maxChars) const noexcept
  355. {
  356. return CharacterFunctions::compareIgnoreCaseUpTo (*this, other, maxChars);
  357. }
  358. /** Returns the character index of a substring, or -1 if it isn't found. */
  359. template <typename CharPointer>
  360. int indexOf (const CharPointer stringToFind) const noexcept
  361. {
  362. return CharacterFunctions::indexOf (*this, stringToFind);
  363. }
  364. /** Returns the character index of a unicode character, or -1 if it isn't found. */
  365. int indexOf (const water_uchar charToFind) const noexcept
  366. {
  367. return CharacterFunctions::indexOfChar (*this, charToFind);
  368. }
  369. /** Returns the character index of a unicode character, or -1 if it isn't found. */
  370. int indexOf (const water_uchar charToFind, const bool ignoreCase) const noexcept
  371. {
  372. return ignoreCase ? CharacterFunctions::indexOfCharIgnoreCase (*this, charToFind)
  373. : CharacterFunctions::indexOfChar (*this, charToFind);
  374. }
  375. /** Returns true if the first character of this string is whitespace. */
  376. bool isWhitespace() const noexcept { const CharType c = *data; return c == ' ' || (c <= 13 && c >= 9); }
  377. /** Returns true if the first character of this string is a digit. */
  378. bool isDigit() const noexcept { const CharType c = *data; return c >= '0' && c <= '9'; }
  379. /** Returns true if the first character of this string is a letter. */
  380. bool isLetter() const noexcept { return CharacterFunctions::isLetter (operator*()) != 0; }
  381. /** Returns true if the first character of this string is a letter or digit. */
  382. bool isLetterOrDigit() const noexcept { return CharacterFunctions::isLetterOrDigit (operator*()) != 0; }
  383. /** Returns true if the first character of this string is upper-case. */
  384. bool isUpperCase() const noexcept { return CharacterFunctions::isUpperCase (operator*()) != 0; }
  385. /** Returns true if the first character of this string is lower-case. */
  386. bool isLowerCase() const noexcept { return CharacterFunctions::isLowerCase (operator*()) != 0; }
  387. /** Returns an upper-case version of the first character of this string. */
  388. water_uchar toUpperCase() const noexcept { return CharacterFunctions::toUpperCase (operator*()); }
  389. /** Returns a lower-case version of the first character of this string. */
  390. water_uchar toLowerCase() const noexcept { return CharacterFunctions::toLowerCase (operator*()); }
  391. /** Parses this string as a 32-bit integer. */
  392. int getIntValue32() const noexcept { return atoi (data); }
  393. /** Parses this string as a 64-bit integer. */
  394. int64 getIntValue64() const noexcept
  395. {
  396. return atoll (data);
  397. #if 0
  398. return CharacterFunctions::getIntValue <int64, CharPointer_UTF8> (*this);
  399. #endif
  400. }
  401. /** Parses this string as a floating point double. */
  402. double getDoubleValue() const noexcept { return CharacterFunctions::getDoubleValue (*this); }
  403. /** Returns the first non-whitespace character in the string. */
  404. CharPointer_UTF8 findEndOfWhitespace() const noexcept { return CharacterFunctions::findEndOfWhitespace (*this); }
  405. /** Returns true if the given unicode character can be represented in this encoding. */
  406. static bool canRepresent (water_uchar character) noexcept
  407. {
  408. return ((unsigned int) character) < (unsigned int) 0x10ffff;
  409. }
  410. /** Returns true if this data contains a valid string in this encoding. */
  411. static bool isValidString (const CharType* dataToTest, int maxBytesToRead)
  412. {
  413. while (--maxBytesToRead >= 0 && *dataToTest != 0)
  414. {
  415. const signed char byte = (signed char) *dataToTest++;
  416. if (byte < 0)
  417. {
  418. int bit = 0x40;
  419. int numExtraValues = 0;
  420. while ((byte & bit) != 0)
  421. {
  422. if (bit < 8)
  423. return false;
  424. ++numExtraValues;
  425. bit >>= 1;
  426. if (bit == 8 && (numExtraValues > maxBytesToRead
  427. || *CharPointer_UTF8 (dataToTest - 1) > 0x10ffff))
  428. return false;
  429. }
  430. if (numExtraValues == 0)
  431. return false;
  432. maxBytesToRead -= numExtraValues;
  433. if (maxBytesToRead < 0)
  434. return false;
  435. while (--numExtraValues >= 0)
  436. if ((*dataToTest++ & 0xc0) != 0x80)
  437. return false;
  438. }
  439. }
  440. return true;
  441. }
  /** The three bytes, in stream order, that make up the byte-order mark (BOM)
      of a UTF-8 stream. */
  enum
  {
      byteOrderMark1 = 0xEF,  // first byte
      byteOrderMark2 = 0xBB,  // second byte
      byteOrderMark3 = 0xBF   // third byte
  };
  449. /** Returns true if the first three bytes in this pointer are the UTF8 byte-order mark (BOM).
  450. The pointer must not be null, and must point to at least 3 valid bytes.
  451. */
  452. static bool isByteOrderMark (const void* possibleByteOrder) noexcept
  453. {
  454. wassert (possibleByteOrder != nullptr);
  455. const uint8* const c = static_cast<const uint8*> (possibleByteOrder);
  456. return c[0] == (uint8) byteOrderMark1
  457. && c[1] == (uint8) byteOrderMark2
  458. && c[2] == (uint8) byteOrderMark3;
  459. }
  460. private:
  461. CharType* data;
  462. friend class String;
  463. };
  464. }
  465. #endif // WATER_CHARPOINTER_UTF8_H_INCLUDED