/* ============================================================================== This file is part of the JUCE library. Copyright (c) 2015 - ROLI Ltd. Permission is granted to use this software under the terms of either: a) the GPL v2 (or any later version) b) the Affero GPL v3 Details of these licenses can be found at: www.gnu.org/licenses JUCE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ------------------------------------------------------------------------------ To release a closed-source product which uses JUCE, commercial licenses are available: visit www.juce.com for more information. ============================================================================== */ #ifndef JUCE_CPLUSPLUSCODETOKENISERFUNCTIONS_H_INCLUDED #define JUCE_CPLUSPLUSCODETOKENISERFUNCTIONS_H_INCLUDED //============================================================================== /** Class containing some basic functions for simple tokenising of C++ code. */ struct CppTokeniserFunctions { static bool isIdentifierStart (const juce_wchar c) noexcept { return CharacterFunctions::isLetter (c) || c == '_' || c == '@'; } static bool isIdentifierBody (const juce_wchar c) noexcept { return CharacterFunctions::isLetterOrDigit (c) || c == '_' || c == '@'; } static bool isReservedKeyword (String::CharPointerType token, const int tokenLength) noexcept { static const char* const keywords2Char[] = { "if", "do", "or", nullptr }; static const char* const keywords3Char[] = { "for", "int", "new", "try", "xor", "and", "asm", "not", nullptr }; static const char* const keywords4Char[] = { "bool", "void", "this", "true", "long", "else", "char", "enum", "case", "goto", "auto", nullptr }; static const char* const keywords5Char[] = { "float", "const", "while", "break", "false", "catch", "class", "bitor", "compl", "or_eq", "short", "throw", "union", "using", "final", nullptr }; static const char* const keywords6Char[] = { "return", "and_eq", "bitand", "delete", "double", "export", "extern", "friend", "inline", "not_eq", "public", "signed", "sizeof", "static", "struct", "switch", "typeid", "xor_eq", nullptr }; static const char* const keywords7Char[] = { "nullptr", "alignas", "alignof", "default", "mutable", "private", "typedef", "virtual", "wchar_t", "__cdecl", "_Pragma", "uint8_t", nullptr }; static const char* const keywordsOther[] = { "char16_t", "char32_t", "const_cast", "constexpr", "continue", "decltype", "dynamic_cast", "explicit", "namespace", "noexcept", "operator", "protected", "register", "reinterpret_cast", "static_assert", "static_cast", "template", "thread_local", "typename", "unsigned", "volatile", "@class", "@dynamic", "@end", "@implementation", "@interface", "@public", "@private", "@protected", "@property", "@synthesize", "__fastcall", "__stdcall", nullptr }; const char* const* k; switch (tokenLength) { case 2: k = keywords2Char; break; case 3: k = keywords3Char; break; case 4: k = keywords4Char; break; case 5: k = keywords5Char; break; case 6: k = keywords6Char; break; case 7: k = keywords7Char; break; default: if (tokenLength < 2 || tokenLength > 16) return false; k = keywordsOther; break; } for (int i = 0; k[i] != 0; ++i) if (token.compare (CharPointer_ASCII (k[i])) == 0) return true; return false; } template static int parseIdentifier (Iterator& source) noexcept { int tokenLength = 0; String::CharPointerType::CharType possibleIdentifier [100]; String::CharPointerType possible (possibleIdentifier); while (isIdentifierBody (source.peekNextChar())) { const juce_wchar c = source.nextChar(); if (tokenLength < 20) possible.write (c); ++tokenLength; } if (tokenLength > 1 && tokenLength <= 16) { possible.writeNull(); if (isReservedKeyword (String::CharPointerType (possibleIdentifier), tokenLength)) return CPlusPlusCodeTokeniser::tokenType_keyword; } return CPlusPlusCodeTokeniser::tokenType_identifier; } template static bool skipNumberSuffix (Iterator& source) { const juce_wchar c = source.peekNextChar(); if (c == 'l' || c == 'L' || c == 'u' || c == 'U') source.skip(); if (CharacterFunctions::isLetterOrDigit (source.peekNextChar())) return false; return true; } static bool isHexDigit (const juce_wchar c) noexcept { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } template static bool parseHexLiteral (Iterator& source) noexcept { if (source.peekNextChar() == '-') source.skip(); if (source.nextChar() != '0') return false; juce_wchar c = source.nextChar(); if (c != 'x' && c != 'X') return false; int numDigits = 0; while (isHexDigit (source.peekNextChar())) { ++numDigits; source.skip(); } if (numDigits == 0) return false; return skipNumberSuffix (source); } static bool isOctalDigit (const juce_wchar c) noexcept { return c >= '0' && c <= '7'; } template static bool parseOctalLiteral (Iterator& source) noexcept { if (source.peekNextChar() == '-') source.skip(); if (source.nextChar() != '0') return false; if (! isOctalDigit (source.nextChar())) return false; while (isOctalDigit (source.peekNextChar())) source.skip(); return skipNumberSuffix (source); } static bool isDecimalDigit (const juce_wchar c) noexcept { return c >= '0' && c <= '9'; } template static bool parseDecimalLiteral (Iterator& source) noexcept { if (source.peekNextChar() == '-') source.skip(); int numChars = 0; while (isDecimalDigit (source.peekNextChar())) { ++numChars; source.skip(); } if (numChars == 0) return false; return skipNumberSuffix (source); } template static bool parseFloatLiteral (Iterator& source) noexcept { if (source.peekNextChar() == '-') source.skip(); int numDigits = 0; while (isDecimalDigit (source.peekNextChar())) { source.skip(); ++numDigits; } const bool hasPoint = (source.peekNextChar() == '.'); if (hasPoint) { source.skip(); while (isDecimalDigit (source.peekNextChar())) { source.skip(); ++numDigits; } } if (numDigits == 0) return false; juce_wchar c = source.peekNextChar(); const bool hasExponent = (c == 'e' || c == 'E'); if (hasExponent) { source.skip(); c = source.peekNextChar(); if (c == '+' || c == '-') source.skip(); int numExpDigits = 0; while (isDecimalDigit (source.peekNextChar())) { source.skip(); ++numExpDigits; } if (numExpDigits == 0) return false; } c = source.peekNextChar(); if (c == 'f' || c == 'F') source.skip(); else if (! (hasExponent || hasPoint)) return false; return true; } template static int parseNumber (Iterator& source) { const Iterator original (source); if (parseFloatLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_float; source = original; if (parseHexLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer; source = original; if (parseOctalLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer; source = original; if (parseDecimalLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer; source = original; return CPlusPlusCodeTokeniser::tokenType_error; } template static void skipQuotedString (Iterator& source) noexcept { const juce_wchar quote = source.nextChar(); for (;;) { const juce_wchar c = source.nextChar(); if (c == quote || c == 0) break; if (c == '\\') source.skip(); } } template static void skipComment (Iterator& source) noexcept { bool lastWasStar = false; for (;;) { const juce_wchar c = source.nextChar(); if (c == 0 || (c == '/' && lastWasStar)) break; lastWasStar = (c == '*'); } } template static void skipPreprocessorLine (Iterator& source) noexcept { bool lastWasBackslash = false; for (;;) { const juce_wchar c = source.peekNextChar(); if (c == '"') { skipQuotedString (source); continue; } if (c == '/') { Iterator next (source); next.skip(); const juce_wchar c2 = next.peekNextChar(); if (c2 == '/' || c2 == '*') return; } if (c == 0) break; if (c == '\n' || c == '\r') { source.skipToEndOfLine(); if (lastWasBackslash) skipPreprocessorLine (source); break; } lastWasBackslash = (c == '\\'); source.skip(); } } template static void skipIfNextCharMatches (Iterator& source, const juce_wchar c) noexcept { if (source.peekNextChar() == c) source.skip(); } template static void skipIfNextCharMatches (Iterator& source, const juce_wchar c1, const juce_wchar c2) noexcept { const juce_wchar c = source.peekNextChar(); if (c == c1 || c == c2) source.skip(); } template static int readNextToken (Iterator& source) { source.skipWhitespace(); const juce_wchar firstChar = source.peekNextChar(); switch (firstChar) { case 0: break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '.': { int result = parseNumber (source); if (result == CPlusPlusCodeTokeniser::tokenType_error) { source.skip(); if (firstChar == '.') return CPlusPlusCodeTokeniser::tokenType_punctuation; } return result; } case ',': case ';': case ':': source.skip(); return CPlusPlusCodeTokeniser::tokenType_punctuation; case '(': case ')': case '{': case '}': case '[': case ']': source.skip(); return CPlusPlusCodeTokeniser::tokenType_bracket; case '"': case '\'': skipQuotedString (source); return CPlusPlusCodeTokeniser::tokenType_string; case '+': source.skip(); skipIfNextCharMatches (source, '+', '='); return CPlusPlusCodeTokeniser::tokenType_operator; case '-': { source.skip(); int result = parseNumber (source); if (result == CPlusPlusCodeTokeniser::tokenType_error) { skipIfNextCharMatches (source, '-', '='); return CPlusPlusCodeTokeniser::tokenType_operator; } return result; } case '*': case '%': case '=': case '!': source.skip(); skipIfNextCharMatches (source, '='); return CPlusPlusCodeTokeniser::tokenType_operator; case '/': { source.skip(); juce_wchar nextChar = source.peekNextChar(); if (nextChar == '/') { source.skipToEndOfLine(); return CPlusPlusCodeTokeniser::tokenType_comment; } if (nextChar == '*') { source.skip(); skipComment (source); return CPlusPlusCodeTokeniser::tokenType_comment; } if (nextChar == '=') source.skip(); return CPlusPlusCodeTokeniser::tokenType_operator; } case '?': case '~': source.skip(); return CPlusPlusCodeTokeniser::tokenType_operator; case '<': case '>': case '|': case '&': case '^': source.skip(); skipIfNextCharMatches (source, firstChar); skipIfNextCharMatches (source, '='); return CPlusPlusCodeTokeniser::tokenType_operator; case '#': skipPreprocessorLine (source); return CPlusPlusCodeTokeniser::tokenType_preprocessor; default: if (isIdentifierStart (firstChar)) return parseIdentifier (source); source.skip(); break; } return CPlusPlusCodeTokeniser::tokenType_error; } /** A class that can be passed to the CppTokeniserFunctions functions in order to parse a String. */ struct StringIterator { StringIterator (const String& s) noexcept : t (s.getCharPointer()), numChars (0) {} StringIterator (String::CharPointerType s) noexcept : t (s), numChars (0) {} juce_wchar nextChar() noexcept { if (isEOF()) return 0; ++numChars; return t.getAndAdvance(); } juce_wchar peekNextChar()noexcept { return *t; } void skip() noexcept { if (! isEOF()) { ++t; ++numChars; } } void skipWhitespace() noexcept { while (t.isWhitespace()) skip(); } void skipToEndOfLine() noexcept { while (*t != '\r' && *t != '\n' && *t != 0) skip(); } bool isEOF() const noexcept { return t.isEmpty(); } String::CharPointerType t; int numChars; }; //============================================================================== /** Takes a UTF8 string and writes it to a stream using standard C++ escape sequences for any non-ascii bytes. Although not strictly a tokenising function, this is still a function that often comes in handy when working with C++ code! Note that addEscapeChars() is easier to use than this function if you're working with Strings. @see addEscapeChars */ static void writeEscapeChars (OutputStream& out, const char* utf8, const int numBytesToRead, const int maxCharsOnLine, const bool breakAtNewLines, const bool replaceSingleQuotes, const bool allowStringBreaks) { int charsOnLine = 0; bool lastWasHexEscapeCode = false; for (int i = 0; i < numBytesToRead || numBytesToRead < 0; ++i) { const unsigned char c = (unsigned char) utf8[i]; bool startNewLine = false; switch (c) { case '\t': out << "\\t"; lastWasHexEscapeCode = false; charsOnLine += 2; break; case '\r': out << "\\r"; lastWasHexEscapeCode = false; charsOnLine += 2; break; case '\n': out << "\\n"; lastWasHexEscapeCode = false; charsOnLine += 2; startNewLine = breakAtNewLines; break; case '\\': out << "\\\\"; lastWasHexEscapeCode = false; charsOnLine += 2; break; case '\"': out << "\\\""; lastWasHexEscapeCode = false; charsOnLine += 2; break; case 0: if (numBytesToRead < 0) return; out << "\\0"; lastWasHexEscapeCode = true; charsOnLine += 2; break; case '\'': if (replaceSingleQuotes) { out << "\\\'"; lastWasHexEscapeCode = false; charsOnLine += 2; break; } // deliberate fall-through... default: if (c >= 32 && c < 127 && ! (lastWasHexEscapeCode // (have to avoid following a hex escape sequence with a valid hex digit) && CharacterFunctions::getHexDigitValue (c) >= 0)) { out << (char) c; lastWasHexEscapeCode = false; ++charsOnLine; } else if (allowStringBreaks && lastWasHexEscapeCode && c >= 32 && c < 127) { out << "\"\"" << (char) c; lastWasHexEscapeCode = false; charsOnLine += 3; } else { out << (c < 16 ? "\\x0" : "\\x") << String::toHexString ((int) c); lastWasHexEscapeCode = true; charsOnLine += 4; } break; } if ((startNewLine || (maxCharsOnLine > 0 && charsOnLine >= maxCharsOnLine)) && (numBytesToRead < 0 || i < numBytesToRead - 1)) { charsOnLine = 0; out << "\"" << newLine << "\""; lastWasHexEscapeCode = false; } } } /** Takes a string and returns a version of it where standard C++ escape sequences have been used to replace any non-ascii bytes. Although not strictly a tokenising function, this is still a function that often comes in handy when working with C++ code! @see writeEscapeChars */ static String addEscapeChars (const String& s) { MemoryOutputStream mo; writeEscapeChars (mo, s.toRawUTF8(), -1, -1, false, true, true); return mo.toString(); } }; #endif // JUCE_CPLUSPLUSCODETOKENISERFUNCTIONS_H_INCLUDED