/* ============================================================================== This file is part of the JUCE 6 technical preview. Copyright (c) 2020 - Raw Material Software Limited You may use this code under the terms of the GPL v3 (see www.gnu.org/licenses). For this technical preview, this file is not subject to commercial licensing. JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE DISCLAIMED. ============================================================================== */ namespace juce { //============================================================================== /** Class containing some basic functions for simple tokenising of C++ code. @tags{GUI} */ struct CppTokeniserFunctions { static bool isIdentifierStart (const juce_wchar c) noexcept { return CharacterFunctions::isLetter (c) || c == '_' || c == '@'; } static bool isIdentifierBody (const juce_wchar c) noexcept { return CharacterFunctions::isLetterOrDigit (c) || c == '_' || c == '@'; } static bool isReservedKeyword (String::CharPointerType token, const int tokenLength) noexcept { static const char* const keywords2Char[] = { "do", "if", "or", nullptr }; static const char* const keywords3Char[] = { "and", "asm", "for", "int", "new", "not", "try", "xor", nullptr }; static const char* const keywords4Char[] = { "auto", "bool", "case", "char", "else", "enum", "goto", "long", "this", "true", "void", nullptr }; static const char* const keywords5Char[] = { "bitor", "break", "catch", "class", "compl", "const", "false", "final", "float", "or_eq", "short", "throw", "union", "using", "while", nullptr }; static const char* const keywords6Char[] = { "and_eq", "bitand", "delete", "double", "export", "extern", "friend", "import", "inline", "module", "not_eq", "public", "return", "signed", "sizeof", "static", "struct", "switch", "typeid", "xor_eq", nullptr }; static const char* const keywords7Char[] = { "__cdecl", "_Pragma", "alignas", "alignof", "concept", "default", "mutable", "nullptr", "private", "typedef", "uint8_t", "virtual", "wchar_t", nullptr }; static const char* const keywordsOther[] = { "@class", "@dynamic", "@end", "@implementation", "@interface", "@public", "@private", "@protected", "@property", "@synthesize", "__fastcall", "__stdcall", "atomic_cancel", "atomic_commit", "atomic_noexcept", "char16_t", "char32_t", "co_await", "co_return", "co_yield", "const_cast", "constexpr", "continue", "decltype", "dynamic_cast", "explicit", "namespace", "noexcept", "operator", "override", "protected", "register", "reinterpret_cast", "requires", "static_assert", "static_cast", "synchronized", "template", "thread_local", "typename", "unsigned", "volatile", nullptr }; const char* const* k; switch (tokenLength) { case 2: k = keywords2Char; break; case 3: k = keywords3Char; break; case 4: k = keywords4Char; break; case 5: k = keywords5Char; break; case 6: k = keywords6Char; break; case 7: k = keywords7Char; break; default: if (tokenLength < 2 || tokenLength > 16) return false; k = keywordsOther; break; } for (int i = 0; k[i] != nullptr; ++i) if (token.compare (CharPointer_ASCII (k[i])) == 0) return true; return false; } template static int parseIdentifier (Iterator& source) noexcept { int tokenLength = 0; String::CharPointerType::CharType possibleIdentifier[100]; String::CharPointerType possible (possibleIdentifier); while (isIdentifierBody (source.peekNextChar())) { auto c = source.nextChar(); if (tokenLength < 20) possible.write (c); ++tokenLength; } if (tokenLength > 1 && tokenLength <= 16) { possible.writeNull(); if (isReservedKeyword (String::CharPointerType (possibleIdentifier), tokenLength)) return CPlusPlusCodeTokeniser::tokenType_keyword; } return CPlusPlusCodeTokeniser::tokenType_identifier; } template static bool skipNumberSuffix (Iterator& source) { auto c = source.peekNextChar(); if (c == 'l' || c == 'L' || c == 'u' || c == 'U') source.skip(); if (CharacterFunctions::isLetterOrDigit (source.peekNextChar())) return false; return true; } static bool isHexDigit (const juce_wchar c) noexcept { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } template static bool parseHexLiteral (Iterator& source) noexcept { if (source.peekNextChar() == '-') source.skip(); if (source.nextChar() != '0') return false; auto c = source.nextChar(); if (c != 'x' && c != 'X') return false; int numDigits = 0; while (isHexDigit (source.peekNextChar())) { ++numDigits; source.skip(); } if (numDigits == 0) return false; return skipNumberSuffix (source); } static bool isOctalDigit (const juce_wchar c) noexcept { return c >= '0' && c <= '7'; } template static bool parseOctalLiteral (Iterator& source) noexcept { if (source.peekNextChar() == '-') source.skip(); if (source.nextChar() != '0') return false; if (! isOctalDigit (source.nextChar())) return false; while (isOctalDigit (source.peekNextChar())) source.skip(); return skipNumberSuffix (source); } static bool isDecimalDigit (const juce_wchar c) noexcept { return c >= '0' && c <= '9'; } template static bool parseDecimalLiteral (Iterator& source) noexcept { if (source.peekNextChar() == '-') source.skip(); int numChars = 0; while (isDecimalDigit (source.peekNextChar())) { ++numChars; source.skip(); } if (numChars == 0) return false; return skipNumberSuffix (source); } template static bool parseFloatLiteral (Iterator& source) noexcept { if (source.peekNextChar() == '-') source.skip(); int numDigits = 0; while (isDecimalDigit (source.peekNextChar())) { source.skip(); ++numDigits; } const bool hasPoint = (source.peekNextChar() == '.'); if (hasPoint) { source.skip(); while (isDecimalDigit (source.peekNextChar())) { source.skip(); ++numDigits; } } if (numDigits == 0) return false; auto c = source.peekNextChar(); bool hasExponent = (c == 'e' || c == 'E'); if (hasExponent) { source.skip(); c = source.peekNextChar(); if (c == '+' || c == '-') source.skip(); int numExpDigits = 0; while (isDecimalDigit (source.peekNextChar())) { source.skip(); ++numExpDigits; } if (numExpDigits == 0) return false; } c = source.peekNextChar(); if (c == 'f' || c == 'F') source.skip(); else if (! (hasExponent || hasPoint)) return false; return true; } template static int parseNumber (Iterator& source) { const Iterator original (source); if (parseFloatLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_float; source = original; if (parseHexLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer; source = original; if (parseOctalLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer; source = original; if (parseDecimalLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer; source = original; return CPlusPlusCodeTokeniser::tokenType_error; } template static void skipQuotedString (Iterator& source) noexcept { auto quote = source.nextChar(); for (;;) { auto c = source.nextChar(); if (c == quote || c == 0) break; if (c == '\\') source.skip(); } } template static void skipComment (Iterator& source) noexcept { bool lastWasStar = false; for (;;) { auto c = source.nextChar(); if (c == 0 || (c == '/' && lastWasStar)) break; lastWasStar = (c == '*'); } } template static void skipPreprocessorLine (Iterator& source) noexcept { bool lastWasBackslash = false; for (;;) { auto c = source.peekNextChar(); if (c == '"') { skipQuotedString (source); continue; } if (c == '/') { Iterator next (source); next.skip(); auto c2 = next.peekNextChar(); if (c2 == '/' || c2 == '*') return; } if (c == 0) break; if (c == '\n' || c == '\r') { source.skipToEndOfLine(); if (lastWasBackslash) skipPreprocessorLine (source); break; } lastWasBackslash = (c == '\\'); source.skip(); } } template static void skipIfNextCharMatches (Iterator& source, const juce_wchar c) noexcept { if (source.peekNextChar() == c) source.skip(); } template static void skipIfNextCharMatches (Iterator& source, const juce_wchar c1, const juce_wchar c2) noexcept { auto c = source.peekNextChar(); if (c == c1 || c == c2) source.skip(); } template static int readNextToken (Iterator& source) { source.skipWhitespace(); auto firstChar = source.peekNextChar(); switch (firstChar) { case 0: break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '.': { auto result = parseNumber (source); if (result == CPlusPlusCodeTokeniser::tokenType_error) { source.skip(); if (firstChar == '.') return CPlusPlusCodeTokeniser::tokenType_punctuation; } return result; } case ',': case ';': case ':': source.skip(); return CPlusPlusCodeTokeniser::tokenType_punctuation; case '(': case ')': case '{': case '}': case '[': case ']': source.skip(); return CPlusPlusCodeTokeniser::tokenType_bracket; case '"': case '\'': skipQuotedString (source); return CPlusPlusCodeTokeniser::tokenType_string; case '+': source.skip(); skipIfNextCharMatches (source, '+', '='); return CPlusPlusCodeTokeniser::tokenType_operator; case '-': { source.skip(); auto result = parseNumber (source); if (result == CPlusPlusCodeTokeniser::tokenType_error) { skipIfNextCharMatches (source, '-', '='); return CPlusPlusCodeTokeniser::tokenType_operator; } return result; } case '*': case '%': case '=': case '!': source.skip(); skipIfNextCharMatches (source, '='); return CPlusPlusCodeTokeniser::tokenType_operator; case '/': { source.skip(); auto nextChar = source.peekNextChar(); if (nextChar == '/') { source.skipToEndOfLine(); return CPlusPlusCodeTokeniser::tokenType_comment; } if (nextChar == '*') { source.skip(); skipComment (source); return CPlusPlusCodeTokeniser::tokenType_comment; } if (nextChar == '=') source.skip(); return CPlusPlusCodeTokeniser::tokenType_operator; } case '?': case '~': source.skip(); return CPlusPlusCodeTokeniser::tokenType_operator; case '<': case '>': case '|': case '&': case '^': source.skip(); skipIfNextCharMatches (source, firstChar); skipIfNextCharMatches (source, '='); return CPlusPlusCodeTokeniser::tokenType_operator; case '#': skipPreprocessorLine (source); return CPlusPlusCodeTokeniser::tokenType_preprocessor; default: if (isIdentifierStart (firstChar)) return parseIdentifier (source); source.skip(); break; } return CPlusPlusCodeTokeniser::tokenType_error; } /** A class that can be passed to the CppTokeniserFunctions functions in order to parse a String. */ struct StringIterator { StringIterator (const String& s) noexcept : t (s.getCharPointer()) {} StringIterator (String::CharPointerType s) noexcept : t (s) {} juce_wchar nextChar() noexcept { if (isEOF()) return 0; ++numChars; return t.getAndAdvance(); } juce_wchar peekNextChar()noexcept { return *t; } void skip() noexcept { if (! isEOF()) { ++t; ++numChars; } } void skipWhitespace() noexcept { while (t.isWhitespace()) skip(); } void skipToEndOfLine() noexcept { while (*t != '\r' && *t != '\n' && *t != 0) skip(); } bool isEOF() const noexcept { return t.isEmpty(); } String::CharPointerType t; int numChars = 0; }; //============================================================================== /** Takes a UTF8 string and writes it to a stream using standard C++ escape sequences for any non-ascii bytes. Although not strictly a tokenising function, this is still a function that often comes in handy when working with C++ code! Note that addEscapeChars() is easier to use than this function if you're working with Strings. @see addEscapeChars */ static void writeEscapeChars (OutputStream& out, const char* utf8, const int numBytesToRead, const int maxCharsOnLine, const bool breakAtNewLines, const bool replaceSingleQuotes, const bool allowStringBreaks) { int charsOnLine = 0; bool lastWasHexEscapeCode = false; bool trigraphDetected = false; for (int i = 0; i < numBytesToRead || numBytesToRead < 0; ++i) { auto c = (unsigned char) utf8[i]; bool startNewLine = false; switch (c) { case '\t': out << "\\t"; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; break; case '\r': out << "\\r"; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; break; case '\n': out << "\\n"; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; startNewLine = breakAtNewLines; break; case '\\': out << "\\\\"; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; break; case '\"': out << "\\\""; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; break; case '?': if (trigraphDetected) { out << "\\?"; charsOnLine++; trigraphDetected = false; } else { out << "?"; trigraphDetected = true; } lastWasHexEscapeCode = false; charsOnLine++; break; case 0: if (numBytesToRead < 0) return; out << "\\0"; lastWasHexEscapeCode = true; trigraphDetected = false; charsOnLine += 2; break; case '\'': if (replaceSingleQuotes) { out << "\\\'"; lastWasHexEscapeCode = false; trigraphDetected = false; charsOnLine += 2; break; } // deliberate fall-through... JUCE_FALLTHROUGH default: if (c >= 32 && c < 127 && ! (lastWasHexEscapeCode // (have to avoid following a hex escape sequence with a valid hex digit) && CharacterFunctions::getHexDigitValue (c) >= 0)) { out << (char) c; lastWasHexEscapeCode = false; trigraphDetected = false; ++charsOnLine; } else if (allowStringBreaks && lastWasHexEscapeCode && c >= 32 && c < 127) { out << "\"\"" << (char) c; lastWasHexEscapeCode = false; trigraphDetected = false; charsOnLine += 3; } else { out << (c < 16 ? "\\x0" : "\\x") << String::toHexString ((int) c); lastWasHexEscapeCode = true; trigraphDetected = false; charsOnLine += 4; } break; } if ((startNewLine || (maxCharsOnLine > 0 && charsOnLine >= maxCharsOnLine)) && (numBytesToRead < 0 || i < numBytesToRead - 1)) { charsOnLine = 0; out << "\"" << newLine << "\""; lastWasHexEscapeCode = false; } } } /** Takes a string and returns a version of it where standard C++ escape sequences have been used to replace any non-ascii bytes. Although not strictly a tokenising function, this is still a function that often comes in handy when working with C++ code! @see writeEscapeChars */ static String addEscapeChars (const String& s) { MemoryOutputStream mo; writeEscapeChars (mo, s.toRawUTF8(), -1, -1, false, true, true); return mo.toString(); } }; } // namespace juce