// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors // Licensed under the MIT License: // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // This file contains parsers useful for character stream inputs, including parsers to parse // common kinds of tokens like identifiers, numbers, and quoted strings. #ifndef KJ_PARSE_CHAR_H_ #define KJ_PARSE_CHAR_H_ #if defined(__GNUC__) && !KJ_HEADER_WARNINGS #pragma GCC system_header #endif #include "common.h" #include "../string.h" #include <inttypes.h> namespace kj { namespace parse { // ======================================================================================= // Exact char/string. 
class ExactString_ { public: constexpr inline ExactString_(const char* str): str(str) {} template <typename Input> Maybe<Tuple<>> operator()(Input& input) const { const char* ptr = str; while (*ptr != '\0') { if (input.atEnd() || input.current() != *ptr) return nullptr; input.next(); ++ptr; } return Tuple<>(); } private: const char* str; }; constexpr inline ExactString_ exactString(const char* str) { return ExactString_(str); } template <char c> constexpr ExactlyConst_<char, c> exactChar() { // Returns a parser that matches exactly the character given by the template argument (returning // no result). return ExactlyConst_<char, c>(); } // ======================================================================================= // Char ranges / sets class CharGroup_ { public: constexpr inline CharGroup_(): bits{0, 0, 0, 0} {} constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const { return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )), bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)), bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)), bits[3] | (oneBits(last - 191) & ~oneBits(first - 192))); } constexpr inline CharGroup_ orAny(const char* chars) const { return *chars == 0 ? 
*this : orChar(*chars).orAny(chars + 1); } constexpr inline CharGroup_ orChar(unsigned char c) const { return CharGroup_(bits[0] | bit(c), bits[1] | bit(c - 64), bits[2] | bit(c - 128), bits[3] | bit(c - 256)); } constexpr inline CharGroup_ orGroup(CharGroup_ other) const { return CharGroup_(bits[0] | other.bits[0], bits[1] | other.bits[1], bits[2] | other.bits[2], bits[3] | other.bits[3]); } constexpr inline CharGroup_ invert() const { return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]); } constexpr inline bool contains(unsigned char c) const { return (bits[c / 64] & (1ll << (c % 64))) != 0; } inline bool containsAll(ArrayPtr<const char> text) const { for (char c: text) { if (!contains(c)) return false; } return true; } template <typename Input> Maybe<char> operator()(Input& input) const { if (input.atEnd()) return nullptr; unsigned char c = input.current(); if (contains(c)) { input.next(); return c; } else { return nullptr; } } private: typedef unsigned long long Bits64; constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {} Bits64 bits[4]; static constexpr inline Bits64 oneBits(int count) { return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1); } static constexpr inline Bits64 bit(int index) { return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index); } }; constexpr inline CharGroup_ charRange(char first, char last) { // Create a parser which accepts any character in the range from `first` to `last`, inclusive. // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the // character matched. // // The returned object has methods which can be used to match more characters. The following // produces a parser which accepts any letter as well as '_', '+', '-', and '.'. // // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.") // // You can also use `.invert()` to match the opposite set of characters. 
return CharGroup_().orRange(first, last); } #if _MSC_VER #define anyOfChars(chars) CharGroup_().orAny(chars) // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from // building the compiler or schema parser. We don't know why this happens, but Harris found that // this horrible, horrible hack makes things work. This is awful, but it's better than nothing. // Hopefully, MSVC will get fixed soon and we'll be able to remove this. #else constexpr inline CharGroup_ anyOfChars(const char* chars) { // Returns a parser that accepts any of the characters in the given string (which should usually // be a literal). The returned parser is of the same type as returned by `charRange()` -- see // that function for more info. return CharGroup_().orAny(chars); } #endif // ======================================================================================= namespace _ { // private struct ArrayToString { inline String operator()(const Array<char>& arr) const { return heapString(arr); } }; } // namespace _ (private) template <typename SubParser> constexpr inline auto charsToString(SubParser&& subParser) -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) { // Wraps a parser that returns Array<char> such that it returns String instead. return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString()); } // ======================================================================================= // Basic character classes. 
constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z'); constexpr auto digit = charRange('0', '9'); constexpr auto alphaNumeric = alpha.orGroup(digit); constexpr auto nameStart = alpha.orChar('_'); constexpr auto nameChar = alphaNumeric.orChar('_'); constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F'); constexpr auto octDigit = charRange('0', '7'); constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v"); constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert(); constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v")); constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v")))); // Like discard(whitespace) but avoids some memory allocation. // ======================================================================================= // Identifiers namespace _ { // private struct IdentifierToString { inline String operator()(char first, const Array<char>& rest) const { if (rest.size() == 0) return heapString(&first, 1); String result = heapString(rest.size() + 1); result[0] = first; memcpy(result.begin() + 1, rest.begin(), rest.size()); return result; } }; } // namespace _ (private) constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString()); // Parses an identifier (e.g. a C variable name). 
// ======================================================================================= // Integers namespace _ { // private inline char parseDigit(char c) { if (c < 'A') return c - '0'; if (c < 'a') return c - 'A' + 10; return c - 'a' + 10; } template <uint base> struct ParseInteger { inline uint64_t operator()(const Array<char>& digits) const { return operator()('0', digits); } uint64_t operator()(char first, const Array<char>& digits) const { uint64_t result = parseDigit(first); for (char digit: digits) { result = result * base + parseDigit(digit); } return result; } }; } // namespace _ (private) constexpr auto integer = sequence( oneOf( transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()), transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()), transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())), notLookingAt(alpha.orAny("_."))); // ======================================================================================= // Numbers (i.e. 
floats) namespace _ { // private struct ParseFloat { double operator()(const Array<char>& digits, const Maybe<Array<char>>& fraction, const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const; }; } // namespace _ (private) constexpr auto number = transform( sequence( oneOrMore(digit), optional(sequence(exactChar<'.'>(), many(digit))), optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))), notLookingAt(alpha.orAny("_."))), _::ParseFloat()); // ======================================================================================= // Quoted strings namespace _ { // private struct InterpretEscape { char operator()(char c) const { switch (c) { case 'a': return '\a'; case 'b': return '\b'; case 'f': return '\f'; case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; case 'v': return '\v'; default: return c; } } }; struct ParseHexEscape { inline char operator()(char first, char second) const { return (parseDigit(first) << 4) | parseDigit(second); } }; struct ParseHexByte { inline byte operator()(char first, char second) const { return (parseDigit(first) << 4) | parseDigit(second); } }; struct ParseOctEscape { inline char operator()(char first, Maybe<char> second, Maybe<char> third) const { char result = first - '0'; KJ_IF_MAYBE(digit1, second) { result = (result << 3) | (*digit1 - '0'); KJ_IF_MAYBE(digit2, third) { result = (result << 3) | (*digit2 - '0'); } } return result; } }; } // namespace _ (private) constexpr auto escapeSequence = sequence(exactChar<'\\'>(), oneOf( transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()), transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()), transform(sequence(octDigit, optional(octDigit), optional(octDigit)), _::ParseOctEscape()))); // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns // a char. 
constexpr auto doubleQuotedString = charsToString(sequence(
    exactChar<'\"'>(),
    many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
    exactChar<'\"'>()));
// Parses a C-style double-quoted string. Between the quotes, each element is either an escape
// sequence (see escapeSequence) or any single character other than backslash, newline, and
// double quote. The collected characters are converted to a String by charsToString().

constexpr auto singleQuotedString = charsToString(sequence(
    exactChar<'\''>(),
    many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
    exactChar<'\''>()));
// Parses a C-style single-quoted string. Identical to doubleQuotedString except that the
// delimiter (and the excluded unescaped character) is the single quote.

constexpr auto doubleQuotedHexBinary = sequence(
    exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
    oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
    discardWhitespace,
    exactChar<'\"'>());
// Parses a double-quoted hex binary literal.  Returns Array<byte>. The literal is `0x"` followed
// by one or more pairs of hex digits and a closing double quote. Whitespace is skipped before
// each pair and before the closing quote, but not between the two digits of a pair.

}  // namespace parse
}  // namespace kj

#endif  // KJ_PARSE_CHAR_H_