// Copyright (c) 2013, Kenton Varda <temporal@gmail.com>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
//    list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
//    this list of conditions and the following disclaimer in the documentation
//    and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "lexer.h"
#include "../message.h"
#include <gtest/gtest.h>

namespace capnp {
namespace compiler {
namespace {

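// ErrorReporter whose addError() turns any report into an immediate gtest failure.  The
// happy-path tests below expect the lexer to report nothing, so any error that does get
// reported shows up in the test output together with its byte range.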
class TestFailingErrorReporter: public ErrorReporter {
public:
  void addError(uint32_t startByte, uint32_t endByte, kj::StringPtr message) const override {
    ADD_FAILURE() << "Parse failed: (" << startByte << "-" << endByte << ") " << message.cStr();
  }

  bool hadErrors() const override {
    // Not used by lexer.
    return false;
  }
};

template <typename LexResult>
kj::String doLex(kj::StringPtr constText) {
  // Parse the given string into the given Cap'n Proto struct type using lex(), then stringify the
  // result and return that string.  Additionally, single quotes in the input are converted to
  // double quotes, and double quotes in the output are converted to single quotes, to reduce the
  // amount of escaping needed in the test strings.
  //
  // Comparing stringifications against golden strings is ugly and brittle.  If we had a
  // text-format parser we could use that.  Except that said parser would probably be built on
  // the very lexer being tested here, so...  maybe this is the best we can reasonably do.

  kj::String text = heapString(constText);
  for (char& c: text) {
    // Make it easier to write input strings below.
    if (c == '\'') c = '\"';
  }
  MallocMessageBuilder message;
  auto file = message.initRoot<LexResult>();
  TestFailingErrorReporter errorReporter;
  EXPECT_TRUE(lex(text, file, errorReporter));
  kj::String result = kj::str(file);
  for (char& c: result) {
    // Make it easier to write golden strings below.
    if (c == '\"') c = '\'';
  }
  return result;
}
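
// A hypothetical companion to TestFailingErrorReporter (not part of the original suite):
// a minimal sketch of how the lexer's *error* path could be tested by recording reports
// instead of failing the test.  It assumes only the ErrorReporter interface used above
// plus kj::str(); the name TestCountingErrorReporter is invented for illustration.
class TestCountingErrorReporter: public ErrorReporter {
public:
  void addError(uint32_t startByte, uint32_t endByte, kj::StringPtr message) const override {
    // Record the report.  Members are mutable because the interface methods are const.
    ++errorCount;
    lastError = kj::str("(", startByte, "-", endByte, ") ", message);
  }

  bool hadErrors() const override {
    return errorCount > 0;
  }

  mutable uint errorCount = 0;
  mutable kj::String lastError;
};
// Usage sketch: pass an instance to lex() in place of TestFailingErrorReporter, then
// assert on errorCount / lastError instead of comparing a stringified parse tree.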

TEST(Lexer, Tokens) {
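  // Note: startByte is omitted from the expected strings whenever it is 0, because
  // capnp stringification skips fields that still hold their default value.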
  EXPECT_STREQ(
      "(tokens = ["
        "(identifier = 'foo', endByte = 3), "
        "(identifier = 'bar', startByte = 4, endByte = 7)"
      "])",
      doLex<LexedTokens>("foo bar").cStr());

  EXPECT_STREQ(
      "(tokens = ["
        "(identifier = 'foo', endByte = 3), "
        "(identifier = 'bar', startByte = 15, endByte = 18)"
      "])",
      doLex<LexedTokens>("foo # comment\n bar").cStr());

  EXPECT_STREQ(
      "(tokens = ["
        "(stringLiteral = 'foo ', startByte = 2, endByte = 11), "
        "(integerLiteral = 123, startByte = 12, endByte = 15), "
        "(floatLiteral = 2.75, startByte = 16, endByte = 20), "
        "(floatLiteral = 60000, startByte = 21, endByte = 24), "
        "(operator = '+', startByte = 25, endByte = 26), "
        "(operator = '-=', startByte = 27, endByte = 29)"
      "])",
      doLex<LexedTokens>("  'foo\\x20' 123 2.75 6e4 + -=  ").cStr());

  EXPECT_STREQ(
      "(tokens = ["
        "(parenthesizedList = ["
          "["
            "(identifier = 'foo', startByte = 1, endByte = 4), "
            "(identifier = 'bar', startByte = 5, endByte = 8)"
          "], ["
            "(identifier = 'baz', startByte = 10, endByte = 13), "
            "(identifier = 'qux', startByte = 14, endByte = 17)"
          "], ["
            "(identifier = 'corge', startByte = 19, endByte = 24), "
            "(identifier = 'grault', startByte = 25, endByte = 31)"
          "]"
        "], endByte = 32)"
      "])",
      doLex<LexedTokens>("(foo bar, baz qux, corge grault)").cStr());

  EXPECT_STREQ(
      "(tokens = ["
        "(parenthesizedList = ["
          "["
            "(identifier = 'foo', startByte = 1, endByte = 4), "
            "(identifier = 'bar', startByte = 5, endByte = 8)"
          "]"
        "], endByte = 9)"
      "])",
      doLex<LexedTokens>("(foo bar)").cStr());

  // Empty parentheses should result in an empty list-of-lists, *not* a list containing an empty
  // list.
  EXPECT_STREQ(
      "(tokens = ["
        "(parenthesizedList = [], endByte = 4)"
      "])",
      doLex<LexedTokens>("(  )").cStr());

  EXPECT_STREQ(
      "(tokens = ["
        "(bracketedList = ["
          "["
            "(identifier = 'foo', startByte = 1, endByte = 4), "
            "(identifier = 'bar', startByte = 5, endByte = 8)"
          "], ["
            "(identifier = 'baz', startByte = 10, endByte = 13), "
            "(identifier = 'qux', startByte = 14, endByte = 17)"
          "], ["
            "(identifier = 'corge', startByte = 19, endByte = 24), "
            "(identifier = 'grault', startByte = 25, endByte = 31)"
          "]"
        "], endByte = 32)"
      "])",
      doLex<LexedTokens>("[foo bar, baz qux, corge grault]").cStr());

  EXPECT_STREQ(
      "(tokens = ["
        "(bracketedList = ["
          "["
            "(identifier = 'foo', startByte = 1, endByte = 4)"
          "], ["
            "(parenthesizedList = ["
              "["
                "(identifier = 'bar', startByte = 7, endByte = 10)"
              "], ["
                "(identifier = 'baz', startByte = 12, endByte = 15)"
              "]"
            "], startByte = 6, endByte = 16)"
          "]"
        "], endByte = 17), "
        "(identifier = 'qux', startByte = 18, endByte = 21)"
      "])",
      doLex<LexedTokens>("[foo, (bar, baz)] qux").cStr());

  EXPECT_STREQ(
      "(tokens = ["
        "(identifier = 'foo', endByte = 3), "
        "(identifier = 'bar', startByte = 7, endByte = 10)"
      "])",
      doLex<LexedTokens>("foo\n\r\t\vbar").cStr());
}

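// A statement ends at ';' or at a braced block; each statement's startByte/endByte span
// includes its terminator, and statements nested in braces appear under 'block'.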
TEST(Lexer, Statements) {
  EXPECT_STREQ(
      "(statements = ["
        "(tokens = ["
          "(identifier = 'foo', endByte = 3), "
          "(identifier = 'bar', startByte = 4, endByte = 7)"
        "], endByte = 8)"
      "])",
      doLex<LexedStatements>("foo bar;").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "(tokens = ["
          "(identifier = 'foo', endByte = 3)"
        "], endByte = 4), "
        "(tokens = ["
          "(identifier = 'bar', startByte = 5, endByte = 8)"
        "], startByte = 5, endByte = 9), "
        "(tokens = ["
          "(identifier = 'baz', startByte = 10, endByte = 13)"
        "], startByte = 10, endByte = 14)"
      "])",
      doLex<LexedStatements>("foo; bar; baz; ").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "block = ["
            "(tokens = ["
              "(identifier = 'bar', startByte = 5, endByte = 8)"
            "], startByte = 5, endByte = 9), "
            "(tokens = ["
              "(identifier = 'baz', startByte = 10, endByte = 13)"
            "], startByte = 10, endByte = 14)"
          "], "
          "endByte = 15"
        "), "
        "(tokens = ["
          "(identifier = 'qux', startByte = 16, endByte = 19)"
        "], startByte = 16, endByte = 20)"
      "])",
      doLex<LexedStatements>("foo {bar; baz;} qux;").cStr());
}

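// These cases pin down how doc comments bind: a '#' comment on the same line as the
// statement's terminator, or on the lines immediately following it, becomes the
// statement's docComment; the '#' and at most one following space are stripped;
// consecutive comment lines are concatenated; and a blank line detaches any comments
// after it.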
TEST(Lexer, DocComments) {
  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "docComment = 'blah blah\\n', "
          "endByte = 16"
        ")"
      "])",
      doLex<LexedStatements>("foo; # blah blah").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "docComment = 'blah blah\\n', "
          "endByte = 15"
        ")"
      "])",
      doLex<LexedStatements>("foo; #blah blah").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "docComment = ' blah blah\\n', "
          "endByte = 17"
        ")"
      "])",
      doLex<LexedStatements>("foo; #  blah blah").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "docComment = 'blah blah\\n', "
          "endByte = 16"
        ")"
      "])",
      doLex<LexedStatements>("foo;\n# blah blah").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "endByte = 4"
        ")"
      "])",
      doLex<LexedStatements>("foo;\n\n# blah blah").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "docComment = 'bar baz\\nqux corge\\n', "
          "endByte = 30"
        ")"
      "])",
      doLex<LexedStatements>("foo;\n # bar baz\n  # qux corge\n\n# grault\n# garply").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "block = ["
            "(tokens = ["
              "(identifier = 'bar', startByte = 17, endByte = 20)"
            "], docComment = 'hi\\n', startByte = 17, endByte = 27), "
            "(tokens = ["
              "(identifier = 'baz', startByte = 28, endByte = 31)"
            "], startByte = 28, endByte = 32)"
          "], "
          "docComment = 'blah blah\\n', "
          "endByte = 44"
        "), "
        "(tokens = ["
          "(identifier = 'qux', startByte = 44, endByte = 47)"
        "], startByte = 44, endByte = 48)"
      "])",
      doLex<LexedStatements>("foo {# blah blah\nbar; # hi\n baz;} # ignored\nqux;").cStr());

  EXPECT_STREQ(
      "(statements = ["
        "("
          "tokens = ["
            "(identifier = 'foo', endByte = 3)"
          "], "
          "block = ["
            "(tokens = ["
              "(identifier = 'bar', startByte = 5, endByte = 8)"
            "], startByte = 5, endByte = 9), "
            "(tokens = ["
              "(identifier = 'baz', startByte = 10, endByte = 13)"
            "], startByte = 10, endByte = 14)"
          "], "
          "docComment = 'late comment\\n', "
          "endByte = 31"
        "), "
        "(tokens = ["
          "(identifier = 'qux', startByte = 31, endByte = 34)"
        "], startByte = 31, endByte = 35)"
      "])",
      doLex<LexedStatements>("foo {bar; baz;}\n# late comment\nqux;").cStr());
}

}  // namespace
}  // namespace compiler
}  // namespace capnp