Commit ddb57c2d authored by Kenton Varda's avatar Kenton Varda

Implement lexer for new compiler.

parent 1d364ad9
......@@ -122,6 +122,7 @@ public:
// Result does not include NUL terminator.
inline char operator[](size_t index) const { return content[index]; }
inline char& operator[](size_t index) { return content[index]; }
inline char* begin() { return content.begin(); }
inline char* end() { return content.end() - 1; }
......
// Copyright (c) 2013, Kenton Varda <temporal@gmail.com>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "lexer.h"
#include <kj/vector.h>
#include <kj/io.h>
#include <unistd.h>
#include <kj/debug.h>
#include "../message.h"
int main(int argc, char* argv[]) {
  // Eventually this will be capnpc. For now it's just a dummy program that tests parsing:
  // it slurps all of stdin, lexes it as a statement list, and dumps the result for debugging.
  kj::Vector<char> input;
  char buffer[4096];
  while (true) {
    ssize_t bytesRead;
    KJ_SYSCALL(bytesRead = read(STDIN_FILENO, buffer, sizeof(buffer)));
    if (bytesRead == 0) break;  // EOF
    input.addAll(buffer, buffer + bytesRead);
  }
  KJ_DBG(input);

  capnp::MallocMessageBuilder message;
  auto statements = message.initRoot<capnp::compiler::LexedStatements>();
  capnp::compiler::lex(input, statements);
  KJ_DBG(statements);
  return 0;
}
// Copyright (c) 2013, Kenton Varda <temporal@gmail.com>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "lexer.h"
#include "../message.h"
#include <gtest/gtest.h>
namespace capnp {
namespace compiler {
namespace {
template <typename LexResult>
kj::String doLex(kj::StringPtr constText) {
  // Lex the given string into a message of type `LexResult`, then return its stringified
  // form.  To reduce escaping in the test literals, single quotes in the input are turned
  // into double quotes before lexing, and double quotes in the output are turned back into
  // single quotes afterwards.
  //
  // Comparing stringifications against golden strings is ugly and brittle. If we had a
  // text-format parser we could use that. Except that said parser would probably be built on
  // the very lexer being tested here, so... maybe this is the best we can reasonably do.

  kj::String inputText = heapString(constText);
  for (char& ch: inputText) {
    // Make it easier to write input strings below.
    if (ch == '\'') ch = '\"';
  }

  MallocMessageBuilder message;
  auto root = message.initRoot<LexResult>();
  EXPECT_TRUE(lex(inputText, root));

  kj::String stringified = kj::str(root);
  for (char& ch: stringified) {
    // Make it easier to write golden strings below.
    if (ch == '\"') ch = '\'';
  }
  return stringified;
}
TEST(Lexer, Tokens) {
  // Verifies token-level lexing (no statement structure) against golden stringifications.
  // Note: endByte is exclusive; startByte defaults to 0 and is omitted when zero.

  // Plain identifiers separated by whitespace.
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = identifier('foo'), endByte = 3)), "
        "(token = (body = identifier('bar'), startByte = 4, endByte = 7))"
      "])",
      doLex<LexedTokens>("foo bar").cStr());

  // A '#' comment between tokens is skipped; byte offsets still count its bytes.
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = identifier('foo'), endByte = 3)), "
        "(token = (body = identifier('bar'), startByte = 15, endByte = 18))"
      "])",
      doLex<LexedTokens>("foo # comment\n bar").cStr());

  // All literal kinds: string (with escape), integer, float (decimal and exponent),
  // and operator tokens (single char and multi-char runs).
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = stringLiteral('foo '), startByte = 2, endByte = 11)), "
        "(token = (body = integerLiteral(123), startByte = 12, endByte = 15)), "
        "(token = (body = floatLiteral(2.75), startByte = 16, endByte = 20)), "
        "(token = (body = floatLiteral(60000), startByte = 21, endByte = 24)), "
        "(token = (body = operator('+'), startByte = 25, endByte = 26)), "
        "(token = (body = operator('-='), startByte = 27, endByte = 29))"
      "])",
      doLex<LexedTokens>(" 'foo\\x20' 123 2.75 6e4 + -= ").cStr());

  // Parenthesized list: commas delimit sub-sequences, producing a list of token lists.
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = parenthesizedList(["
          "["
            "(token = (body = identifier('foo'), startByte = 1, endByte = 4)), "
            "(token = (body = identifier('bar'), startByte = 5, endByte = 8))"
          "], ["
            "(token = (body = identifier('baz'), startByte = 10, endByte = 13)), "
            "(token = (body = identifier('qux'), startByte = 14, endByte = 17))"
          "], ["
            "(token = (body = identifier('corge'), startByte = 19, endByte = 24)), "
            "(token = (body = identifier('grault'), startByte = 25, endByte = 31))"
          "]"
        "]), endByte = 32))"
      "])",
      doLex<LexedTokens>("(foo bar, baz qux, corge grault)").cStr());

  // No commas: a single sub-sequence inside the parentheses.
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = parenthesizedList(["
          "["
            "(token = (body = identifier('foo'), startByte = 1, endByte = 4)), "
            "(token = (body = identifier('bar'), startByte = 5, endByte = 8))"
          "]"
        "]), endByte = 9))"
      "])",
      doLex<LexedTokens>("(foo bar)").cStr());

  // Empty parentheses should result in an empty list-of-lists, *not* a list containing an empty
  // list.
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = parenthesizedList([]), endByte = 4))"
      "])",
      doLex<LexedTokens>("( )").cStr());

  // Bracketed lists mirror parenthesized lists but use the bracketedList body.
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = bracketedList(["
          "["
            "(token = (body = identifier('foo'), startByte = 1, endByte = 4)), "
            "(token = (body = identifier('bar'), startByte = 5, endByte = 8))"
          "], ["
            "(token = (body = identifier('baz'), startByte = 10, endByte = 13)), "
            "(token = (body = identifier('qux'), startByte = 14, endByte = 17))"
          "], ["
            "(token = (body = identifier('corge'), startByte = 19, endByte = 24)), "
            "(token = (body = identifier('grault'), startByte = 25, endByte = 31))"
          "]"
        "]), endByte = 32))"
      "])",
      doLex<LexedTokens>("[foo bar, baz qux, corge grault]").cStr());

  // Lists nest: a parenthesized list inside a bracketed list, followed by a plain token.
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = bracketedList(["
          "["
            "(token = (body = identifier('foo'), startByte = 1, endByte = 4))"
          "], ["
            "(token = (body = parenthesizedList(["
              "["
                "(token = (body = identifier('bar'), startByte = 7, endByte = 10))"
              "], ["
                "(token = (body = identifier('baz'), startByte = 12, endByte = 15))"
              "]"
            "]), startByte = 6, endByte = 16))"
          "]"
        "]), endByte = 17)), "
        "(token = (body = identifier('qux'), startByte = 18, endByte = 21))"
      "])",
      doLex<LexedTokens>("[foo, (bar, baz)] qux").cStr());

  // All whitespace characters (\n \r \t \v) are skipped between tokens.
  EXPECT_STREQ(
      "(tokens = ["
        "(token = (body = identifier('foo'), endByte = 3)), "
        "(token = (body = identifier('bar'), startByte = 7, endByte = 10))"
      "])",
      doLex<LexedTokens>("foo\n\r\t\vbar").cStr());
}
TEST(Lexer, Statements) {
  // Verifies statement-level lexing: tokens grouped by ';' terminators or '{...}' blocks.

  // A single semicolon-terminated statement.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = (tokens = ["
          "(token = (body = identifier('foo'), endByte = 3)), "
          "(token = (body = identifier('bar'), startByte = 4, endByte = 7))"
        "]))"
      "])",
      doLex<LexedStatements>("foo bar;").cStr());

  // Multiple statements; trailing whitespace after the last one is ignored.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = (tokens = ["
          "(token = (body = identifier('foo'), endByte = 3))"
        "])), "
        "(statement = (tokens = ["
          "(token = (body = identifier('bar'), startByte = 5, endByte = 8))"
        "])), "
        "(statement = (tokens = ["
          "(token = (body = identifier('baz'), startByte = 10, endByte = 13))"
        "]))"
      "])",
      doLex<LexedStatements>("foo; bar; baz; ").cStr());

  // A braced block attaches nested statements to the enclosing statement's `block` union.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = ("
          "tokens = ["
            "(token = (body = identifier('foo'), endByte = 3))"
          "], "
          "block = statements(["
            "(statement = (tokens = ["
              "(token = (body = identifier('bar'), startByte = 5, endByte = 8))"
            "])), "
            "(statement = (tokens = ["
              "(token = (body = identifier('baz'), startByte = 10, endByte = 13))"
            "]))"
          "]))"
        "), "
        "(statement = (tokens = ["
          "(token = (body = identifier('qux'), startByte = 16, endByte = 19))"
        "]))"
      "])",
      doLex<LexedStatements>("foo {bar; baz;} qux;").cStr());
}
TEST(Lexer, DocComments) {
  // Verifies doc-comment attachment: a '#' comment immediately following a statement
  // (same line or next line, no blank line between) becomes the statement's docComment.

  // Comment on the same line as the terminating ';'.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = ("
          "tokens = ["
            "(token = (body = identifier('foo'), endByte = 3))"
          "], "
          "docComment = 'blah blah\\n'"
        "))"
      "])",
      doLex<LexedStatements>("foo; # blah blah").cStr());

  // No space after '#': the comment text is taken as-is.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = ("
          "tokens = ["
            "(token = (body = identifier('foo'), endByte = 3))"
          "], "
          "docComment = 'blah blah\\n'"
        "))"
      "])",
      doLex<LexedStatements>("foo; #blah blah").cStr());

  // Only the first space after '#' is stripped; additional spaces are preserved.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = ("
          "tokens = ["
            "(token = (body = identifier('foo'), endByte = 3))"
          "], "
          "docComment = ' blah blah\\n'"
        "))"
      "])",
      doLex<LexedStatements>("foo; #  blah blah").cStr());

  // A comment on the immediately-following line still attaches to the statement.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = ("
          "tokens = ["
            "(token = (body = identifier('foo'), endByte = 3))"
          "], "
          "docComment = 'blah blah\\n'"
        "))"
      "])",
      doLex<LexedStatements>("foo;\n# blah blah").cStr());

  // A blank line breaks the association: no docComment is attached.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = ("
          "tokens = ["
            "(token = (body = identifier('foo'), endByte = 3))"
          "]"
        "))"
      "])",
      doLex<LexedStatements>("foo;\n\n# blah blah").cStr());

  // Consecutive comment lines concatenate; lines after a blank line are dropped.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = ("
          "tokens = ["
            "(token = (body = identifier('foo'), endByte = 3))"
          "], "
          "docComment = 'bar baz\\nqux corge\\n'"
        "))"
      "])",
      doLex<LexedStatements>("foo;\n # bar baz\n # qux corge\n\n# grault\n# garply").cStr());

  // Doc comments work on block statements too: the comment after '{' documents the
  // outer statement, comments inside document the nested statements, and a comment
  // after '}' is ignored.
  EXPECT_STREQ(
      "(statements = ["
        "(statement = ("
          "tokens = ["
            "(token = (body = identifier('foo'), endByte = 3))"
          "], "
          "block = statements(["
            "(statement = (tokens = ["
              "(token = (body = identifier('bar'), startByte = 17, endByte = 20))"
            "], docComment = 'hi\\n')), "
            "(statement = (tokens = ["
              "(token = (body = identifier('baz'), startByte = 28, endByte = 31))"
            "]))"
          "]), "
          "docComment = 'blah blah\\n'"
        ")), "
        "(statement = (tokens = ["
          "(token = (body = identifier('qux'), startByte = 44, endByte = 47))"
        "]))"
      "])",
      doLex<LexedStatements>("foo {# blah blah\nbar; # hi\n baz;} # ignored\nqux;").cStr());
}
} // namespace
} // namespace compiler
} // namespace capnp
// Copyright (c) 2013, Kenton Varda <temporal@gmail.com>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "lexer.h"
#include <kj/parse/char.h>
#include <kj/debug.h>
namespace capnp {
namespace compiler {
namespace p = kj::parse;
namespace {
typedef p::IteratorInput<char, const char*> Input;
typedef p::Span<const char*> Location;
void buildTokenSequenceList(List<List<TokenPointer>>::Builder builder,
                            kj::Array<kj::Array<Orphan<Token>>>&& items) {
  // Move each orphaned token into the corresponding slot of the list-of-lists builder.
  // `builder` must already be sized to items.size().
  uint listIndex = 0;
  for (auto& sequence: items) {
    auto sequenceBuilder = builder.init(listIndex++, sequence.size());
    uint tokenIndex = 0;
    for (auto& tokenOrphan: sequence) {
      sequenceBuilder[tokenIndex++].adoptToken(kj::mv(tokenOrphan));
    }
  }
}
void attachDocComment(Statement::Builder statement, kj::Array<kj::String>&& comment) {
  // Concatenate the comment lines, each followed by a newline, into the statement's
  // docComment field.  If there are no comment lines the field is left unset.
  size_t totalSize = 0;
  for (auto& line: comment) {
    totalSize += line.size() + 1;  // include newline
  }
  if (totalSize == 0) return;

  Text::Builder builder = statement.initDocComment(totalSize);
  char* out = builder.begin();
  for (auto& line: comment) {
    memcpy(out, line.begin(), line.size());
    out += line.size();
    *out++ = '\n';
  }
  KJ_ASSERT(out == builder.end());  // we must have filled the buffer exactly
}
constexpr auto discardComment =
    sequence(p::exactChar<'#'>(), p::discard(p::many(p::discard(p::anyOfChars("\n").invert()))),
             p::oneOf(p::exactChar<'\n'>(), p::endOfInput));
// Matches a '#' comment through its terminating newline (or end of input),
// producing no output.

constexpr auto saveComment =
    sequence(p::exactChar<'#'>(), p::discard(p::optional(p::exactChar<' '>())),
             p::charsToString(p::many(p::anyOfChars("\n").invert())),
             p::oneOf(p::exactChar<'\n'>(), p::endOfInput));
// Like discardComment, but captures the comment text as a kj::String.
// At most one space immediately after the '#' is stripped.

constexpr auto commentsAndWhitespace =
    sequence(p::discardWhitespace,
             p::discard(p::many(sequence(discardComment, p::discardWhitespace))));
// Skips any run of whitespace and non-doc comments between tokens.

constexpr auto discardLineWhitespace =
    p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny("\r\n").invert())));
// Skips whitespace within a single line, stopping at newline characters.

constexpr auto newline = p::oneOf(
    p::exactChar<'\n'>(),
    sequence(p::exactChar<'\r'>(), p::discard(p::optional(p::exactChar<'\n'>()))));
// Matches one line ending: "\n", "\r", or "\r\n".

constexpr auto docComment = sequence(
    discardLineWhitespace,
    p::discard(p::optional(newline)),
    p::many(p::sequence(discardLineWhitespace, saveComment)));
// Parses a set of comment lines preceded by at most one newline and with no intervening blank
// lines.  Produces the captured comment lines for attachDocComment().
} // namespace
bool lex(kj::ArrayPtr<const char> input,
         LexedStatements::Builder* resultStatements,
         LexedTokens::Builder* resultTokens) {
  // Lex `input` into exactly one of the two builders; the other must be null.
  // Returns true only if the entire input was consumed successfully.
  //
  // This is a bit hacky. Since the transformations applied by our parser require access to an
  // Orphanage in order to build objects, we construct the parsers as local variables. This means
  // that all the parsers need to live in a single function scope. In order to handle both tokens
  // and statements, we have the function take `resultStatements` and `resultTokens` and parse
  // into whichever one is non-null.
  //
  // TODO(someday): Perhaps there should be a utility class called ParserPool which has a method
  //   that takes a parser, allocates a copy of it within some arena, then returns a ParserRef
  //   referencing that copy. Then there could be a Lexer class which contains a ParserPool and
  //   builds all its parsers in its constructor. This would allow the class to directly expose
  //   the parsers so that they can be used within other parser combinators.

  // The orphanage of whichever output message we're building; all Token/Statement
  // objects are allocated here and adopted into the result at the end.
  Orphanage orphanage = resultStatements == nullptr ?
      Orphanage::getForMessageContaining(*resultTokens) :
      Orphanage::getForMessageContaining(*resultStatements);

  // Record a token's byte range (offsets relative to `input`) and return the
  // body builder for the caller to fill in.
  auto initTok = [&](Orphan<Token>& t, const Location& loc) -> Token::Body::Builder {
    auto tb = t.get();
    tb.setStartByte(loc.begin() - input.begin());
    tb.setEndByte(loc.end() - input.begin());
    return tb.getBody();
  };

  // Forward reference so that parenthesized/bracketed lists can recursively
  // contain token sequences.  Assigned below after `token` is defined.
  p::ParserRef<Input, kj::Array<Orphan<Token>>> tokenSequence;

  // Comma-separated list of token sequences, e.g. the contents of "(a b, c d)".
  auto commaDelimitedList = transform(
      p::sequence(tokenSequence, p::many(p::sequence(p::exactChar<','>(), tokenSequence))),
      [&](kj::Array<Orphan<Token>>&& first, kj::Array<kj::Array<Orphan<Token>>>&& rest)
          -> kj::Array<kj::Array<Orphan<Token>>> {
        if (first == nullptr && rest == nullptr) {
          // Completely empty list.
          return nullptr;
        } else {
          auto result = kj::heapArrayBuilder<kj::Array<Orphan<Token>>>(rest.size() + 1);
          result.add(kj::mv(first));
          for (auto& item: rest) {
            result.add(kj::mv(item));
          }
          return result.finish();
        }
      });

  // A single token of any kind.  Order matters within oneOf: e.g. `integer` is
  // tried before `number` so "123" lexes as an integer literal.
  auto token = p::oneOf(
      p::transformWithLocation(p::identifier,
          [&](Location loc, kj::String name) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setIdentifier(name);
            return t;
          }),
      p::transformWithLocation(p::doubleQuotedString,
          [&](Location loc, kj::String text) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setStringLiteral(text);
            return t;
          }),
      p::transformWithLocation(p::integer,
          [&](Location loc, uint64_t i) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setIntegerLiteral(i);
            return t;
          }),
      p::transformWithLocation(p::number,
          [&](Location loc, double x) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setFloatLiteral(x);
            return t;
          }),
      // A run of punctuation characters lexes as a single operator token.
      p::transformWithLocation(
          p::charsToString(p::oneOrMore(p::anyOfChars("!$%&*+-./:<=>?@^|~"))),
          [&](Location loc, kj::String text) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setOperator(text);
            return t;
          }),
      p::transformWithLocation(
          sequence(p::exactChar<'('>(), commaDelimitedList, p::exactChar<')'>()),
          [&](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            buildTokenSequenceList(
                initTok(t, loc).initParenthesizedList(items.size()), kj::mv(items));
            return t;
          }),
      p::transformWithLocation(
          sequence(p::exactChar<'['>(), commaDelimitedList, p::exactChar<']'>()),
          [&](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            buildTokenSequenceList(
                initTok(t, loc).initBracketedList(items.size()), kj::mv(items));
            return t;
          })
      );

  // Tie the recursive knot: `tokenSequence_` must outlive every use of the
  // ParserRef, so it is a named local rather than a temporary.
  auto tokenSequence_ =
      sequence(commentsAndWhitespace, many(sequence(token, commentsAndWhitespace)));
  tokenSequence = tokenSequence_;

  if (resultStatements == nullptr) {
    // Only a token sequence is requested.
    Input parserInput(input.begin(), input.end());
    kj::Maybe<kj::Array<Orphan<Token>>> parseOutput = tokenSequence(parserInput);

    // Any unconsumed input means a lex error.
    if (!parserInput.atEnd()) {
      return false;
    }

    KJ_IF_MAYBE(output, parseOutput) {
      auto l = resultTokens->initTokens(output->size());
      for (uint i = 0; i < output->size(); i++) {
        l[i].adoptToken(kj::mv((*output)[i]));
      }
      return true;
    } else {
      return false;
    }

  } else {
    // Statement parsing requested.  Statements recurse through blocks, so again
    // use a ParserRef assigned after the full parser is built.
    p::ParserRef<Input, kj::Array<Orphan<Statement>>> statementSequence;

    // A statement terminator: either ';' (possibly followed by a doc comment)
    // or a '{...}' block of nested statements.
    auto statementEnd = p::oneOf(
        transform(p::sequence(p::exactChar<';'>(), docComment),
            [&](kj::Array<kj::String>&& comment) -> Orphan<Statement> {
              auto result = orphanage.newOrphan<Statement>();
              auto builder = result.get();
              attachDocComment(builder, kj::mv(comment));
              builder.getBlock().setNone();
              return result;
            }),
        transform(
            p::sequence(p::exactChar<'{'>(), docComment, statementSequence, p::exactChar<'}'>()),
            [&](kj::Array<kj::String>&& comment, kj::Array<Orphan<Statement>>&& statements)
                -> Orphan<Statement> {
              auto result = orphanage.newOrphan<Statement>();
              auto builder = result.get();
              attachDocComment(builder, kj::mv(comment));
              auto list = builder.getBlock().initStatements(statements.size());
              for (uint i = 0; i < statements.size(); i++) {
                list[i].adoptStatement(kj::mv(statements[i]));
              }
              return result;
            })
        );

    // A full statement: its leading tokens followed by a terminator; the tokens
    // are adopted into the Statement built by statementEnd.
    auto statement = p::transform(p::sequence(tokenSequence, statementEnd),
        [&](kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) {
          auto tokensBuilder = statement.get().initTokens(tokens.size());
          for (uint i = 0; i < tokens.size(); i++) {
            tokensBuilder[i].adoptToken(kj::mv(tokens[i]));
          }
          return kj::mv(statement);
        });

    auto statementSequence_ =
        sequence(commentsAndWhitespace, many(sequence(statement, commentsAndWhitespace)));
    statementSequence = statementSequence_;

    Input parserInput(input.begin(), input.end());
    kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput = statementSequence(parserInput);

    // Any unconsumed input means a lex error.
    if (!parserInput.atEnd()) {
      return false;
    }

    KJ_IF_MAYBE(output, parseOutput) {
      auto l = resultStatements->initStatements(output->size());
      for (uint i = 0; i < output->size(); i++) {
        l[i].adoptStatement(kj::mv((*output)[i]));
      }
      return true;
    } else {
      return false;
    }
  }
}
bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result) {
  // Convenience overload: lex `input` as a series of statements into `result`.
  // Returns true if the whole input lexed cleanly.
  // `input` is a trivially-copyable view, so there is nothing to move — pass it
  // through directly instead of wrapping it in kj::mv().
  return lex(input, &result, nullptr);
}
bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result) {
  // Convenience overload: lex `input` as a bare token sequence into `result`.
  // Returns true if the whole input lexed cleanly.
  // `input` is a trivially-copyable view, so there is nothing to move — pass it
  // through directly instead of wrapping it in kj::mv().
  return lex(input, nullptr, &result);
}
} // namespace compiler
} // namespace capnp
@0xa73956d2621fc3ee;
using Cxx = import "/capnp/c++.capnp";
$Cxx.namespace("capnp::compiler");
struct Token {
  # A single lexed token together with its source byte range.

  body @0 union {
    identifier @1 :Text;
    stringLiteral @2 :Text;
    integerLiteral @3 :UInt64;
    floatLiteral @4 :Float64;
    operator @5 :Text;
    # A run of punctuation characters lexed as one operator token.

    parenthesizedList @6 :List(List(TokenPointer));
    bracketedList @7 :List(List(TokenPointer));
    # Delimited lists: outer list elements are the comma-separated groups,
    # inner lists are the tokens within each group.
  }

  startByte @8 :UInt32;
  # Offset of the token's first byte within the input.

  endByte @9 :UInt32;
  # Offset just past the token's last byte (exclusive).
}
struct TokenPointer {
  # Wrapper holding a single Token.
  #
  # Hack to deal with the fact that struct lists cannot adopt elements: tokens
  # are built as orphans, then adopted into this wrapper's pointer field.
  #
  # TODO(cleanup):  Find a better approach.

  token @0 :Token;
}
struct Statement {
  # One statement: its leading tokens, an optional nested block, and an
  # optional doc comment.

  tokens @0 :List(TokenPointer);
  # The tokens preceding the statement's terminator (';' or '{...}').

  block @1 union {
    none @2 :Void;
    # Statement ended with ';' — no nested block.

    statements @3 :List(StatementPointer);
    # Statement ended with a '{...}' block containing these statements.
  }

  docComment @4 :Text;
  # Concatenated doc-comment lines attached to this statement, each
  # newline-terminated.  Unset when there is no doc comment.
}
struct StatementPointer {
  # Wrapper holding a single Statement.
  #
  # Hack to deal with the fact that struct lists cannot adopt elements:
  # statements are built as orphans, then adopted into this wrapper's pointer.
  #
  # TODO(cleanup):  Find a better approach.

  statement @0 :Statement;
}
struct LexedTokens {
  # Lexer output when asked to parse tokens that don't form statements
  # (i.e. input containing no semicolons or curly braces).

  tokens @0 :List(TokenPointer);
}
struct LexedStatements {
  # Lexer output when asked to parse statements (the normal mode for
  # whole source files).

  statements @0 :List(StatementPointer);
}
// Copyright (c) 2013, Kenton Varda <temporal@gmail.com>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef CAPNP_COMPILER_LEXER_H_
#define CAPNP_COMPILER_LEXER_H_
#include "lexer.capnp.h"
namespace capnp {
namespace compiler {
bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result);
bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result);
// Lex the given source code, placing the results in `result`. Returns true if there
// were no errors, false if there were. Even when errors are present, the file may have partial
// content which can be fed into later stages of parsing in order to find more errors.
//
// There are two versions, one that parses a list of statements, and one which just parses tokens
// that might form a part of one statement. In other words, in the latter case, the input should
// not contain semicolons or curly braces, unless they are in string literals of course.
} // namespace compiler
} // namespace capnp
#endif // CAPNP_COMPILER_LEXER_H_
......@@ -59,13 +59,13 @@ TEST(Orphans, Lists) {
Orphan<List<uint32_t>> orphan = root.disownUInt32List();
EXPECT_FALSE(orphan == nullptr);
checkList(orphan.get().asReader(), {12, 34, 56});
checkList(orphan.get().asReader(), {12u, 34u, 56u});
EXPECT_FALSE(root.hasUInt32List());
root.adoptUInt32List(kj::mv(orphan));
EXPECT_TRUE(orphan == nullptr);
EXPECT_TRUE(root.hasUInt32List());
checkList(root.asReader().getUInt32List(), {12, 34, 56});
checkList(root.asReader().getUInt32List(), {12u, 34u, 56u});
}
TEST(Orphans, Text) {
......@@ -202,7 +202,7 @@ TEST(Orphans, OrphanageListCopy) {
Orphan<List<uint32_t>> orphan = builder2.getOrphanage().newOrphanCopy(
root1.asReader().getUInt32List());
checkList(orphan.get().asReader(), {12, 34, 56});
checkList(orphan.get().asReader(), {12u, 34u, 56u});
auto root2 = builder2.initRoot<TestAllTypes>();
root2.adoptUInt32List(kj::mv(orphan));
......@@ -272,13 +272,13 @@ TEST(Orphans, ListObject) {
Orphan<List<uint32_t>> orphan = root.disownObjectField<List<uint32_t>>();
EXPECT_FALSE(orphan == nullptr);
checkList(orphan.get().asReader(), {12, 34, 56});
checkList(orphan.get().asReader(), {12u, 34u, 56u});
EXPECT_FALSE(root.hasObjectField());
root.adoptObjectField(kj::mv(orphan));
EXPECT_TRUE(orphan == nullptr);
EXPECT_TRUE(root.hasObjectField());
checkList(root.asReader().getObjectField<List<uint32_t>>(), {12, 34, 56});
checkList(root.asReader().getObjectField<List<uint32_t>>(), {12u, 34u, 56u});
}
TEST(Orphans, DynamicStruct) {
......@@ -318,7 +318,7 @@ TEST(Orphans, DynamicList) {
root.adoptObjectField(kj::mv(orphan));
EXPECT_TRUE(orphan == nullptr);
EXPECT_TRUE(root.hasObjectField());
checkList(root.asReader().getObjectField<List<uint32_t>>(), {12, 34, 56});
checkList(root.asReader().getObjectField<List<uint32_t>>(), {12u, 34u, 56u});
}
TEST(Orphans, OrphanageDynamicStruct) {
......@@ -344,7 +344,7 @@ TEST(Orphans, OrphanageDynamicList) {
auto root = builder.initRoot<test::TestObject>();
root.adoptObjectField(kj::mv(orphan));
checkList(root.getObjectField<List<uint32_t>>(), {123, 456});
checkList(root.getObjectField<List<uint32_t>>(), {123u, 456u});
}
TEST(Orphans, OrphanageDynamicStructCopy) {
......@@ -376,7 +376,7 @@ TEST(Orphans, OrphanageDynamicListCopy) {
auto root2 = builder2.initRoot<test::TestObject>();
root2.adoptObjectField(kj::mv(orphan));
checkList(root2.getObjectField<List<uint32_t>>(), {12, 34, 56});
checkList(root2.getObjectField<List<uint32_t>>(), {12u, 34u, 56u});
}
TEST(Orphans, OrphanageFromBuilder) {
......
......@@ -267,6 +267,10 @@ template <typename T> struct RemoveConstOrBogus_ { struct Type; };
template <typename T> struct RemoveConstOrBogus_<const T> { typedef T Type; };
template <typename T> using RemoveConstOrBogus = typename RemoveConstOrBogus_<T>::Type;
template <typename T> struct IsReference_ { static constexpr bool value = false; };
template <typename T> struct IsReference_<T&> { static constexpr bool value = true; };
template <typename T> constexpr bool isReference() { return IsReference_<T>::value; }
// =======================================================================================
// Equivalents to std::move() and std::forward(), since these are very commonly needed and the
// std header <utility> pulls in lots of other stuff.
......
......@@ -351,11 +351,11 @@ TEST(CharParsers, DoubleQuotedString) {
}
{
StringPtr text = "\"test\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\?\x01\2\34\156\"";
StringPtr text = "\"test\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\?\\x01\\x20\\2\\34\\156\"";
Input input(text.begin(), text.end());
Maybe<String> result = parser(input);
KJ_IF_MAYBE(value, result) {
EXPECT_EQ("test\a\b\f\n\r\t\v\'\"\?\x01\2\34\156", *value);
EXPECT_EQ("test\a\b\f\n\r\t\v\'\"\?\x01\x20\2\34\156", *value);
} else {
ADD_FAILURE() << "Expected string, got null.";
}
......
......@@ -183,6 +183,9 @@ constexpr auto nameStart = alpha.orChar('_');
constexpr auto nameChar = alphaNumeric.orChar('_');
constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
constexpr auto octDigit = charRange('0', '7');
constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
......@@ -257,7 +260,7 @@ struct ParseFloat {
constexpr auto number = transform(
sequence(
many(digit),
oneOrMore(digit),
optional(sequence(exactChar<'.'>(), many(digit))),
optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
notLookingAt(alpha.orAny("_."))),
......@@ -285,7 +288,7 @@ struct InterpretEscape {
struct ParseHexEscape {
inline char operator()(char first, char second) const {
return (parseDigit(first) << 4) | second;
return (parseDigit(first) << 4) | parseDigit(second);
}
};
......
......@@ -111,14 +111,30 @@ class ParserRef {
// from becoming ridiculous. Using too many of them can hurt performance, though.
public:
ParserRef(): parser(nullptr), wrapper(nullptr) {}
ParserRef(const ParserRef&) = default;
ParserRef(ParserRef&&) = default;
ParserRef& operator=(const ParserRef& other) = default;
ParserRef& operator=(ParserRef&& other) = default;
template <typename Other>
constexpr ParserRef(Other& other)
: parser(&other), wrapper(WrapperImplInstance<Other>::instance) {}
constexpr ParserRef(Other&& other)
: parser(&other), wrapper(&WrapperImplInstance<Decay<Other>>::instance) {
static_assert(kj::isReference<Other>(), "ParseRef should not be assigned to a temporary.");
}
template <typename Other>
inline ParserRef& operator=(Other&& other) {
static_assert(kj::isReference<Other>(), "ParseRef should not be assigned to a temporary.");
parser = &other;
wrapper = &WrapperImplInstance<Decay<Other>>::instance;
return *this;
}
KJ_ALWAYS_INLINE(Maybe<Output> operator()(Input& input) const) {
// Always inline in the hopes that this allows branch prediction to kick in so the virtual call
// doesn't hurt so much.
return wrapper.parse(parser, input);
return wrapper->parse(parser, input);
}
private:
......@@ -137,7 +153,7 @@ private:
};
const void* parser;
const Wrapper& wrapper;
const Wrapper* wrapper;
};
template <typename Input, typename Output>
......@@ -517,8 +533,8 @@ constexpr OneOf_<SubParsers...> oneOf(SubParsers&&... parsers) {
template <typename Position>
struct Span {
public:
inline const Position& begin() { return begin_; }
inline const Position& end() { return end_; }
inline const Position& begin() const { return begin_; }
inline const Position& end() const { return end_; }
Span() = default;
inline constexpr Span(Position&& begin, Position&& end): begin_(mv(begin)), end_(mv(end)) {}
......
......@@ -180,6 +180,10 @@ class Tuple<> {
// Tuple<>() is constexpr.
};
template <typename T>
class Tuple<T>;
// Single-element tuple should never be used. The public API should ensure this.
template <size_t index, typename... T>
inline TypeByIndex<index, T...>& getImpl(Tuple<T...>& tuple) {
// Get member of a Tuple by index, e.g. `get<2>(myTuple)`.
......@@ -210,6 +214,10 @@ inline T&& getImpl(T&& value) {
template <typename Func, typename SoFar, typename... T>
struct ExpandAndApplyResult_;
// Template which computes the return type of applying Func to T... after flattening tuples.
// SoFar starts as Tuple<> and accumulates the flattened parameter types -- so after this template
// is recursively expanded, T... is empty and SoFar is a Tuple containing all the parameters.
template <typename Func, typename First, typename... Rest, typename... T>
struct ExpandAndApplyResult_<Func, Tuple<T...>, First, Rest...>
: public ExpandAndApplyResult_<Func, Tuple<T..., First>, Rest...> {};
......
......@@ -75,11 +75,18 @@ public:
builder.add(kj::fwd<Params>(params)...);
}
template <typename Iterator>
inline void addAll(Iterator begin, Iterator end) {
size_t needed = builder.size() + (end - begin);
if (needed > builder.capacity()) grow(needed);
builder.addAll(begin, end);
}
private:
ArrayBuilder<T> builder;
void grow() {
setCapacity(capacity() == 0 ? 4 : capacity() * 2);
void grow(size_t minCapacity = 0) {
setCapacity(kj::max(minCapacity, capacity() == 0 ? 4 : capacity() * 2));
}
void setCapacity(size_t newSize) {
ArrayBuilder<T> newBuilder = heapArrayBuilder<T>(newSize);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment