Treat UTF-8 BOMs as whitespace. Complain if other BOMs or obviously-non-UTF-8 data is seen.

dc44fee8 · Kenton Varda · 34734994 · dc44fee8 · dc44fee8
Commit dc44fee8 authored Nov 22, 2014 by Kenton Varda
Hide whitespace changes
Inline Side-by-side

Showing with 30 additions and 5 deletions

lexer-test.c++ c++/src/capnp/compiler/lexer-test.c++ +10 -0

lexer.c++ c++/src/capnp/compiler/lexer.c++ +20 -5

No files found.
--- a/c++/src/capnp/compiler/lexer-test.c++
+++ b/c++/src/capnp/compiler/lexer-test.c++
@@ -345,6 +345,16 @@ TEST(Lexer, DocComments) {
      doLex<LexedStatements>("foo {bar; baz;}\n# late comment\nqux;").cStr());
 }

+TEST(Lexer, Utf8Bom) {
+  EXPECT_STREQ(
+      "(tokens = ["
+        "(identifier = 'foo', startByte = 3, endByte = 6), "
+        "(identifier = 'bar', startByte = 7, endByte = 10), "
+        "(identifier = 'baz', startByte = 13, endByte = 16)"
+      "])",
+      doLex<LexedTokens>("\xef\xbb\xbf""foo bar\xef\xbb\xbf""baz").cStr());
+}
+
 }  // namespace
 }  // namespace compiler
 }  // namespace capnp
--- a/c++/src/capnp/compiler/lexer.c++
+++ b/c++/src/capnp/compiler/lexer.c++
@@ -117,9 +117,16 @@ constexpr auto saveComment =
             p::charsToString(p::many(p::anyOfChars("\n").invert())),
             p::oneOf(p::exactChar<'\n'>(), p::endOfInput));

-constexpr auto commentsAndWhitespace =
+constexpr auto utf8Bom =
+    sequence(p::exactChar<'\xef'>(), p::exactChar<'\xbb'>(), p::exactChar<'\xbf'>());
+
+constexpr auto bomsAndWhitespace =
    sequence(p::discardWhitespace,
-             p::discard(p::many(sequence(discardComment, p::discardWhitespace))));
+             p::discard(p::many(sequence(utf8Bom, p::discardWhitespace))));
+
+constexpr auto commentsAndWhitespace =
+    sequence(bomsAndWhitespace,
+             p::discard(p::many(sequence(discardComment, bomsAndWhitespace))));

 constexpr auto discardLineWhitespace =
    p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny("\r\n").invert())));
@@ -136,7 +143,7 @@ constexpr auto docComment = p::optional(p::sequence(

 }  // namespace

-Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporterParam)
+Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporter)
    : orphanage(orphanageParam) {

  // Note that because passing an lvalue to a parser constructor uses it by-reference, it's safe
@@ -215,8 +222,16 @@ Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporterParam)
            buildTokenSequenceList(
                initTok(t, loc).initBracketedList(items.size()), kj::mv(items));
            return t;
-          })
-      ));
+          }),
+      p::transformOrReject(p::transformWithLocation(
+          p::oneOf(sequence(p::exactChar<'\xff'>(), p::exactChar<'\xfe'>()),
+                   sequence(p::exactChar<'\xfe'>(), p::exactChar<'\xff'>()),
+                   sequence(p::exactChar<'\x00'>())),
+          [this, &errorReporter](Location loc) -> kj::Maybe<Orphan<Token>> {
+            errorReporter.addError(loc.begin(), loc.end(),
+                "Non-UTF-8 input detected. Cap'n Proto schema files must be UTF-8 text.");
+            return nullptr;
+          }), [](kj::Maybe<Orphan<Token>> param) { return param; })));
  parsers.tokenSequence = arena.copy(p::sequence(
      commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace))));