Commit dc44fee8 authored by Kenton Varda

Treat UTF-8 BOMs as whitespace. Complain if other BOMs or obviously-non-UTF-8 data is seen.

parent 34734994
......@@ -345,6 +345,16 @@ TEST(Lexer, DocComments) {
doLex<LexedStatements>("foo {bar; baz;}\n# late comment\nqux;").cStr());
}
TEST(Lexer, Utf8Bom) {
  // A UTF-8 byte-order mark (bytes EF BB BF) must not produce a token, whether it appears at
  // the very start of the input or between two tokens.  The expected byte offsets below show
  // that each BOM still occupies three bytes of input: `foo` starts at byte 3, and `baz`
  // starts at byte 13 even though `bar` ended at byte 10.
  //
  // The input literal is split into adjacent pieces so that the letters following each BOM
  // ('f' in "foo", 'b' in "baz") are not absorbed into the preceding `\xbf` hex escape.
  auto lexed = doLex<LexedTokens>("\xef\xbb\xbf""foo bar\xef\xbb\xbf""baz");

  EXPECT_STREQ(
      "(tokens = ["
        "(identifier = 'foo', startByte = 3, endByte = 6), "
        "(identifier = 'bar', startByte = 7, endByte = 10), "
        "(identifier = 'baz', startByte = 13, endByte = 16)"
      "])",
      lexed.cStr());
}
} // namespace
} // namespace compiler
} // namespace capnp
......@@ -117,9 +117,16 @@ constexpr auto saveComment =
p::charsToString(p::many(p::anyOfChars("\n").invert())),
p::oneOf(p::exactChar<'\n'>(), p::endOfInput));
constexpr auto commentsAndWhitespace =
// Parser matching exactly the three-byte sequence EF BB BF, i.e. the UTF-8 encoding of the
// byte-order mark.
constexpr auto utf8Bom =
sequence(p::exactChar<'\xef'>(), p::exactChar<'\xbb'>(), p::exactChar<'\xbf'>());
constexpr auto bomsAndWhitespace =
sequence(p::discardWhitespace,
p::discard(p::many(sequence(discardComment, p::discardWhitespace))));
p::discard(p::many(sequence(utf8Bom, p::discardWhitespace))));
// Parser for the filler between tokens: one `bomsAndWhitespace` match, followed by zero or
// more repetitions of (`discardComment`, `bomsAndWhitespace`).  The result of the repeated
// part is discarded.
constexpr auto commentsAndWhitespace =
sequence(bomsAndWhitespace,
p::discard(p::many(sequence(discardComment, bomsAndWhitespace))));
constexpr auto discardLineWhitespace =
p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny("\r\n").invert())));
......@@ -136,7 +143,7 @@ constexpr auto docComment = p::optional(p::sequence(
} // namespace
Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporterParam)
Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporter)
: orphanage(orphanageParam) {
// Note that because passing an lvalue to a parser constructor uses it by-reference, it's safe
......@@ -215,8 +222,16 @@ Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporterParam)
buildTokenSequenceList(
initTok(t, loc).initBracketedList(items.size()), kj::mv(items));
return t;
})
));
}),
p::transformOrReject(p::transformWithLocation(
p::oneOf(sequence(p::exactChar<'\xff'>(), p::exactChar<'\xfe'>()),
sequence(p::exactChar<'\xfe'>(), p::exactChar<'\xff'>()),
sequence(p::exactChar<'\x00'>())),
[this, &errorReporter](Location loc) -> kj::Maybe<Orphan<Token>> {
errorReporter.addError(loc.begin(), loc.end(),
"Non-UTF-8 input detected. Cap'n Proto schema files must be UTF-8 text.");
return nullptr;
}), [](kj::Maybe<Orphan<Token>> param) { return param; })));
parsers.tokenSequence = arena.copy(p::sequence(
commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace))));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment