Commit ce4162e8 authored by Kenton Varda's avatar Kenton Varda

Refactor lexer using arena.

parent 8e1d1e53
......@@ -28,12 +28,62 @@
namespace capnp {
namespace compiler {
bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result) {
Lexer lexer(Orphanage::getForMessageContaining(result));
Lexer::ParserInput parserInput(input.begin(), input.end());
kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput =
lexer.getParsers().statementSequence(parserInput);
if (!parserInput.atEnd()) {
return false;
}
KJ_IF_MAYBE(output, parseOutput) {
auto l = result.initStatements(output->size());
for (uint i = 0; i < output->size(); i++) {
l[i].adoptStatement(kj::mv((*output)[i]));
}
return true;
} else {
return false;
}
}
bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result) {
Lexer lexer(Orphanage::getForMessageContaining(result));
Lexer::ParserInput parserInput(input.begin(), input.end());
kj::Maybe<kj::Array<Orphan<Token>>> parseOutput =
lexer.getParsers().tokenSequence(parserInput);
if (!parserInput.atEnd()) {
return false;
}
KJ_IF_MAYBE(output, parseOutput) {
auto l = result.initTokens(output->size());
for (uint i = 0; i < output->size(); i++) {
l[i].adoptToken(kj::mv((*output)[i]));
}
return true;
} else {
return false;
}
}
namespace p = kj::parse;
namespace {
typedef p::IteratorInput<char, const char*> Input;
typedef p::Span<const char*> Location;
typedef p::Span<uint32_t> Location;
Token::Body::Builder initTok(Orphan<Token>& t, const Location& loc) {
auto tb = t.get();
tb.setStartByte(loc.begin());
tb.setEndByte(loc.end());
return tb.getBody();
}
void buildTokenSequenceList(List<List<TokenPointer>>::Builder builder,
kj::Array<kj::Array<Orphan<Token>>>&& items) {
......@@ -90,35 +140,13 @@ constexpr auto docComment = sequence(
} // namespace
bool lex(kj::ArrayPtr<const char> input,
LexedStatements::Builder* resultStatements,
LexedTokens::Builder* resultTokens) {
// This is a bit hacky. Since the transformations applied by our parser require access to an
// Orphanage in order to build objects, we construct the parsers as local variables. This means
// that all the parsers need to live in a single function scope. In order to handle both tokens
// and statements, we have the function take `resultStatements` and `resultTokens` and parse
// into whichever one is non-null.
//
// TODO(someday): Perhaps there should be a utility class called ParserPool which has a method
// that takes a parser, allocates a copy of it within some arena, then returns a ParserRef
// referencing that copy. Then there could be a Lexer class which contains a ParserPool and
// builds all its parsers in its constructor. This would allow the class to directly expose
// the parsers so that they can be used within other parser combinators.
Orphanage orphanage = resultStatements == nullptr ?
Orphanage::getForMessageContaining(*resultTokens) :
Orphanage::getForMessageContaining(*resultStatements);
auto initTok = [&](Orphan<Token>& t, const Location& loc) -> Token::Body::Builder {
auto tb = t.get();
tb.setStartByte(loc.begin() - input.begin());
tb.setEndByte(loc.end() - input.begin());
return tb.getBody();
};
p::ParserRef<Input, kj::Array<Orphan<Token>>> tokenSequence;
auto commaDelimitedList = transform(
Lexer::Lexer(Orphanage orphanage): orphanage(orphanage) {
// Note that because passing an lvalue to a parser constructor uses it by-referencee, it's safe
// for us to use parsers.tokenSequence even though we haven't yet constructed it.
auto& tokenSequence = parsers.tokenSequence;
auto& commaDelimitedList = arena.copy(p::transform(
p::sequence(tokenSequence, p::many(p::sequence(p::exactChar<','>(), tokenSequence))),
[&](kj::Array<Orphan<Token>>&& first, kj::Array<kj::Array<Orphan<Token>>>&& rest)
-> kj::Array<kj::Array<Orphan<Token>>> {
......@@ -133,9 +161,9 @@ bool lex(kj::ArrayPtr<const char> input,
}
return result.finish();
}
});
}));
auto token = p::oneOf(
auto& token = arena.copy(p::oneOf(
p::transformWithLocation(p::identifier,
[&](Location loc, kj::String name) -> Orphan<Token> {
auto t = orphanage.newOrphan<Token>();
......@@ -183,94 +211,54 @@ bool lex(kj::ArrayPtr<const char> input,
initTok(t, loc).initBracketedList(items.size()), kj::mv(items));
return t;
})
);
auto tokenSequence_ =
sequence(commentsAndWhitespace, many(sequence(token, commentsAndWhitespace)));
tokenSequence = tokenSequence_;
if (resultStatements == nullptr) {
// Only a token sequence is requested.
Input parserInput(input.begin(), input.end());
kj::Maybe<kj::Array<Orphan<Token>>> parseOutput = tokenSequence(parserInput);
if (!parserInput.atEnd()) {
return false;
}
));
parsers.tokenSequence = arena.copy(p::sequence(
commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace))));
KJ_IF_MAYBE(output, parseOutput) {
auto l = resultTokens->initTokens(output->size());
for (uint i = 0; i < output->size(); i++) {
l[i].adoptToken(kj::mv((*output)[i]));
}
return true;
} else {
return false;
}
} else {
p::ParserRef<Input, kj::Array<Orphan<Statement>>> statementSequence;
auto statementEnd = p::oneOf(
transform(p::sequence(p::exactChar<';'>(), docComment),
[&](kj::Array<kj::String>&& comment) -> Orphan<Statement> {
auto result = orphanage.newOrphan<Statement>();
auto builder = result.get();
attachDocComment(builder, kj::mv(comment));
builder.getBlock().setNone();
return result;
}),
transform(
p::sequence(p::exactChar<'{'>(), docComment, statementSequence, p::exactChar<'}'>()),
[&](kj::Array<kj::String>&& comment, kj::Array<Orphan<Statement>>&& statements)
-> Orphan<Statement> {
auto result = orphanage.newOrphan<Statement>();
auto builder = result.get();
attachDocComment(builder, kj::mv(comment));
auto list = builder.getBlock().initStatements(statements.size());
for (uint i = 0; i < statements.size(); i++) {
list[i].adoptStatement(kj::mv(statements[i]));
}
return result;
})
);
auto statement = p::transform(p::sequence(tokenSequence, statementEnd),
[&](kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) {
auto tokensBuilder = statement.get().initTokens(tokens.size());
for (uint i = 0; i < tokens.size(); i++) {
tokensBuilder[i].adoptToken(kj::mv(tokens[i]));
}
return kj::mv(statement);
});
auto& statementSequence = parsers.statementSequence;
auto statementSequence_ =
sequence(commentsAndWhitespace, many(sequence(statement, commentsAndWhitespace)));
statementSequence = statementSequence_;
auto& statementEnd = arena.copy(p::oneOf(
transform(p::sequence(p::exactChar<';'>(), docComment),
[&](kj::Array<kj::String>&& comment) -> Orphan<Statement> {
auto result = orphanage.newOrphan<Statement>();
auto builder = result.get();
attachDocComment(builder, kj::mv(comment));
builder.getBlock().setNone();
return result;
}),
transform(
p::sequence(p::exactChar<'{'>(), docComment, statementSequence, p::exactChar<'}'>()),
[&](kj::Array<kj::String>&& comment, kj::Array<Orphan<Statement>>&& statements)
-> Orphan<Statement> {
auto result = orphanage.newOrphan<Statement>();
auto builder = result.get();
attachDocComment(builder, kj::mv(comment));
auto list = builder.getBlock().initStatements(statements.size());
for (uint i = 0; i < statements.size(); i++) {
list[i].adoptStatement(kj::mv(statements[i]));
}
return result;
})
));
Input parserInput(input.begin(), input.end());
kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput = statementSequence(parserInput);
auto& statement = arena.copy(p::transform(p::sequence(tokenSequence, statementEnd),
[&](kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) {
auto tokensBuilder = statement.get().initTokens(tokens.size());
for (uint i = 0; i < tokens.size(); i++) {
tokensBuilder[i].adoptToken(kj::mv(tokens[i]));
}
return kj::mv(statement);
}));
if (!parserInput.atEnd()) {
return false;
}
parsers.statementSequence = arena.copy(sequence(
commentsAndWhitespace, many(sequence(statement, commentsAndWhitespace))));
KJ_IF_MAYBE(output, parseOutput) {
auto l = resultStatements->initStatements(output->size());
for (uint i = 0; i < output->size(); i++) {
l[i].adoptStatement(kj::mv((*output)[i]));
}
return true;
} else {
return false;
}
}
parsers.token = token;
parsers.statement = statement;
parsers.emptySpace = commentsAndWhitespace;
}
bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result) {
return lex(kj::mv(input), &result, nullptr);
}
bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result) {
return lex(kj::mv(input), nullptr, &result);
}
Lexer::~Lexer() {}
} // namespace compiler
} // namespace capnp
......@@ -25,6 +25,8 @@
#define CAPNP_COMPILER_LEXER_H_
#include "lexer.capnp.h"
#include <kj/parse/common.h>
#include <kj/arena.h>
namespace capnp {
namespace compiler {
......@@ -39,6 +41,54 @@ bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result);
// that might form a part of one statement. In other words, in the later case, the input should
// not contain semicolons or curly braces, unless they are in string literals of course.
class Lexer {
// Advanced lexer interface. This interface exposes the inner parsers so that you can embed them
// into your own parsers.
public:
Lexer(Orphanage orphanage);
// `orphanage` is used to allocate Cap'n Proto message objects in the result. `inputStart` is
// a pointer to the beginning of the input, used to compute byte offsets.
~Lexer();
class ParserInput: public kj::parse::IteratorInput<char, const char*> {
// Like IteratorInput<char, const char*> except that positions are measured as byte offsets
// rather than pointers.
public:
ParserInput(const char* begin, const char* end)
: IteratorInput<char, const char*>(begin, end), begin(begin) {}
explicit ParserInput(ParserInput& parent)
: IteratorInput<char, const char*>(parent), begin(parent.begin) {}
inline uint32_t getPosition() {
return IteratorInput<char, const char*>::getPosition() - begin;
}
private:
const char* begin;
};
template <typename Output>
using Parser = kj::parse::ParserRef<ParserInput, Output>;
struct Parsers {
Parser<kj::Tuple<>> emptySpace;
Parser<Orphan<Token>> token;
Parser<kj::Array<Orphan<Token>>> tokenSequence;
Parser<Orphan<Statement>> statement;
Parser<kj::Array<Orphan<Statement>>> statementSequence;
};
const Parsers& getParsers() { return parsers; }
private:
Orphanage orphanage;
kj::Arena arena;
Parsers parsers;
};
} // namespace compiler
} // namespace capnp
......
......@@ -67,6 +67,11 @@ public:
// the need for the Arena itself to keep track of destructors to call later, which may make it
// slightly more efficient.
template <typename T>
inline T& copy(T&& value) { return allocate<Decay<T>>(kj::fwd<T>(value)); }
// Allocate a copy of the given value in the arena. This is just a shortcut for calling the
// type's copy (or move) constructor.
StringPtr copyString(StringPtr content);
// Make a copy of the given string inside the arena, and return a pointer to the copy.
......
......@@ -54,7 +54,7 @@ class IteratorInput {
public:
IteratorInput(Iterator begin, Iterator end)
: parent(nullptr), pos(begin), end(end), best(begin) {}
IteratorInput(IteratorInput& parent)
explicit IteratorInput(IteratorInput& parent)
: parent(&parent), pos(parent.pos), end(parent.end), best(parent.pos) {}
~IteratorInput() {
if (parent != nullptr) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment