lexer.c++ 11.8 KB
Newer Older
Kenton Varda's avatar
Kenton Varda committed
1 2
// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
21 22 23 24 25 26 27 28

#include "lexer.h"
#include <kj/parse/char.h>
#include <kj/debug.h>

namespace capnp {
namespace compiler {

29
namespace p = kj::parse;
30

31
bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result,
32
         ErrorReporter& errorReporter) {
33
  Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter);
34

35 36 37 38
  auto parser = p::sequence(lexer.getParsers().statementSequence, p::endOfInput);

  Lexer::ParserInput parserInput(input.begin(), input.end());
  kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput = parser(parserInput);
39 40 41 42

  KJ_IF_MAYBE(output, parseOutput) {
    auto l = result.initStatements(output->size());
    for (uint i = 0; i < output->size(); i++) {
43
      l.adoptWithCaveats(i, kj::mv((*output)[i]));
44 45 46
    }
    return true;
  } else {
47 48
    uint32_t best = parserInput.getBest();
    errorReporter.addError(best, best, kj::str("Parse error."));
49 50 51 52
    return false;
  }
}

53
bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result,
54
         ErrorReporter& errorReporter) {
55
  Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter);
56

57
  auto parser = p::sequence(lexer.getParsers().tokenSequence, p::endOfInput);
58

59 60
  Lexer::ParserInput parserInput(input.begin(), input.end());
  kj::Maybe<kj::Array<Orphan<Token>>> parseOutput = parser(parserInput);
61 62 63 64

  KJ_IF_MAYBE(output, parseOutput) {
    auto l = result.initTokens(output->size());
    for (uint i = 0; i < output->size(); i++) {
65
      l.adoptWithCaveats(i, kj::mv((*output)[i]));
66 67 68
    }
    return true;
  } else {
69 70
    uint32_t best = parserInput.getBest();
    errorReporter.addError(best, best, kj::str("Parse error."));
71 72 73 74
    return false;
  }
}

75 76
namespace {

77 78
using Location = p::Span<uint32_t>;
// Source range passed to the token and statement callbacks; its begin() and end() are stored as
// start/end byte offsets.

79 80 81 82 83
Token::Builder initTok(Orphan<Token>& t, const Location& loc) {
  // Records `loc` as the start/end byte range of the token held by `t` and returns the token's
  // builder so that the caller can go on to set the token's content.

  auto builder = t.get();
  builder.setStartByte(loc.begin());
  builder.setEndByte(loc.end());
  return builder;
}
85

86
void buildTokenSequenceList(List<List<Token>>::Builder builder,
87 88 89 90 91
                            kj::Array<kj::Array<Orphan<Token>>>&& items) {
  for (uint i = 0; i < items.size(); i++) {
    auto& item = items[i];
    auto itemBuilder = builder.init(i, item.size());
    for (uint j = 0; j < item.size(); j++) {
92
      itemBuilder.adoptWithCaveats(j, kj::mv(item[j]));
93 94 95 96 97 98 99 100 101
    }
  }
}

void attachDocComment(Statement::Builder statement, kj::Array<kj::String>&& comment) {
  // Stores `comment` as the doc comment of `statement`.  The lines are joined into a single text
  // value in which every line, including the last one, is followed by '\n'.

  // First pass: add up the exact length of the joined text.
  size_t size = 0;
  for (auto& line: comment) {
    size += line.size() + 1;  // include newline
  }

  // Second pass: copy each line into place, followed by its newline.
  Text::Builder builder = statement.initDocComment(size);
  char* pos = builder.begin();
  for (auto& line: comment) {
    memcpy(pos, line.begin(), line.size());
    pos += line.size();
    *pos++ = '\n';
  }

  // Both passes must agree on the length.
  KJ_ASSERT(pos == builder.end());
}

constexpr auto discardComment =
    p::sequence(p::exactChar<'#'>(),
                p::discard(p::many(p::discard(p::anyOfChars("\n").invert()))),
                p::oneOf(p::exactChar<'\n'>(), p::endOfInput));
// Matches a '#' comment running to the next '\n' (or to the end of the input) and discards its
// text.

constexpr auto saveComment =
    p::sequence(p::exactChar<'#'>(), p::discard(p::optional(p::exactChar<' '>())),
                p::charsToString(p::many(p::anyOfChars("\n").invert())),
                p::oneOf(p::exactChar<'\n'>(), p::endOfInput));
// Like discardComment, but produces the comment's text as a string, without the leading '#' and
// without the single space that may follow it.

constexpr auto utf8Bom =
    p::sequence(p::exactChar<'\xef'>(), p::exactChar<'\xbb'>(), p::exactChar<'\xbf'>());
// Matches the three-byte sequence EF BB BF, i.e. the UTF-8 encoding of the byte order mark.

constexpr auto bomsAndWhitespace =
    p::sequence(p::discardWhitespace,
                p::discard(p::many(p::sequence(utf8Bom, p::discardWhitespace))));
// Skips any run of whitespace and UTF-8 BOMs.

constexpr auto commentsAndWhitespace =
    p::sequence(bomsAndWhitespace,
                p::discard(p::many(p::sequence(discardComment, bomsAndWhitespace))));
// Skips any run of whitespace, UTF-8 BOMs and '#' comments.

constexpr auto discardLineWhitespace =
    p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny("\r\n").invert())));
// Skips whitespace characters other than '\r' and '\n', i.e. never moves past the end of the
// current line.

constexpr auto newline = p::oneOf(
    p::exactChar<'\n'>(),
    p::sequence(p::exactChar<'\r'>(), p::discard(p::optional(p::exactChar<'\n'>()))));
// Matches a single line ending: "\n", "\r\n", or a lone "\r".

constexpr auto docComment = p::optional(p::sequence(
    discardLineWhitespace,
    p::discard(p::optional(newline)),
    p::oneOrMore(p::sequence(discardLineWhitespace, saveComment))));
// Parses a set of comment lines preceded by at most one newline and with no intervening blank
// lines.

}  // namespace

146
Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporter)
    : orphanage(orphanageParam) {
  // Builds the token and statement parsers and publishes them through `parsers`.
  //
  // Tokens and statements produced by the parsers are allocated from `orphanage`.  The parser
  // callbacks capture `this` and `errorReporter` by reference, so the parsers must not be used
  // after either of them has been destroyed.

  // Note that because passing an lvalue to a parser constructor uses it by-reference, it's safe
  // for us to use parsers.tokenSequence even though we haven't yet constructed it.
  auto& tokenSequence = parsers.tokenSequence;

  // A list of token sequences separated by commas; used for the contents of (...) and [...].
  auto& commaDelimitedList = arena.copy(p::transform(
      p::sequence(tokenSequence, p::many(p::sequence(p::exactChar<','>(), tokenSequence))),
      [](kj::Array<Orphan<Token>>&& first, kj::Array<kj::Array<Orphan<Token>>>&& rest)
          -> kj::Array<kj::Array<Orphan<Token>>> {
        if (first == nullptr && rest == nullptr) {
          // Completely empty list.
          return nullptr;
        } else {
          uint restSize = rest.size();
          if (restSize > 0 && rest[restSize - 1] == nullptr) {
            // Allow for trailing commas by shortening the list by one item if the final item
            // is empty (compares equal to nullptr).
            restSize--;
          }
          // Room for `first` plus the retained part of `rest`.
          auto result = kj::heapArrayBuilder<kj::Array<Orphan<Token>>>(1 + restSize);
          result.add(kj::mv(first));
          for (uint i = 0; i < restSize; i++) {
            result.add(kj::mv(rest[i]));
          }
          return result.finish();
        }
      }));

  // A single token.  Every alternative allocates a fresh Token orphan, stamps it with its source
  // range via initTok() and then fills in the variant that was matched.
  auto& token = arena.copy(p::oneOf(
      p::transformWithLocation(p::identifier,
          [this](Location loc, kj::String name) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setIdentifier(name);
            return t;
          }),
      p::transformWithLocation(p::doubleQuotedString,
          [this](Location loc, kj::String text) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setStringLiteral(text);
            return t;
          }),
      p::transformWithLocation(p::doubleQuotedHexBinary,
          [this](Location loc, kj::Array<byte> data) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setBinaryLiteral(data);
            return t;
          }),
      p::transformWithLocation(p::integer,
          [this](Location loc, uint64_t i) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setIntegerLiteral(i);
            return t;
          }),
      p::transformWithLocation(p::number,
          [this](Location loc, double x) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setFloatLiteral(x);
            return t;
          }),
      // An operator is any non-empty run of these punctuation characters.
      p::transformWithLocation(
          p::charsToString(p::oneOrMore(p::anyOfChars("!$%&*+-./:<=>?@^|~"))),
          [this](Location loc, kj::String text) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            initTok(t, loc).setOperator(text);
            return t;
          }),
      p::transformWithLocation(
          p::sequence(p::exactChar<'('>(), commaDelimitedList, p::exactChar<')'>()),
          [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            buildTokenSequenceList(
                initTok(t, loc).initParenthesizedList(items.size()), kj::mv(items));
            return t;
          }),
      p::transformWithLocation(
          p::sequence(p::exactChar<'['>(), commaDelimitedList, p::exactChar<']'>()),
          [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> {
            auto t = orphanage.newOrphan<Token>();
            buildTokenSequenceList(
                initTok(t, loc).initBracketedList(items.size()), kj::mv(items));
            return t;
          }),
      // Bytes that never occur in valid UTF-8 text input: FF FE or FE FF (the UTF-16 byte order
      // marks) or a NUL byte.  The callback reports the problem and always returns nullptr, so
      // (assuming transformOrReject() treats a null Maybe as a rejection, as its name indicates)
      // this alternative never produces a token.
      p::transformOrReject(p::transformWithLocation(
          p::oneOf(p::sequence(p::exactChar<'\xff'>(), p::exactChar<'\xfe'>()),
                   p::sequence(p::exactChar<'\xfe'>(), p::exactChar<'\xff'>()),
                   p::sequence(p::exactChar<'\x00'>())),
          [&errorReporter](Location loc) -> kj::Maybe<Orphan<Token>> {
            errorReporter.addError(loc.begin(), loc.end(),
                "Non-UTF-8 input detected. Cap'n Proto schema files must be UTF-8 text.");
            return nullptr;
          }), [](kj::Maybe<Orphan<Token>> param) { return param; })));

  // Tokens separated (and surrounded) by optional comments and whitespace.
  parsers.tokenSequence = arena.copy(p::sequence(
      commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace))));

  // As with tokenSequence above, statementSequence is referenced before it is assigned.
  auto& statementSequence = parsers.statementSequence;

  // The terminator of a statement: either ';' or a '{ ... }' block of nested statements.  Both
  // forms create the Statement orphan which the `statement` parser below completes.
  auto& statementEnd = arena.copy(p::oneOf(
      p::transform(p::sequence(p::exactChar<';'>(), docComment),
          [this](kj::Maybe<kj::Array<kj::String>>&& comment) -> Orphan<Statement> {
            auto result = orphanage.newOrphan<Statement>();
            auto builder = result.get();
            KJ_IF_MAYBE(c, comment) {
              attachDocComment(builder, kj::mv(*c));
            }
            // Terminated by ';': a plain line statement rather than a block.
            builder.setLine();
            return result;
          }),
      p::transform(
          p::sequence(p::exactChar<'{'>(), docComment, statementSequence, p::exactChar<'}'>(),
                      docComment),
          [this](kj::Maybe<kj::Array<kj::String>>&& comment,
                 kj::Array<Orphan<Statement>>&& statements,
                 kj::Maybe<kj::Array<kj::String>>&& lateComment)
              -> Orphan<Statement> {
            auto result = orphanage.newOrphan<Statement>();
            auto builder = result.get();
            // A doc comment right after '{' takes precedence; one after '}' is only used when
            // the former is absent.
            KJ_IF_MAYBE(c, comment) {
              attachDocComment(builder, kj::mv(*c));
            } else KJ_IF_MAYBE(c, lateComment) {
              attachDocComment(builder, kj::mv(*c));
            }
            auto list = builder.initBlock(statements.size());
            for (uint i = 0; i < statements.size(); i++) {
              list.adoptWithCaveats(i, kj::mv(statements[i]));
            }
            return result;
          })
      ));

  // A complete statement: its tokens followed by a statement end.  The tokens and the source
  // range are written into the Statement that statementEnd created.
  auto& statement = arena.copy(p::transformWithLocation(p::sequence(tokenSequence, statementEnd),
      [](Location loc, kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) {
        auto builder = statement.get();
        auto tokensBuilder = builder.initTokens(tokens.size());
        for (uint i = 0; i < tokens.size(); i++) {
          tokensBuilder.adoptWithCaveats(i, kj::mv(tokens[i]));
        }
        builder.setStartByte(loc.begin());
        builder.setEndByte(loc.end());
        return kj::mv(statement);
      }));

  // Statements separated (and surrounded) by optional comments and whitespace.
  parsers.statementSequence = arena.copy(p::sequence(
      commentsAndWhitespace, p::many(p::sequence(statement, commentsAndWhitespace))));

  parsers.token = token;
  parsers.statement = statement;
  parsers.emptySpace = commentsAndWhitespace;
}

297
Lexer::~Lexer() noexcept(false) {}
// Empty destructor, explicitly declared noexcept(false), i.e. allowed to propagate exceptions.
// NOTE(review): presumably this follows the KJ convention for destructors -- confirm against the
// KJ style guide.
298

299 300
}  // namespace compiler
}  // namespace capnp