Commit 51bb7631 authored by miloyip's avatar miloyip

Refactor regex to use DecodedStream, which provides one look-ahead character

parent 0dffe875
...@@ -60,8 +60,9 @@ public: ...@@ -60,8 +60,9 @@ public:
typedef typename Encoding::Ch Ch; typedef typename Encoding::Ch Ch;
GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(),rangeCount_() { GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(),rangeCount_() {
StringStream is(source); StringStream ss(source);
Parse(is); DecodedStream<StringStream> ds(ss);
Parse(ds);
} }
~GenericRegex() { ~GenericRegex() {
...@@ -74,6 +75,8 @@ public: ...@@ -74,6 +75,8 @@ public:
template <typename InputStream> template <typename InputStream>
bool Match(InputStream& is) const { bool Match(InputStream& is) const {
RAPIDJSON_ASSERT(IsValid()); RAPIDJSON_ASSERT(IsValid());
DecodedStream<InputStream> ds(is);
Allocator allocator; Allocator allocator;
Stack<Allocator> state0(&allocator, stateCount_ * sizeof(SizeType)); Stack<Allocator> state0(&allocator, stateCount_ * sizeof(SizeType));
Stack<Allocator> state1(&allocator, stateCount_ * sizeof(SizeType)); Stack<Allocator> state1(&allocator, stateCount_ * sizeof(SizeType));
...@@ -85,7 +88,7 @@ public: ...@@ -85,7 +88,7 @@ public:
AddState(stateSet, *current, root_); AddState(stateSet, *current, root_);
unsigned codepoint; unsigned codepoint;
while (!current->Empty() && Encoding::Decode(is, &codepoint) && codepoint != 0) { while (!current->Empty() && (codepoint = ds.Take()) != 0) {
std::memset(stateSet, 0, stateSetSize); std::memset(stateSet, 0, stateSetSize);
next->Clear(); next->Clear();
for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) { for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
...@@ -149,6 +152,23 @@ private: ...@@ -149,6 +152,23 @@ private:
SizeType out; //!< link-list of all output states SizeType out; //!< link-list of all output states
}; };
//! Wraps a source stream and exposes it as a sequence of decoded code points with one look-ahead.
/*!
    The next code point is always decoded ahead of time, so Peek() can inspect it
    without consuming it. A code point of 0 marks the end of the input; a failure
    reported by Encoding::Decode() is mapped to 0 as well.

    \tparam SourceStream Type of the underlying stream handed to Encoding::Decode().
    \note Only a reference to the source stream is stored; the source stream must outlive this object.
*/
template <typename SourceStream>
class DecodedStream {
public:
    //! Binds to the source stream and decodes the first code point as look-ahead.
    DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }

    //! Returns the look-ahead code point without consuming it (0 at end of input).
    unsigned Peek() const { return codepoint_; }

    //! Returns the look-ahead code point and advances to the next one.
    /*!
        Once 0 has been reached, the stream stays at 0 and the source stream is not
        touched again; decoding further would read past the end of the input.
    */
    unsigned Take() {
        unsigned c = codepoint_;
        if (c) // No further decoding after the terminator or a decode failure
            Decode();
        return c;
    }

private:
    //! Decodes the next code point into codepoint_, storing 0 when decoding fails.
    void Decode() {
        if (!Encoding::Decode(ss_, &codepoint_))
            codepoint_ = 0;
    }

    SourceStream& ss_;      //!< underlying (encoded) source stream
    unsigned codepoint_;    //!< look-ahead code point; 0 means end of input
};
State& GetState(SizeType index) { State& GetState(SizeType index) {
RAPIDJSON_ASSERT(index < stateCount_); RAPIDJSON_ASSERT(index < stateCount_);
return states_.template Bottom<State>()[index]; return states_.template Bottom<State>()[index];
...@@ -196,7 +216,7 @@ private: ...@@ -196,7 +216,7 @@ private:
} }
template <typename InputStream> template <typename InputStream>
void Parse(InputStream& is) { void Parse(DecodedStream<InputStream>& ds) {
Allocator allocator; Allocator allocator;
Stack<Allocator> operandStack(&allocator, 256); // Frag Stack<Allocator> operandStack(&allocator, 256); // Frag
Stack<Allocator> operatorStack(&allocator, 256); // Operator Stack<Allocator> operatorStack(&allocator, 256); // Operator
...@@ -205,8 +225,8 @@ private: ...@@ -205,8 +225,8 @@ private:
*atomCountStack.template Push<unsigned>() = 0; *atomCountStack.template Push<unsigned>() = 0;
unsigned codepoint; unsigned codepoint;
while (Encoding::Decode(is, &codepoint) && codepoint != 0) { while (ds.Peek() != 0) {
switch (codepoint) { switch (codepoint = ds.Take()) {
case '|': case '|':
while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation) while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1))) if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
...@@ -254,7 +274,7 @@ private: ...@@ -254,7 +274,7 @@ private:
case '[': case '[':
{ {
SizeType range; SizeType range;
if (!ParseRange(is, &range)) if (!ParseRange(ds, &range))
return; return;
SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass); SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
GetState(s).rangeStart = range; GetState(s).rangeStart = range;
...@@ -264,9 +284,7 @@ private: ...@@ -264,9 +284,7 @@ private:
break; break;
case '\\': // Escape character case '\\': // Escape character
if (!Encoding::Decode(is, &codepoint) || codepoint == 0) if (!CharacterEscape(ds, &codepoint))
return; // Expect an escape character
if (!CharacterEscape(codepoint, &codepoint))
return; // Unsupported escape character return; // Unsupported escape character
// fall through to default // fall through to default
...@@ -389,14 +407,14 @@ private: ...@@ -389,14 +407,14 @@ private:
} }
template <typename InputStream> template <typename InputStream>
bool ParseRange(InputStream& is, SizeType* range) { bool ParseRange(DecodedStream<InputStream>& ds, SizeType* range) {
bool isBegin = true; bool isBegin = true;
bool negate = false; bool negate = false;
int step = 0; int step = 0;
SizeType start = kRegexInvalidRange; SizeType start = kRegexInvalidRange;
SizeType current = kRegexInvalidRange; SizeType current = kRegexInvalidRange;
unsigned codepoint; unsigned codepoint;
while (Encoding::Decode(is, &codepoint) && codepoint != 0) { while ((codepoint = ds.Take()) != 0) {
if (isBegin) { if (isBegin) {
isBegin = false; isBegin = false;
if (codepoint == '^') { if (codepoint == '^') {
...@@ -418,11 +436,11 @@ private: ...@@ -418,11 +436,11 @@ private:
return true; return true;
case '\\': case '\\':
if (!Encoding::Decode(is, &codepoint) || codepoint == 0) if (ds.Peek() == 'b') {
return false; // Expect an escape character ds.Take();
if (codepoint == 'b')
codepoint = 0x0008; // Escape backspace character codepoint = 0x0008; // Escape backspace character
else if (!CharacterEscape(codepoint, &codepoint)) }
else if (!CharacterEscape(ds, &codepoint))
return false; return false;
// fall through to default // fall through to default
...@@ -464,8 +482,10 @@ private: ...@@ -464,8 +482,10 @@ private:
return rangeCount_++; return rangeCount_++;
} }
bool CharacterEscape(unsigned codepoint, unsigned* escapedCodepoint) { template <typename InputStream>
switch (codepoint) { bool CharacterEscape(DecodedStream<InputStream>& ds, unsigned* escapedCodepoint) {
unsigned codepoint;
switch (codepoint = ds.Take()) {
case '|': case '|':
case '(': case '(':
case ')': case ')':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment