Commit 05c79891 authored by miloyip

Add parenthesis support in regex

parent 0bef29a5
...@@ -90,6 +90,12 @@ public: ...@@ -90,6 +90,12 @@ public:
} }
private: private:
//! Operators that Parse() keeps on its operator stack.
/*! The enumerators are listed in ascending numeric order. Parse() compares the
    top of the operator stack against kAlternation with '<' before pushing an
    alternation, so kConcatenation must stay numerically smaller than
    kAlternation. */
enum Operator {
    kConcatenation = 0,     //!< Pushed by ImplicitConcatenation() between adjacent atoms
    kAlternation = 1,       //!< Pushed when '|' is read
    kLeftParenthesis = 2    //!< Pushed when '(' is read; Eval() returns false for it
};
struct State { struct State {
SizeType out; //!< Equals to kInvalid for match SizeType out; //!< Equals to kInvalid for match
SizeType out1; //!< Equals to non-kInvalid for split SizeType out1; //!< Equals to non-kInvalid for split
...@@ -155,52 +161,96 @@ private: ...@@ -155,52 +161,96 @@ private:
void Parse(InputStream& is) { void Parse(InputStream& is) {
Allocator allocator; Allocator allocator;
Stack<Allocator> operandStack(&allocator, 256); // Frag Stack<Allocator> operandStack(&allocator, 256); // Frag
Stack<Allocator> operatorStack(&allocator, 256); // char Stack<Allocator> operatorStack(&allocator, 256); // Operator
Stack<Allocator> atomCountStack(&allocator, 256); // unsigned (Atom per parenthesis)
*atomCountStack.template Push<unsigned>() = 0;
unsigned codepoint; unsigned codepoint;
bool previousOperand = false;
while (Encoding::Decode(is, &codepoint) && codepoint != 0) { while (Encoding::Decode(is, &codepoint) && codepoint != 0) {
switch (codepoint) { switch (codepoint) {
case '|': case '|':
*operatorStack.template Push<char>() = '|'; while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
previousOperand = false; if (!Eval(operandStack, operatorStack))
return;
*operatorStack.template Push<Operator>() = kAlternation;
*atomCountStack.template Top<unsigned>() = 0;
break;
case '(':
*operatorStack.template Push<Operator>() = kLeftParenthesis;
*atomCountStack.template Push<unsigned>() = 0;
break;
case ')':
while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
if (!Eval(operandStack, operatorStack))
return;
if (operatorStack.Empty())
return;
operatorStack.template Pop<Operator>(1);
atomCountStack.template Pop<unsigned>(1);
ImplicitConcatenation(atomCountStack, operatorStack);
break; break;
default: default:
SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
// concatenation with previous operand *operandStack.template Push<Frag>() = Frag(s, s);
if (previousOperand) { ImplicitConcatenation(atomCountStack, operatorStack);
Frag* e = operandStack.template Top<Frag>();
Patch(e->out, s);
e->out = s;
}
else
*operandStack.template Push<Frag>() = Frag(s, s);
previousOperand = true;
} }
} }
while (!operatorStack.Empty()) { while (!operatorStack.Empty())
switch (*operatorStack.template Pop<char>(1)) { if (!Eval(operandStack, operatorStack))
case '|': return;
{
Frag e2 = *operandStack.template Pop<Frag>(1);
Frag e1 = *operandStack.template Pop<Frag>(1);
SizeType s = NewState(e1.start, e2.start, 0);
*operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out));
}
break;
}
}
// Link the operand to matching state. // Link the operand to matching state.
if (operandStack.GetSize() == sizeof(Frag)) { if (operandStack.GetSize() == sizeof(Frag)) {
Frag* e = operandStack.template Pop<Frag>(1); Frag* e = operandStack.template Pop<Frag>(1);
Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0)); Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
root_ = e->start; root_ = e->start;
// printf("root: %d\n", root_);
// for (SizeType i = 0; i < stateCount_ ; i++) {
// State& s = GetState(i);
// printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint);
// }
// printf("\n");
} }
} }
//! Pops the top operator and applies it to the two topmost fragments of the operand stack.
/*!
    \param operandStack Stack of Frag; on success the two topmost fragments are replaced by the combined one.
    \param operatorStack Stack of Operator; its top element is always popped.
    \return true if the operator was applied. false if the popped operator is
        neither kConcatenation nor kAlternation, or if fewer than two fragments
        are available on the operand stack.
*/
bool Eval(Stack<Allocator>& operandStack, Stack<Allocator>& operatorStack) {
    const Operator op = *operatorStack.template Pop<Operator>(1);

    // Only the two binary operators can be evaluated.
    if (op != kConcatenation && op != kAlternation)
        return false;

    // Both binary operators need a left and a right fragment.
    if (operandStack.GetSize() < sizeof(Frag) * 2)
        return false;

    // The right-hand fragment was pushed last, so it comes off first.
    Frag rhs = *operandStack.template Pop<Frag>(1);
    Frag lhs = *operandStack.template Pop<Frag>(1);

    if (op == kConcatenation) {
        // Route the dangling outputs of the left fragment into the right one.
        Patch(lhs.out, rhs.start);
        *operandStack.template Push<Frag>() = Frag(lhs.start, rhs.out);
    }
    else {
        // New state branching to both fragments; their output lists are merged.
        SizeType split = NewState(lhs.start, rhs.start, 0);
        *operandStack.template Push<Frag>() = Frag(split, Append(lhs.out, rhs.out));
    }
    return true;
}
//! Records one more atom at the current nesting level, inserting a concatenation operator if needed.
/*!
    \param atomCountStack Stack of unsigned; its top element is the atom count that gets incremented.
    \param operatorStack Stack of Operator; receives kConcatenation when the top atom count is already non-zero.
*/
void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
    const bool hasPrecedingAtom = *atomCountStack.template Top<unsigned>() != 0;
    if (hasPrecedingAtom)
        *operatorStack.template Push<Operator>() = kConcatenation;
    ++*atomCountStack.template Top<unsigned>();
}
Stack<Allocator> states_; Stack<Allocator> states_;
SizeType root_; SizeType root_;
SizeType stateCount_; SizeType stateCount_;
......
...@@ -19,6 +19,7 @@ using namespace rapidjson::internal; ...@@ -19,6 +19,7 @@ using namespace rapidjson::internal;
TEST(Regex, concatenation) { TEST(Regex, concatenation) {
Regex re("abc"); Regex re("abc");
ASSERT_TRUE(re.IsValid());
EXPECT_TRUE(re.Match("abc")); EXPECT_TRUE(re.Match("abc"));
EXPECT_FALSE(re.Match("")); EXPECT_FALSE(re.Match(""));
EXPECT_FALSE(re.Match("a")); EXPECT_FALSE(re.Match("a"));
...@@ -27,24 +28,59 @@ TEST(Regex, concatenation) { ...@@ -27,24 +28,59 @@ TEST(Regex, concatenation) {
EXPECT_FALSE(re.Match("abcd")); EXPECT_FALSE(re.Match("abcd"));
} }
TEST(Regex, split) { TEST(Regex, split1) {
{ Regex re("abab|abbb");
Regex re("abab|abbb"); ASSERT_TRUE(re.IsValid());
EXPECT_TRUE(re.Match("abab")); EXPECT_TRUE(re.Match("abab"));
EXPECT_TRUE(re.Match("abbb")); EXPECT_TRUE(re.Match("abbb"));
EXPECT_FALSE(re.Match("")); EXPECT_FALSE(re.Match(""));
EXPECT_FALSE(re.Match("ab")); EXPECT_FALSE(re.Match("ab"));
EXPECT_FALSE(re.Match("ababa")); EXPECT_FALSE(re.Match("ababa"));
EXPECT_FALSE(re.Match("abb")); EXPECT_FALSE(re.Match("abb"));
EXPECT_FALSE(re.Match("abbbb")); EXPECT_FALSE(re.Match("abbbb"));
} }
{
Regex re("a|b|c"); TEST(Regex, split2) {
EXPECT_TRUE(re.Match("a")); Regex re("a|b|c");
EXPECT_TRUE(re.Match("b")); ASSERT_TRUE(re.IsValid());
EXPECT_TRUE(re.Match("c")); EXPECT_TRUE(re.Match("a"));
EXPECT_FALSE(re.Match("")); EXPECT_TRUE(re.Match("b"));
EXPECT_FALSE(re.Match("aa")); EXPECT_TRUE(re.Match("c"));
EXPECT_FALSE(re.Match("ab")); EXPECT_FALSE(re.Match(""));
} EXPECT_FALSE(re.Match("aa"));
EXPECT_FALSE(re.Match("ab"));
}
// A parenthesized group followed by a literal: "(ab)c" must match exactly "abc".
TEST(Regex, parenthesis1) {
Regex re("(ab)c");
// Abort the test early if the pattern itself is not valid.
ASSERT_TRUE(re.IsValid());
EXPECT_TRUE(re.Match("abc"));
// Empty input, partial inputs and an input with a trailing extra character are rejected.
EXPECT_FALSE(re.Match(""));
EXPECT_FALSE(re.Match("a"));
EXPECT_FALSE(re.Match("b"));
EXPECT_FALSE(re.Match("ab"));
EXPECT_FALSE(re.Match("abcd"));
}
// A literal followed by a parenthesized group: "a(bc)" must match exactly "abc".
TEST(Regex, parenthesis2) {
Regex re("a(bc)");
// Abort the test early if the pattern itself is not valid.
ASSERT_TRUE(re.IsValid());
EXPECT_TRUE(re.Match("abc"));
// Empty input, partial inputs and an input with a trailing extra character are rejected.
EXPECT_FALSE(re.Match(""));
EXPECT_FALSE(re.Match("a"));
EXPECT_FALSE(re.Match("b"));
EXPECT_FALSE(re.Match("ab"));
EXPECT_FALSE(re.Match("abcd"));
}
TEST(Regex, parenthesis3) {
Regex re("(a|b)(c|d)");
ASSERT_TRUE(re.IsValid());
EXPECT_TRUE(re.Match("ac"));
EXPECT_TRUE(re.Match("ad"));
EXPECT_TRUE(re.Match("bc"));
EXPECT_TRUE(re.Match("bd"));
EXPECT_FALSE(re.Match(""));
EXPECT_FALSE(re.Match("ab"));
EXPECT_FALSE(re.Match("cd"));
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment