Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
R
rapidjson
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
rapidjson
Commits
1784afe5
Commit
1784afe5
authored
May 25, 2015
by
miloyip
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add character class to regex
parent
06853b89
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
241 additions
and
38 deletions
+241
-38
regex.h
include/rapidjson/internal/regex.h
+155
-38
regextest.cpp
test/unittest/regextest.cpp
+86
-0
No files found.
include/rapidjson/internal/regex.h
View file @
1784afe5
...
@@ -29,23 +29,30 @@ namespace internal {
...
@@ -29,23 +29,30 @@ namespace internal {
// GenericRegex
// GenericRegex
static
const
SizeType
kRegexInvalidState
=
~
SizeType
(
0
);
//!< Represents an invalid index in GenericRegex::State::out, out1
static
const
SizeType
kRegexInvalidState
=
~
SizeType
(
0
);
//!< Represents an invalid index in GenericRegex::State::out, out1
static const SizeType kRegexInvalidRange = ~SizeType(0);  //!< Represents "no range": terminates a Range::next chain and is the initial value of State::rangeStart
//! Regular expression engine.
//! Regular expression engine with a subset of ECMAScript grammar.
/*!
/*!
Supported regular expression syntax:
Supported regular expression syntax:
- \c ab Concatenation
- \c ab Concatenation
- \c a|b Alternation
- \c a|b Alternation
- \c a? Zero or one
- \c a? Zero or one
- \c a* Zero or more
- \c a* Zero or more
- \c a+ One or more
- \c a+ One or more
- \c (ab)* Parenthesis grouping
- \c (ab)* Grouping
- \c . Any character
- \c [abc] Character classes
- \c [a-c] Character class range
- \c [a-z0-9_] Character class combination
- \c [^abc] Negated character classes
- \c [^a-c] Negated character class range
*/
*/
template
<
typename
Encoding
,
typename
Allocator
=
CrtAllocator
>
template
<
typename
Encoding
,
typename
Allocator
=
CrtAllocator
>
class
GenericRegex
{
class
GenericRegex
{
public
:
public
:
typedef
typename
Encoding
::
Ch
Ch
;
typedef
typename
Encoding
::
Ch
Ch
;
GenericRegex
(
const
Ch
*
source
,
Allocator
*
allocator
=
0
)
:
states_
(
allocator
,
256
),
r
oot_
(
kRegexInvalidState
),
stat
eCount_
()
{
GenericRegex
(
const
Ch
*
source
,
Allocator
*
allocator
=
0
)
:
states_
(
allocator
,
256
),
r
anges_
(
allocator
,
256
),
root_
(
kRegexInvalidState
),
stateCount_
(),
rang
eCount_
()
{
StringStream
is
(
source
);
StringStream
is
(
source
);
Parse
(
is
);
Parse
(
is
);
}
}
...
@@ -76,8 +83,12 @@ public:
...
@@ -76,8 +83,12 @@ public:
next
->
Clear
();
next
->
Clear
();
for
(
const
SizeType
*
s
=
current
->
template
Bottom
<
SizeType
>
();
s
!=
current
->
template
End
<
SizeType
>
();
++
s
)
{
for
(
const
SizeType
*
s
=
current
->
template
Bottom
<
SizeType
>
();
s
!=
current
->
template
End
<
SizeType
>
();
++
s
)
{
const
State
&
sr
=
GetState
(
*
s
);
const
State
&
sr
=
GetState
(
*
s
);
if
(
sr
.
codepoint
==
kAnyCharacterClass
||
sr
.
codepoint
==
codepoint
)
if
(
sr
.
codepoint
==
codepoint
||
sr
.
codepoint
==
kAnyCharacterClass
||
(
sr
.
codepoint
==
kRangeCharacterClass
&&
MatchRange
(
sr
.
rangeStart
,
codepoint
)))
{
AddState
(
stateSet
,
*
next
,
sr
.
out
);
AddState
(
stateSet
,
*
next
,
sr
.
out
);
}
}
}
Stack
<
Allocator
>*
temp
=
current
;
Stack
<
Allocator
>*
temp
=
current
;
current
=
next
;
current
=
next
;
...
@@ -109,10 +120,19 @@ private:
...
@@ -109,10 +120,19 @@ private:
};
};
static
const
unsigned
kAnyCharacterClass
=
0xFFFFFFFF
;
//!< For '.'
static
const
unsigned
kAnyCharacterClass
=
0xFFFFFFFF
;
//!< For '.'
//! Value of State::codepoint marking a character-class state ('[...]'); such states are matched through MatchRange().
static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
//! Bit OR-ed into Range::start of the first range of a negated class ('[^...]').
static const unsigned kRangeNegationFlag = 0x80000000;
//! One inclusive codepoint interval of a character class; intervals of the same class are chained through \c next.
struct Range {
    unsigned start;  //!< Lower bound (inclusive). On the first range of a negated class, kRangeNegationFlag is also set in this field.
    unsigned end;    //!< Upper bound (inclusive).
    SizeType next;   //!< Index of the next range in the chain, or kRegexInvalidRange for the last one.
};
struct
State
{
struct
State
{
SizeType
out
;
//!< Equals to kInvalid for matching state
SizeType
out
;
//!< Equals to kInvalid for matching state
SizeType
out1
;
//!< Equals to non-kInvalid for split
SizeType
out1
;
//!< Equals to non-kInvalid for split
SizeType
rangeStart
;
unsigned
codepoint
;
unsigned
codepoint
;
};
};
...
@@ -132,6 +152,16 @@ private:
...
@@ -132,6 +152,16 @@ private:
return
states_
.
template
Bottom
<
State
>
()[
index
];
return
states_
.
template
Bottom
<
State
>
()[
index
];
}
}
//! Returns a mutable reference to the range stored at \c index in ranges_.
/*! Asserts that \c index is smaller than rangeCount_. */
Range& GetRange(SizeType index) {
    RAPIDJSON_ASSERT(index < rangeCount_);
    Range& entry = ranges_.template Bottom<Range>()[index];
    return entry;
}
//! Returns a read-only reference to the range stored at \c index in ranges_.
/*! Asserts that \c index is smaller than rangeCount_. */
const Range& GetRange(SizeType index) const {
    RAPIDJSON_ASSERT(index < rangeCount_);
    const Range& entry = ranges_.template Bottom<Range>()[index];
    return entry;
}
void
AddState
(
unsigned
*
stateSet
,
Stack
<
Allocator
>&
l
,
SizeType
index
)
const
{
void
AddState
(
unsigned
*
stateSet
,
Stack
<
Allocator
>&
l
,
SizeType
index
)
const
{
if
(
index
==
kRegexInvalidState
)
if
(
index
==
kRegexInvalidState
)
return
;
return
;
...
@@ -147,34 +177,17 @@ private:
...
@@ -147,34 +177,17 @@ private:
}
}
}
}
SizeType
NewState
(
SizeType
out
,
SizeType
out1
,
unsigned
codepoint
)
{
//! Tests a codepoint against the character class whose first range is \c rangeIndex.
/*!
    Walks the chain of Range entries linked through Range::next.
    The class is negated when kRangeNegationFlag is set in the first range's \c start.

    \param rangeIndex Index of the first range of the class (must be a valid index for GetRange()).
    \param codepoint  Codepoint to test.
    \return For a normal class, true if some range contains the codepoint;
            for a negated class, true if no range contains it.
*/
bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
    const bool negated = (GetRange(rangeIndex).start & kRangeNegationFlag) != 0;
    for (SizeType i = rangeIndex; i != kRegexInvalidRange; ) {
        const Range& r = GetRange(i);
        // Strip the negation bit before comparing; it shares storage with the lower bound.
        const unsigned lower = r.start & ~kRangeNegationFlag;
        if (lower <= codepoint && codepoint <= r.end)
            return !negated;
        i = r.next;
    }
    return negated;
}
void
PushOperand
(
Stack
<
Allocator
>&
operandStack
,
unsigned
codepoint
)
{
SizeType
s
=
NewState
(
kRegexInvalidState
,
kRegexInvalidState
,
codepoint
);
*
operandStack
.
template
Push
<
Frag
>
()
=
Frag
(
s
,
s
);
}
template
<
typename
InputStream
>
template
<
typename
InputStream
>
void
Parse
(
InputStream
&
is
)
{
void
Parse
(
InputStream
&
is
)
{
Allocator
allocator
;
Allocator
allocator
;
...
@@ -231,6 +244,18 @@ private:
...
@@ -231,6 +244,18 @@ private:
ImplicitConcatenation
(
atomCountStack
,
operatorStack
);
ImplicitConcatenation
(
atomCountStack
,
operatorStack
);
break
;
break
;
case
'['
:
{
SizeType
range
;
if
(
!
ParseRange
(
is
,
&
range
))
return
;
SizeType
s
=
NewState
(
kRegexInvalidState
,
kRegexInvalidState
,
kRangeCharacterClass
);
GetState
(
s
).
rangeStart
=
range
;
*
operandStack
.
template
Push
<
Frag
>
()
=
Frag
(
s
,
s
);
}
ImplicitConcatenation
(
atomCountStack
,
operatorStack
);
break
;
default
:
default
:
PushOperand
(
operandStack
,
codepoint
);
PushOperand
(
operandStack
,
codepoint
);
ImplicitConcatenation
(
atomCountStack
,
operatorStack
);
ImplicitConcatenation
(
atomCountStack
,
operatorStack
);
...
@@ -258,6 +283,41 @@ private:
...
@@ -258,6 +283,41 @@ private:
}
}
}
}
//! Appends a new state to states_ and returns its index.
/*!
    \param out       Value stored in State::out.
    \param out1      Value stored in State::out1.
    \param codepoint Value stored in State::codepoint.
    \return Index of the new state (the value of stateCount_ before it is incremented).
    \note State::rangeStart is initialised to kRegexInvalidRange.
*/
SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
    State& fresh = *states_.template Push<State>();
    fresh.out = out;
    fresh.out1 = out1;
    fresh.codepoint = codepoint;
    fresh.rangeStart = kRegexInvalidRange;

    const SizeType index = stateCount_;
    ++stateCount_;
    return index;
}
//! Creates a state for \c codepoint and pushes a fragment that starts and ends at it.
/*!
    The new state has both \c out and \c out1 set to kRegexInvalidState.

    \param operandStack Stack of Frag that receives the new fragment.
    \param codepoint    Codepoint (or character-class marker) stored in the new state.
*/
void PushOperand(Stack<Allocator>& operandStack, unsigned codepoint) {
    const SizeType state = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
    Frag* slot = operandStack.template Push<Frag>();
    *slot = Frag(state, state);
}
//! Records one more atom and, if an atom already precedes it, pushes a kConcatenation operator.
/*!
    \param atomCountStack Stack of unsigned counters; the top counter is incremented.
    \param operatorStack  Stack of Operator; receives kConcatenation when the top counter was non-zero.
*/
void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
    const bool hasPrecedingAtom = *atomCountStack.template Top<unsigned>() != 0;
    if (hasPrecedingAtom)
        *operatorStack.template Push<Operator>() = kConcatenation;
    ++*atomCountStack.template Top<unsigned>();
}
//! Links list \c l2 to the end of the list starting at \c l1, where lists are chained through State::out.
/*!
    \param l1 Index of the first state of the list to extend.
    \param l2 Index stored in the \c out field of the last state of \c l1.
    \return \c l1, the head of the combined list.
*/
SizeType Append(SizeType l1, SizeType l2) {
    // Advance to the last state, i.e. the one whose out is still unset.
    SizeType tail = l1;
    for (SizeType following = GetState(tail).out; following != kRegexInvalidState; following = GetState(tail).out)
        tail = following;
    GetState(tail).out = l2;
    return l1;
}
//! Sets State::out of every state in the list starting at \c l to \c s.
/*!
    The list is chained through State::out itself, so each link is read before it is overwritten.

    \param l Index of the first state in the list, or kRegexInvalidState for an empty list.
    \param s Index written to every \c out field.
*/
void Patch(SizeType l, SizeType s) {
    while (l != kRegexInvalidState) {
        State& node = GetState(l);
        l = node.out;   // remember the successor before overwriting the link
        node.out = s;
    }
}
bool
Eval
(
Stack
<
Allocator
>&
operandStack
,
Operator
op
)
{
bool
Eval
(
Stack
<
Allocator
>&
operandStack
,
Operator
op
)
{
switch
(
op
)
{
switch
(
op
)
{
case
kConcatenation
:
case
kConcatenation
:
...
@@ -314,15 +374,72 @@ private:
...
@@ -314,15 +374,72 @@ private:
}
}
}
}
void
ImplicitConcatenation
(
Stack
<
Allocator
>&
atomCountStack
,
Stack
<
Allocator
>&
operatorStack
)
{
template
<
typename
InputStream
>
if
(
*
atomCountStack
.
template
Top
<
unsigned
>
())
bool
ParseRange
(
InputStream
&
is
,
SizeType
*
range
)
{
*
operatorStack
.
template
Push
<
Operator
>
()
=
kConcatenation
;
bool
isBegin
=
true
;
(
*
atomCountStack
.
template
Top
<
unsigned
>
())
++
;
bool
negate
=
false
;
int
step
=
0
;
SizeType
start
=
kRegexInvalidRange
;
SizeType
current
=
kRegexInvalidRange
;
unsigned
codepoint
;
while
(
Encoding
::
Decode
(
is
,
&
codepoint
)
&&
codepoint
!=
0
)
{
if
(
isBegin
&&
codepoint
==
'^'
)
negate
=
true
;
else
if
(
codepoint
==
']'
)
{
if
(
step
==
2
)
{
// Add trailing '-'
SizeType
r
=
NewRange
(
'-'
);
RAPIDJSON_ASSERT
(
current
!=
kRegexInvalidRange
);
GetRange
(
current
).
next
=
r
;
}
if
(
negate
)
GetRange
(
start
).
start
|=
kRangeNegationFlag
;
*
range
=
start
;
return
true
;
}
else
{
switch
(
step
)
{
case
1
:
if
(
codepoint
==
'-'
)
{
step
++
;
break
;
}
// fall through to step 0 for other characters
case
0
:
{
SizeType
r
=
NewRange
(
codepoint
);
if
(
current
!=
kRegexInvalidRange
)
GetRange
(
current
).
next
=
r
;
if
(
start
==
kRegexInvalidRange
)
start
=
r
;
current
=
r
;
}
step
=
1
;
break
;
default
:
RAPIDJSON_ASSERT
(
step
==
2
);
GetRange
(
current
).
end
=
codepoint
;
step
=
0
;
}
}
isBegin
=
false
;
}
return
false
;
}
//! Appends a single-codepoint range [codepoint, codepoint] to ranges_ and returns its index.
/*!
    \param codepoint Value stored in both Range::start and Range::end.
    \return Index of the new range (the value of rangeCount_ before it is incremented).
    \note Range::next is initialised to kRegexInvalidRange.
*/
SizeType NewRange(unsigned codepoint) {
    Range& fresh = *ranges_.template Push<Range>();
    fresh.end = codepoint;
    fresh.start = codepoint;
    fresh.next = kRegexInvalidRange;

    const SizeType index = rangeCount_;
    ++rangeCount_;
    return index;
}
}
Stack
<
Allocator
>
states_
;
Stack
<
Allocator
>
states_
;
Stack
<
Allocator
>
ranges_
;
SizeType
root_
;
SizeType
root_
;
SizeType
stateCount_
;
SizeType
stateCount_
;
SizeType
rangeCount_
;
};
};
typedef
GenericRegex
<
UTF8
<>
>
Regex
;
typedef
GenericRegex
<
UTF8
<>
>
Regex
;
...
...
test/unittest/regextest.cpp
View file @
1784afe5
...
@@ -241,4 +241,90 @@ TEST(Regex, AnyCharacter) {
...
@@ -241,4 +241,90 @@ TEST(Regex, AnyCharacter) {
EXPECT_FALSE
(
re
.
Match
(
"aa"
));
EXPECT_FALSE
(
re
.
Match
(
"aa"
));
}
}
// "[abc]": a plain character class accepts exactly one of the listed characters.
TEST(Regex, CharacterRange1) {
    Regex charClass("[abc]");
    ASSERT_TRUE(charClass.IsValid());

    // Members of the class.
    EXPECT_TRUE(charClass.Match("a"));
    EXPECT_TRUE(charClass.Match("b"));
    EXPECT_TRUE(charClass.Match("c"));

    // Empty input, the ASCII neighbours of 'a'..'c', and over-long input.
    EXPECT_FALSE(charClass.Match(""));
    EXPECT_FALSE(charClass.Match("`"));
    EXPECT_FALSE(charClass.Match("d"));
    EXPECT_FALSE(charClass.Match("aa"));
}
// "[^abc]": a negated class accepts any single character that is not listed.
TEST(Regex, CharacterRange2) {
    Regex negatedClass("[^abc]");
    ASSERT_TRUE(negatedClass.IsValid());

    // Characters outside the class.
    EXPECT_TRUE(negatedClass.Match("`"));
    EXPECT_TRUE(negatedClass.Match("d"));

    // Listed characters, empty input and over-long input.
    EXPECT_FALSE(negatedClass.Match("a"));
    EXPECT_FALSE(negatedClass.Match("b"));
    EXPECT_FALSE(negatedClass.Match("c"));
    EXPECT_FALSE(negatedClass.Match(""));
    EXPECT_FALSE(negatedClass.Match("aa"));
}
// "[a-c]": a range accepts every character between its bounds, inclusive.
TEST(Regex, CharacterRange3) {
    Regex rangeClass("[a-c]");
    ASSERT_TRUE(rangeClass.IsValid());

    // Both bounds and the interior.
    EXPECT_TRUE(rangeClass.Match("a"));
    EXPECT_TRUE(rangeClass.Match("b"));
    EXPECT_TRUE(rangeClass.Match("c"));

    // Empty input, the characters just outside each bound, and over-long input.
    EXPECT_FALSE(rangeClass.Match(""));
    EXPECT_FALSE(rangeClass.Match("`"));
    EXPECT_FALSE(rangeClass.Match("d"));
    EXPECT_FALSE(rangeClass.Match("aa"));
}
// "[^a-c]": a negated range accepts any single character outside the bounds.
TEST(Regex, CharacterRange4) {
    Regex negatedRange("[^a-c]");
    ASSERT_TRUE(negatedRange.IsValid());

    // Characters just outside each bound.
    EXPECT_TRUE(negatedRange.Match("`"));
    EXPECT_TRUE(negatedRange.Match("d"));

    // Characters inside the range, empty input and over-long input.
    EXPECT_FALSE(negatedRange.Match("a"));
    EXPECT_FALSE(negatedRange.Match("b"));
    EXPECT_FALSE(negatedRange.Match("c"));
    EXPECT_FALSE(negatedRange.Match(""));
    EXPECT_FALSE(negatedRange.Match("aa"));
}
// "[-]": a lone '-' inside a class is a literal hyphen.
TEST(Regex, CharacterRange5) {
    Regex hyphenOnly("[-]");
    ASSERT_TRUE(hyphenOnly.IsValid());

    EXPECT_TRUE(hyphenOnly.Match("-"));

    EXPECT_FALSE(hyphenOnly.Match(""));
    EXPECT_FALSE(hyphenOnly.Match("a"));
}
// "[a-]": a trailing '-' is a literal hyphen, not the start of a range.
TEST(Regex, CharacterRange6) {
    Regex trailingHyphen("[a-]");
    ASSERT_TRUE(trailingHyphen.IsValid());

    EXPECT_TRUE(trailingHyphen.Match("a"));
    EXPECT_TRUE(trailingHyphen.Match("-"));

    EXPECT_FALSE(trailingHyphen.Match(""));
    EXPECT_FALSE(trailingHyphen.Match("`"));
    EXPECT_FALSE(trailingHyphen.Match("b"));
}
// "[-a]": a leading '-' is a literal hyphen.
TEST(Regex, CharacterRange7) {
    Regex leadingHyphen("[-a]");
    ASSERT_TRUE(leadingHyphen.IsValid());

    EXPECT_TRUE(leadingHyphen.Match("a"));
    EXPECT_TRUE(leadingHyphen.Match("-"));

    EXPECT_FALSE(leadingHyphen.Match(""));
    EXPECT_FALSE(leadingHyphen.Match("`"));
    EXPECT_FALSE(leadingHyphen.Match("b"));
}
// "[a-zA-Z0-9]*": several ranges combined in one class, repeated with '*'.
TEST(Regex, CharacterRange8) {
    Regex alphanumeric("[a-zA-Z0-9]*");
    ASSERT_TRUE(alphanumeric.IsValid());

    // Letters only, mixed letters and digits, digits only.
    EXPECT_TRUE(alphanumeric.Match("Milo"));
    EXPECT_TRUE(alphanumeric.Match("MT19937"));
    EXPECT_TRUE(alphanumeric.Match("43"));

    // Any character outside the three ranges makes the whole input fail.
    EXPECT_FALSE(alphanumeric.Match("a_b"));
    EXPECT_FALSE(alphanumeric.Match("!"));
}
#undef EURO
#undef EURO
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment