Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
R
rapidjson
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
rapidjson
Commits
3eb19cea
Commit
3eb19cea
authored
May 29, 2015
by
Milo Yip
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add Search(), ^ and $ assertions to regex
parent
a5ac3b5d
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
153 additions
and
66 deletions
+153
-66
regex.h
include/rapidjson/internal/regex.h
+97
-64
regextest.cpp
test/unittest/regextest.cpp
+56
-2
No files found.
include/rapidjson/internal/regex.h
View file @
3eb19cea
...
...
@@ -62,7 +62,7 @@ class GenericRegex {
public
:
typedef
typename
Encoding
::
Ch
Ch
;
GenericRegex
(
const
Ch
*
source
,
Allocator
*
allocator
=
0
)
:
states_
(
allocator
,
256
),
ranges_
(
allocator
,
256
),
root_
(
kRegexInvalidState
),
stateCount_
(),
rangeCount
_
()
{
GenericRegex
(
const
Ch
*
source
,
Allocator
*
allocator
=
0
)
:
states_
(
allocator
,
256
),
ranges_
(
allocator
,
256
),
root_
(
kRegexInvalidState
),
stateCount_
(),
rangeCount_
(),
anchorBegin_
(),
anchorEnd
_
()
{
StringStream
ss
(
source
);
DecodedStream
<
StringStream
>
ds
(
ss
);
Parse
(
ds
);
...
...
@@ -77,49 +77,22 @@ public:
template
<
typename
InputStream
>
bool
Match
(
InputStream
&
is
)
const
{
RAPIDJSON_ASSERT
(
IsValid
());
DecodedStream
<
InputStream
>
ds
(
is
);
Allocator
allocator
;
Stack
<
Allocator
>
state0
(
&
allocator
,
stateCount_
*
sizeof
(
SizeType
));
Stack
<
Allocator
>
state1
(
&
allocator
,
stateCount_
*
sizeof
(
SizeType
));
Stack
<
Allocator
>
*
current
=
&
state0
,
*
next
=
&
state1
;
const
size_t
stateSetSize
=
(
stateCount_
+
31
)
/
32
*
4
;
unsigned
*
stateSet
=
static_cast
<
unsigned
*>
(
allocator
.
Malloc
(
stateSetSize
));
std
::
memset
(
stateSet
,
0
,
stateSetSize
);
AddState
(
stateSet
,
*
current
,
root_
);
unsigned
codepoint
;
while
(
!
current
->
Empty
()
&&
(
codepoint
=
ds
.
Take
())
!=
0
)
{
std
::
memset
(
stateSet
,
0
,
stateSetSize
);
next
->
Clear
();
for
(
const
SizeType
*
s
=
current
->
template
Bottom
<
SizeType
>
();
s
!=
current
->
template
End
<
SizeType
>
();
++
s
)
{
const
State
&
sr
=
GetState
(
*
s
);
if
(
sr
.
codepoint
==
codepoint
||
sr
.
codepoint
==
kAnyCharacterClass
||
(
sr
.
codepoint
==
kRangeCharacterClass
&&
MatchRange
(
sr
.
rangeStart
,
codepoint
)))
{
AddState
(
stateSet
,
*
next
,
sr
.
out
);
}
}
Stack
<
Allocator
>*
temp
=
current
;
current
=
next
;
next
=
temp
;
return
SearchWithAnchoring
(
is
,
true
,
true
);
}
Allocator
::
Free
(
stateSet
);
for
(
const
SizeType
*
s
=
current
->
template
Bottom
<
SizeType
>
();
s
!=
current
->
template
End
<
SizeType
>
();
++
s
)
if
(
GetState
(
*
s
).
out
==
kRegexInvalidState
)
return
true
;
bool
Match
(
const
Ch
*
s
)
const
{
StringStream
is
(
s
);
return
Match
(
is
);
}
return
false
;
template
<
typename
InputStream
>
bool
Search
(
InputStream
&
is
)
const
{
return
SearchWithAnchoring
(
is
,
anchorBegin_
,
anchorEnd_
);
}
bool
Match
(
const
Ch
*
s
)
{
bool
Search
(
const
Ch
*
s
)
const
{
StringStream
is
(
s
);
return
Mat
ch
(
is
);
return
Sear
ch
(
is
);
}
private
:
...
...
@@ -193,32 +166,6 @@ private:
return
ranges_
.
template
Bottom
<
Range
>
()[
index
];
}
void
AddState
(
unsigned
*
stateSet
,
Stack
<
Allocator
>&
l
,
SizeType
index
)
const
{
if
(
index
==
kRegexInvalidState
)
return
;
const
State
&
s
=
GetState
(
index
);
if
(
s
.
out1
!=
kRegexInvalidState
)
{
// Split
AddState
(
stateSet
,
l
,
s
.
out
);
AddState
(
stateSet
,
l
,
s
.
out1
);
}
else
if
(
!
(
stateSet
[
index
>>
5
]
&
(
1
<<
(
index
&
31
))))
{
stateSet
[
index
>>
5
]
|=
(
1
<<
(
index
&
31
));
*
l
.
template
Push
<
SizeType
>
()
=
index
;
}
}
bool
MatchRange
(
SizeType
rangeIndex
,
unsigned
codepoint
)
const
{
bool
yes
=
(
GetRange
(
rangeIndex
).
start
&
kRangeNegationFlag
)
==
0
;
while
(
rangeIndex
!=
kRegexInvalidRange
)
{
const
Range
&
r
=
GetRange
(
rangeIndex
);
if
(
codepoint
>=
(
r
.
start
&
~
kRangeNegationFlag
)
&&
codepoint
<=
r
.
end
)
return
yes
;
rangeIndex
=
r
.
next
;
}
return
!
yes
;
}
template
<
typename
InputStream
>
void
Parse
(
DecodedStream
<
InputStream
>&
ds
)
{
Allocator
allocator
;
...
...
@@ -231,6 +178,14 @@ private:
unsigned
codepoint
;
while
(
ds
.
Peek
()
!=
0
)
{
switch
(
codepoint
=
ds
.
Take
())
{
case
'^'
:
anchorBegin_
=
true
;
break
;
case
'$'
:
anchorEnd_
=
true
;
break
;
case
'|'
:
while
(
!
operatorStack
.
Empty
()
&&
*
operatorStack
.
template
Top
<
Operator
>
()
<
kAlternation
)
if
(
!
Eval
(
operandStack
,
*
operatorStack
.
template
Pop
<
Operator
>
(
1
)))
...
...
@@ -567,6 +522,8 @@ private:
bool
CharacterEscape
(
DecodedStream
<
InputStream
>&
ds
,
unsigned
*
escapedCodepoint
)
{
unsigned
codepoint
;
switch
(
codepoint
=
ds
.
Take
())
{
case
'^'
:
case
'$'
:
case
'|'
:
case
'('
:
case
')'
:
...
...
@@ -590,11 +547,87 @@ private:
}
}
template
<
typename
InputStream
>
bool
SearchWithAnchoring
(
InputStream
&
is
,
bool
anchorBegin
,
bool
anchorEnd
)
const
{
RAPIDJSON_ASSERT
(
IsValid
());
DecodedStream
<
InputStream
>
ds
(
is
);
Allocator
allocator
;
Stack
<
Allocator
>
state0
(
&
allocator
,
stateCount_
*
sizeof
(
SizeType
));
Stack
<
Allocator
>
state1
(
&
allocator
,
stateCount_
*
sizeof
(
SizeType
));
Stack
<
Allocator
>
*
current
=
&
state0
,
*
next
=
&
state1
;
const
size_t
stateSetSize
=
(
stateCount_
+
31
)
/
32
*
4
;
unsigned
*
stateSet
=
static_cast
<
unsigned
*>
(
allocator
.
Malloc
(
stateSetSize
));
std
::
memset
(
stateSet
,
0
,
stateSetSize
);
bool
matched
=
false
;
matched
=
AddState
(
stateSet
,
*
current
,
root_
);
unsigned
codepoint
;
while
(
!
current
->
Empty
()
&&
(
codepoint
=
ds
.
Take
())
!=
0
)
{
std
::
memset
(
stateSet
,
0
,
stateSetSize
);
next
->
Clear
();
matched
=
false
;
for
(
const
SizeType
*
s
=
current
->
template
Bottom
<
SizeType
>
();
s
!=
current
->
template
End
<
SizeType
>
();
++
s
)
{
const
State
&
sr
=
GetState
(
*
s
);
if
(
sr
.
codepoint
==
codepoint
||
sr
.
codepoint
==
kAnyCharacterClass
||
(
sr
.
codepoint
==
kRangeCharacterClass
&&
MatchRange
(
sr
.
rangeStart
,
codepoint
)))
{
matched
=
AddState
(
stateSet
,
*
next
,
sr
.
out
)
||
matched
;
if
(
!
anchorEnd
&&
matched
)
goto
exit
;
}
if
(
!
anchorBegin
)
AddState
(
stateSet
,
*
next
,
root_
);
}
Stack
<
Allocator
>*
temp
=
current
;
current
=
next
;
next
=
temp
;
}
exit
:
Allocator
::
Free
(
stateSet
);
return
matched
;
}
// Return whether the added states is a match state
bool
AddState
(
unsigned
*
stateSet
,
Stack
<
Allocator
>&
l
,
SizeType
index
)
const
{
if
(
index
==
kRegexInvalidState
)
return
true
;
const
State
&
s
=
GetState
(
index
);
if
(
s
.
out1
!=
kRegexInvalidState
)
{
// Split
bool
matched
=
AddState
(
stateSet
,
l
,
s
.
out
);
matched
=
AddState
(
stateSet
,
l
,
s
.
out1
)
||
matched
;
return
matched
;
}
else
if
(
!
(
stateSet
[
index
>>
5
]
&
(
1
<<
(
index
&
31
))))
{
stateSet
[
index
>>
5
]
|=
(
1
<<
(
index
&
31
));
*
l
.
template
Push
<
SizeType
>
()
=
index
;
return
GetState
(
index
).
out
==
kRegexInvalidState
;
}
}
bool
MatchRange
(
SizeType
rangeIndex
,
unsigned
codepoint
)
const
{
bool
yes
=
(
GetRange
(
rangeIndex
).
start
&
kRangeNegationFlag
)
==
0
;
while
(
rangeIndex
!=
kRegexInvalidRange
)
{
const
Range
&
r
=
GetRange
(
rangeIndex
);
if
(
codepoint
>=
(
r
.
start
&
~
kRangeNegationFlag
)
&&
codepoint
<=
r
.
end
)
return
yes
;
rangeIndex
=
r
.
next
;
}
return
!
yes
;
}
Stack
<
Allocator
>
states_
;
Stack
<
Allocator
>
ranges_
;
SizeType
root_
;
SizeType
stateCount_
;
SizeType
rangeCount_
;
bool
anchorBegin_
;
bool
anchorEnd_
;
};
typedef
GenericRegex
<
UTF8
<>
>
Regex
;
...
...
test/unittest/regextest.cpp
View file @
3eb19cea
...
...
@@ -432,11 +432,65 @@ TEST(Regex, CharacterRange8) {
EXPECT_FALSE
(
re
.
Match
(
"!"
));
}
TEST
(
Regex
,
Search
)
{
Regex
re
(
"abc"
);
ASSERT_TRUE
(
re
.
IsValid
());
EXPECT_TRUE
(
re
.
Search
(
"abc"
));
EXPECT_TRUE
(
re
.
Search
(
"_abc"
));
EXPECT_TRUE
(
re
.
Search
(
"abc_"
));
EXPECT_TRUE
(
re
.
Search
(
"_abc_"
));
EXPECT_TRUE
(
re
.
Search
(
"__abc__"
));
EXPECT_TRUE
(
re
.
Search
(
"abcabc"
));
EXPECT_FALSE
(
re
.
Search
(
"a"
));
EXPECT_FALSE
(
re
.
Search
(
"ab"
));
EXPECT_FALSE
(
re
.
Search
(
"bc"
));
EXPECT_FALSE
(
re
.
Search
(
"cba"
));
}
TEST
(
Regex
,
Search_BeginAnchor
)
{
Regex
re
(
"^abc"
);
ASSERT_TRUE
(
re
.
IsValid
());
EXPECT_TRUE
(
re
.
Search
(
"abc"
));
EXPECT_TRUE
(
re
.
Search
(
"abc_"
));
EXPECT_TRUE
(
re
.
Search
(
"abcabc"
));
EXPECT_FALSE
(
re
.
Search
(
"_abc"
));
EXPECT_FALSE
(
re
.
Search
(
"_abc_"
));
EXPECT_FALSE
(
re
.
Search
(
"a"
));
EXPECT_FALSE
(
re
.
Search
(
"ab"
));
EXPECT_FALSE
(
re
.
Search
(
"bc"
));
EXPECT_FALSE
(
re
.
Search
(
"cba"
));
}
TEST
(
Regex
,
Search_EndAnchor
)
{
Regex
re
(
"abc$"
);
ASSERT_TRUE
(
re
.
IsValid
());
EXPECT_TRUE
(
re
.
Search
(
"abc"
));
EXPECT_TRUE
(
re
.
Search
(
"_abc"
));
EXPECT_TRUE
(
re
.
Search
(
"abcabc"
));
EXPECT_FALSE
(
re
.
Search
(
"abc_"
));
EXPECT_FALSE
(
re
.
Search
(
"_abc_"
));
EXPECT_FALSE
(
re
.
Search
(
"a"
));
EXPECT_FALSE
(
re
.
Search
(
"ab"
));
EXPECT_FALSE
(
re
.
Search
(
"bc"
));
EXPECT_FALSE
(
re
.
Search
(
"cba"
));
}
TEST
(
Regex
,
Search_BothAnchor
)
{
Regex
re
(
"^abc$"
);
ASSERT_TRUE
(
re
.
IsValid
());
EXPECT_TRUE
(
re
.
Search
(
"abc"
));
EXPECT_FALSE
(
re
.
Search
(
""
));
EXPECT_FALSE
(
re
.
Search
(
"a"
));
EXPECT_FALSE
(
re
.
Search
(
"b"
));
EXPECT_FALSE
(
re
.
Search
(
"ab"
));
EXPECT_FALSE
(
re
.
Search
(
"abcd"
));
}
TEST
(
Regex
,
Escape
)
{
const
char
*
s
=
"
\\
|
\\
(
\\
)
\\
?
\\
*
\\
+
\\
.
\\
[
\\
]
\\
{
\\
}
\\\\\\
f
\\
n
\\
r
\\
t
\\
v[
\\
b][
\\
[][
\\
]]"
;
const
char
*
s
=
"
\\
^
\\
$
\\
|
\\
(
\\
)
\\
?
\\
*
\\
+
\\
.
\\
[
\\
]
\\
{
\\
}
\\\\\\
f
\\
n
\\
r
\\
t
\\
v[
\\
b][
\\
[][
\\
]]"
;
Regex
re
(
s
);
ASSERT_TRUE
(
re
.
IsValid
());
EXPECT_TRUE
(
re
.
Match
(
"|()?*+.[]{}
\\\x0C\n\r\t\x0B\b
[]"
));
EXPECT_TRUE
(
re
.
Match
(
"
^$
|()?*+.[]{}
\\\x0C\n\r\t\x0B\b
[]"
));
EXPECT_FALSE
(
re
.
Match
(
s
));
// Not escaping
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment