Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
P
protobuf
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
protobuf
Commits
b2210088
Commit
b2210088
authored
Dec 11, 2009
by
kenton@google.com
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix UTF-8 validity checks to not do unaligned reads.
parent
de747794
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
60 additions
and
35 deletions
+60
-35
structurally_valid.cc
src/google/protobuf/stubs/structurally_valid.cc
+48
-33
structurally_valid_unittest.cc
src/google/protobuf/stubs/structurally_valid_unittest.cc
+12
-2
No files found.
src/google/protobuf/stubs/structurally_valid.cc
View file @
b2210088
...
...
@@ -371,36 +371,44 @@ int UTF8GenericScan(const UTF8ScanObj* st,
// Do state-table scan
int
e
=
0
;
uint8
c
;
// Do fast for groups of 8 identity bytes.
// This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
// including slowing slightly on cr/lf/ht
//----------------------------
const
uint8
*
Tbl2
=
&
st
->
fast_state
[
0
];
uint32
losub
=
st
->
losub
;
uint32
hiadd
=
st
->
hiadd
;
while
(
src
<
srclimit8
)
{
uint32
s0123
=
(
reinterpret_cast
<
const
uint32
*>
(
src
))[
0
];
uint32
s4567
=
(
reinterpret_cast
<
const
uint32
*>
(
src
))[
1
];
src
+=
8
;
// This is a fast range check for all bytes in [losub..0x80-hiadd)
uint32
temp
=
(
s0123
-
losub
)
|
(
s0123
+
hiadd
)
|
(
s4567
-
losub
)
|
(
s4567
+
hiadd
);
if
((
temp
&
0x80808080
)
!=
0
)
{
// We typically end up here on cr/lf/ht; src was incremented
int
e0123
=
(
Tbl2
[
src
[
-
8
]]
|
Tbl2
[
src
[
-
7
]])
|
(
Tbl2
[
src
[
-
6
]]
|
Tbl2
[
src
[
-
5
]]);
if
(
e0123
!=
0
)
{
src
-=
8
;
break
;
}
// Exit on Non-interchange
e0123
=
(
Tbl2
[
src
[
-
4
]]
|
Tbl2
[
src
[
-
3
]])
|
(
Tbl2
[
src
[
-
2
]]
|
Tbl2
[
src
[
-
1
]]);
if
(
e0123
!=
0
)
{
src
-=
4
;
break
;
}
// Exit on Non-interchange
// Else OK, go around again
const
uint32
losub
=
st
->
losub
;
const
uint32
hiadd
=
st
->
hiadd
;
// Check initial few bytes one at a time until 8-byte aligned
//----------------------------
while
((((
uintptr_t
)
src
&
0x07
)
!=
0
)
&&
(
src
<
srclimit
)
&&
Tbl2
[
src
[
0
]]
==
0
)
{
src
++
;
}
if
(((
uintptr_t
)
src
&
0x07
)
==
0
)
{
// Do fast for groups of 8 identity bytes.
// This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
// including slowing slightly on cr/lf/ht
//----------------------------
while
(
src
<
srclimit8
)
{
uint32
s0123
=
(
reinterpret_cast
<
const
uint32
*>
(
src
))[
0
];
uint32
s4567
=
(
reinterpret_cast
<
const
uint32
*>
(
src
))[
1
];
src
+=
8
;
// This is a fast range check for all bytes in [losub..0x80-hiadd)
uint32
temp
=
(
s0123
-
losub
)
|
(
s0123
+
hiadd
)
|
(
s4567
-
losub
)
|
(
s4567
+
hiadd
);
if
((
temp
&
0x80808080
)
!=
0
)
{
// We typically end up here on cr/lf/ht; src was incremented
int
e0123
=
(
Tbl2
[
src
[
-
8
]]
|
Tbl2
[
src
[
-
7
]])
|
(
Tbl2
[
src
[
-
6
]]
|
Tbl2
[
src
[
-
5
]]);
if
(
e0123
!=
0
)
{
src
-=
8
;
break
;
}
// Exit on Non-interchange
e0123
=
(
Tbl2
[
src
[
-
4
]]
|
Tbl2
[
src
[
-
3
]])
|
(
Tbl2
[
src
[
-
2
]]
|
Tbl2
[
src
[
-
1
]]);
if
(
e0123
!=
0
)
{
src
-=
4
;
break
;
}
// Exit on Non-interchange
// Else OK, go around again
}
}
}
//----------------------------
...
...
@@ -470,10 +478,17 @@ int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
int
rest_consumed
;
int
exit_reason
;
do
{
while
((
src
<
srclimit8
)
&&
(((
reinterpret_cast
<
const
uint32
*>
(
src
)[
0
]
|
reinterpret_cast
<
const
uint32
*>
(
src
)[
1
])
&
0x80808080
)
==
0
))
{
src
+=
8
;
// Check initial few bytes one at a time until 8-byte aligned
while
((((
uintptr_t
)
src
&
0x07
)
!=
0
)
&&
(
src
<
srclimit
)
&&
(
src
[
0
]
<
0x80
))
{
src
++
;
}
if
(((
uintptr_t
)
src
&
0x07
)
==
0
)
{
while
((
src
<
srclimit8
)
&&
(((
reinterpret_cast
<
const
uint32
*>
(
src
)[
0
]
|
reinterpret_cast
<
const
uint32
*>
(
src
)[
1
])
&
0x80808080
)
==
0
))
{
src
+=
8
;
}
}
while
((
src
<
srclimit
)
&&
(
src
[
0
]
<
0x80
))
{
src
++
;
...
...
src/google/protobuf/stubs/structurally_valid_unittest.cc
View file @
b2210088
...
...
@@ -13,15 +13,25 @@ TEST(StructurallyValidTest, ValidUTF8String) {
// On GCC, this string can be written as:
// "abcd 1234 - \u2014\u2013\u2212"
// MSVC seems to interpret \u differently.
string
valid_str
(
"abcd 1234 -
\342\200\224\342\200\223\342\210\222
"
);
string
valid_str
(
"abcd 1234 -
\342\200\224\342\200\223\342\210\222
- xyz789
"
);
EXPECT_TRUE
(
IsStructurallyValidUTF8
(
valid_str
.
data
(),
valid_str
.
size
()));
// Additional check for pointer alignment
for
(
int
i
=
1
;
i
<
8
;
++
i
)
{
EXPECT_TRUE
(
IsStructurallyValidUTF8
(
valid_str
.
data
()
+
i
,
valid_str
.
size
()
-
i
));
}
}
TEST
(
StructurallyValidTest
,
InvalidUTF8String
)
{
string
invalid_str
(
"
\xA0\xB0
"
);
const
string
invalid_str
(
"abcd
\xA0\xB0\xA0\xB0\xA0\xB0
- xyz789
"
);
EXPECT_FALSE
(
IsStructurallyValidUTF8
(
invalid_str
.
data
(),
invalid_str
.
size
()));
// Additional check for pointer alignment
for
(
int
i
=
1
;
i
<
8
;
++
i
)
{
EXPECT_FALSE
(
IsStructurallyValidUTF8
(
invalid_str
.
data
()
+
i
,
invalid_str
.
size
()
-
i
));
}
}
}
// namespace
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment