Commit b2210088 authored by kenton@google.com

Fix UTF-8 validity checks to not do unaligned reads.

parent de747794
...@@ -371,36 +371,44 @@ int UTF8GenericScan(const UTF8ScanObj* st, ...@@ -371,36 +371,44 @@ int UTF8GenericScan(const UTF8ScanObj* st,
// Do state-table scan // Do state-table scan
int e = 0; int e = 0;
uint8 c; uint8 c;
// Do fast for groups of 8 identity bytes.
// This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
// including slowing slightly on cr/lf/ht
//----------------------------
const uint8* Tbl2 = &st->fast_state[0]; const uint8* Tbl2 = &st->fast_state[0];
uint32 losub = st->losub; const uint32 losub = st->losub;
uint32 hiadd = st->hiadd; const uint32 hiadd = st->hiadd;
while (src < srclimit8) { // Check initial few bytes one at a time until 8-byte aligned
uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; //----------------------------
uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; while ((((uintptr_t)src & 0x07) != 0) &&
src += 8; (src < srclimit) &&
// This is a fast range check for all bytes in [lowsub..0x80-hiadd) Tbl2[src[0]] == 0) {
uint32 temp = (s0123 - losub) | (s0123 + hiadd) | src++;
(s4567 - losub) | (s4567 + hiadd); }
if ((temp & 0x80808080) != 0) { if (((uintptr_t)src & 0x07) == 0) {
// We typically end up here on cr/lf/ht; src was incremented // Do fast for groups of 8 identity bytes.
int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
(Tbl2[src[-6]] | Tbl2[src[-5]]); // including slowing slightly on cr/lf/ht
if (e0123 != 0) { //----------------------------
src -= 8; while (src < srclimit8) {
break; uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
} // Exit on Non-interchange uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | src += 8;
(Tbl2[src[-2]] | Tbl2[src[-1]]); // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
if (e0123 != 0) { uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
src -= 4; (s4567 - losub) | (s4567 + hiadd);
break; if ((temp & 0x80808080) != 0) {
} // Exit on Non-interchange // We typically end up here on cr/lf/ht; src was incremented
// Else OK, go around again int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
(Tbl2[src[-6]] | Tbl2[src[-5]]);
if (e0123 != 0) {
src -= 8;
break;
} // Exit on Non-interchange
e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
(Tbl2[src[-2]] | Tbl2[src[-1]]);
if (e0123 != 0) {
src -= 4;
break;
} // Exit on Non-interchange
// Else OK, go around again
}
} }
} }
//---------------------------- //----------------------------
...@@ -470,10 +478,17 @@ int UTF8GenericScanFastAscii(const UTF8ScanObj* st, ...@@ -470,10 +478,17 @@ int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
int rest_consumed; int rest_consumed;
int exit_reason; int exit_reason;
do { do {
while ((src < srclimit8) && // Check initial few bytes one at a time until 8-byte aligned
(((reinterpret_cast<const uint32*>(src)[0] | while ((((uintptr_t)src & 0x07) != 0) &&
reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { (src < srclimit) && (src[0] < 0x80)) {
src += 8; src++;
}
if (((uintptr_t)src & 0x07) == 0) {
while ((src < srclimit8) &&
(((reinterpret_cast<const uint32*>(src)[0] |
reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
src += 8;
}
} }
while ((src < srclimit) && (src[0] < 0x80)) { while ((src < srclimit) && (src[0] < 0x80)) {
src++; src++;
......
...@@ -13,15 +13,25 @@ TEST(StructurallyValidTest, ValidUTF8String) { ...@@ -13,15 +13,25 @@ TEST(StructurallyValidTest, ValidUTF8String) {
// On GCC, this string can be written as: // On GCC, this string can be written as:
// "abcd 1234 - \u2014\u2013\u2212" // "abcd 1234 - \u2014\u2013\u2212"
// MSVC seems to interpret \u differently. // MSVC seems to interpret \u differently.
string valid_str("abcd 1234 - \342\200\224\342\200\223\342\210\222"); string valid_str("abcd 1234 - \342\200\224\342\200\223\342\210\222 - xyz789");
EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data(), EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data(),
valid_str.size())); valid_str.size()));
// Additional check for pointer alignment
for (int i = 1; i < 8; ++i) {
EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data() + i,
valid_str.size() - i));
}
} }
TEST(StructurallyValidTest, InvalidUTF8String) { TEST(StructurallyValidTest, InvalidUTF8String) {
string invalid_str("\xA0\xB0"); const string invalid_str("abcd\xA0\xB0\xA0\xB0\xA0\xB0 - xyz789");
EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data(), EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data(),
invalid_str.size())); invalid_str.size()));
// Additional check for pointer alignment
for (int i = 1; i < 8; ++i) {
EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data() + i,
invalid_str.size() - i));
}
} }
} // namespace } // namespace
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment