Commit a45bcbba authored by miloyip@gmail.com

Rewrite UTF8::Validate() to obtain better performance.

git-svn-id: https://rapidjson.googlecode.com/svn/trunk@35 c5894555-1306-4e8d-425f-1f6f381ee07c
parent 827de60f
......@@ -357,46 +357,41 @@ struct UTF8 {
// Validates one UTF-8 encoded code point while copying it: bytes are read
// from `is` with Take() and every byte read is written to `os` with Put().
// Returns true when the bytes read are accepted by the tables below and
// false as soon as one of them is rejected.
//
// NOTE(review): this listing comes from a commit diff view and appears to
// interleave the pre-commit implementation (the `utf8d` table and the state
// loop) with the post-commit one (the `type` table, the COPY/TRANS/TAIL
// macros and the switch) without +/- markers. As shown it is not a single
// compilable function body; confirm against the repository before reuse.
template <typename InputStream, typename OutputStream>
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
// http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
static const unsigned char utf8d[] = {
//! \todo optimization
// The first part of the table maps bytes to character classes, which
// reduces the size of the transition table and creates bitmasks.
// Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
// With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
static const unsigned char type[] = {
// Entries for bytes 0x00..0x7F (4 rows of 32): class 0.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// NOTE(review): the next two rows (classes 1, 9 and 7) and the four rows
// after them (0x10, 0x40 and 0x20) each hold 64 entries and both look like
// the entries for bytes 0x80..0xBF -- presumably the old numbering followed
// by the new bitmask numbering described in the mapping comment; confirm.
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
// Entries for bytes 0xC0..0xDF: class 8 for the first two, class 2 for the rest.
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// Entries for bytes 0xE0..0xFF: classes 10, 3, 4, 11, 6, 5 and 8 as listed.
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
// Each group of 12 entries is one state (0, 12, 24, ...), indexed by class;
// the state loop further down treats 12 as the rejecting state and 0 as done.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
// COPY(): take the next character from `is`, remember it in `c`, and write it to `os`.
#define COPY() os.Put(c = is.Take())
// TRANS(mask): return false from Validate() unless the `type` entry of the
// current byte has at least one bit of `mask` set.
#define TRANS(mask) if (!(type[(unsigned char)c] & mask)) return false
// TAIL(): copy one more byte and require its type to be 0x10, 0x20 or 0x40
// (0x70 is the OR of the three remapped classes).
#define TAIL() COPY(); TRANS(0x70)
Ch c;
os.Put(c = is.Take());
if ((unsigned char) c < 0x80)
COPY();
// A byte without the high bit set is accepted on its own.
if (!(c & 0x80))
return true;
// Table-driven state loop: look up the class of the byte just read, then
// keep reading and copying bytes until the state reaches 0 (accept) or 12 (reject).
unsigned type = utf8d[(unsigned char)c];
unsigned state = utf8d[256 + type];
if (state == 12)
return false;
while (state) {
os.Put(c = is.Take());
unsigned type = utf8d[(unsigned char)c];
state = utf8d[256 + state + type];
if (state == 12)
return false;
};
return true;
// Dispatch on the class of the lead byte; each case copies and checks the
// bytes that must follow it.
switch (type[(unsigned char)c]) {
// Class 2: one trailing byte.
case 2: TAIL(); return true;
// Class 3: two trailing bytes.
case 3: TAIL(); TAIL(); return true;
// Class 4: next byte must have type 0x10 or 0x40 (0x50), then one trailing byte.
case 4: COPY(); TRANS(0x50); TAIL(); return true;
// Class 5: next byte must have type 0x10.
// NOTE(review): the COPY() that follows has no TRANS, so that byte is copied
// without being checked. The state table above sends class 5 to state 96,
// which needs a class 1 byte and then two checked trailing bytes; the
// intended form looks like `COPY(); TRANS(0x10); TAIL(); TAIL();` -- confirm.
case 5: COPY(); TRANS(0x10); COPY(); TAIL(); return true;
// Class 6: three trailing bytes.
case 6: TAIL(); TAIL(); TAIL(); return true;
// Class 10: next byte must have type 0x20, then one trailing byte.
case 10: COPY(); TRANS(0x20); TAIL(); return true;
// Class 11: next byte must have type 0x20 or 0x40 (0x60).
// NOTE(review): only one TAIL() follows, so two bytes are read after the lead
// byte. The state table above sends class 11 to state 72 and from there to
// state 36, which needs two more trailing bytes; the intended form looks
// like `COPY(); TRANS(0x60); TAIL(); TAIL();` -- confirm.
case 11: COPY(); TRANS(0x60); TAIL(); return true;
// Any other class cannot start a sequence here.
default: return false;
}
#undef COPY
#undef TRANS
#undef TAIL
}
};
......
......@@ -31,7 +31,7 @@ struct GenericStringBuffer {
return stack_.template Bottom<Ch>();
}
// Forwards to stack_.Size().
// NOTE(review): Size() and GetSize() appear back to back in this diff view
// without +/- markers -- presumably the removed and the added spelling of
// the same accessor; confirm against the repository.
size_t Size() const { return stack_.Size(); }
// Forwards to stack_.GetSize().
size_t GetSize() const { return stack_.GetSize(); }
// Constant 256; its use is not visible in this excerpt (presumably the
// default initial capacity of the buffer -- confirm).
static const size_t kDefaultCapacity = 256;
// Underlying stack; declared mutable so const member functions may modify it.
mutable internal::Stack<Allocator> stack_;
......
......@@ -232,6 +232,18 @@ TEST_F(RapidJson, SIMD_SUFFIX(Whitespace)) {
}
}
// Runs UTF8<>::Validate() over the whole of json_, kTrialCount times in a row.
// Each trial starts from a fresh input stream and a cleared output buffer, and
// afterwards expects the buffer size to equal length_.
TEST_F(RapidJson, UTF8_Validate) {
    StringBuffer os(0, length_ + 1);

    for (int trial = 0; trial < kTrialCount; ++trial) {
        StringStream is(json_);
        os.Clear();

        // Consume the input one Validate() call at a time until the terminator.
        for (;;) {
            if (is.Peek() == '\0')
                break;
            UTF8<>::Validate(is, os);
        }

        EXPECT_EQ(length_, os.GetSize());
    }
}
// Deprecated.
//TEST_F(RapidJson, FileStream_Read) {
// for (int i = 0; i < kTrialCount; i++) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment