Commit a8d631fb authored by miloyip@gmail.com

Added Transcoder for converting Encoding during parsing.

git-svn-id: https://rapidjson.googlecode.com/svn/trunk@39 c5894555-1306-4e8d-425f-1f6f381ee07c
parent bdf6da64
......@@ -697,11 +697,11 @@ public:
\param stream Input stream to be parsed.
\return The document itself for fluent API.
*/
template <unsigned parseFlags, typename Stream>
GenericDocument& ParseStream(Stream& stream) {
template <unsigned parseFlags, typename SourceEncoding, typename InputStream>
GenericDocument& ParseStream(InputStream& is) {
ValueType::SetNull(); // Remove existing root if exist
GenericReader<Encoding> reader;
if (reader.Parse<parseFlags>(stream, *this)) {
GenericReader<SourceEncoding, Encoding> reader;
if (reader.Parse<parseFlags>(is, *this)) {
RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object
RawAssign(*stack_.template Pop<ValueType>(1));
parseError_ = 0;
......@@ -720,21 +720,31 @@ public:
\param str Mutable zero-terminated string to be parsed.
\return The document itself for fluent API.
*/
template <unsigned parseFlags>
template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& ParseInsitu(Ch* str) {
GenericInsituStringStream<Encoding> s(str);
return ParseStream<parseFlags | kParseInsituFlag>(s);
return ParseStream<parseFlags | kParseInsituFlag, SourceEncoding>(s);
}
//! Convenience overload of ParseInsitu() taking only the parse flags.
/*! Forwards to the two-parameter ParseInsitu<parseFlags, SourceEncoding>() with
    the document's own Encoding supplied as the source encoding.
    \tparam parseFlags Parse flags passed through unchanged.
    \param str String handed to the forwarded-to overload.
    \return Whatever the forwarded-to overload returns.
*/
template <unsigned parseFlags>
GenericDocument& ParseInsitu(Ch* str) {
return ParseInsitu<parseFlags, Encoding>(str);
}
//! Parse JSON text from a read-only string.
/*! \tparam parseFlags Combination of ParseFlag (must not contain kParseInsituFlag).
\param str Read-only zero-terminated string to be parsed.
*/
template <unsigned parseFlags>
template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& Parse(const Ch* str) {
RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag));
GenericStringStream<Encoding> s(str);
return ParseStream<parseFlags>(s);
GenericStringStream<SourceEncoding> s(str);
return ParseStream<parseFlags, SourceEncoding>(s);
}
//! Convenience overload of Parse() taking only the parse flags.
/*! Forwards to the two-parameter Parse<parseFlags, SourceEncoding>() with the
    document's own Encoding supplied as the source encoding.
    \tparam parseFlags Parse flags passed through unchanged.
    \param str String handed to the forwarded-to overload.
    \return Whatever the forwarded-to overload returns.
*/
template <unsigned parseFlags>
GenericDocument& Parse(const Ch* str) {
return Parse<parseFlags, Encoding>(str);
}
//! Whether a parse error occurred in the last parsing.
......@@ -752,8 +762,8 @@ public:
//! Get the capacity of stack in bytes.
size_t GetStackCapacity() const { return stack_.GetCapacity(); }
private:
friend class GenericReader<Encoding>; // for Reader to call the following private handler functions
//private:
//friend class GenericReader<Encoding>; // for Reader to call the following private handler functions
// Implementation of Handler
// Handler callback: pushes room for one ValueType onto stack_ and default-constructs
// a ValueType there with placement new (presumably a null value -- confirm against ValueType's default constructor).
void Null() { new (stack_.template Push<ValueType>()) ValueType(); }
......@@ -785,6 +795,7 @@ private:
stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator());
}
private:
void ClearStack() {
if (Allocator::kNeedFree)
while (stack_.GetSize() > 0) // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects)
......
......@@ -355,24 +355,39 @@ struct UTF8 {
}
}
//! Decode one UTF-8 encoded code point from an input stream.
/*! Reads the leading byte, classifies it with GetType(), then reads the trailing
    bytes required by that class while accumulating their low 6 bits into
    \c *codepoint.
    \tparam InputStream Type providing \c Take(), returning the next code unit.
    \param is Input stream positioned at the first byte of a sequence.
    \param codepoint Receives the decoded code point. Its value is not meaningful
        when the function returns false.
    \return true if every byte read had a class permitted at its position;
        false otherwise.
    \note Once the leading byte selects a sequence length, all trailing bytes of
        that length are consumed even if an earlier one failed its check. A
        leading byte with no valid sequence class consumes only itself.
*/
template <typename InputStream>
static bool Decode(InputStream& is, unsigned* codepoint) {
// COPY : take the next byte and append its 6 payload bits to *codepoint.
// TRANS: require the class of the byte just taken to intersect 'mask'.
// TAIL : COPY + require any continuation class (0x10 | 0x20 | 0x40 == 0x70).
#define COPY() c = is.Take(); *codepoint = (*codepoint << 6) | ((unsigned char)c & 0x3Fu)
#define TRANS(mask) result &= ((GetType(c) & mask) != 0)
#define TAIL() COPY(); TRANS(0x70)
    Ch c = is.Take();
    if (!(c & 0x80)) {
        // Single-byte (ASCII) sequence.
        *codepoint = (unsigned char)c;
        return true;
    }

    unsigned char type = GetType(c);
    // Classes 2..6 equal the number of non-payload high bits in the leading byte,
    // so shifting 0xFF by the class yields the payload mask. Every other class
    // (8, 10, 11 and the continuation flags 0x10/0x20/0x40) contributes no bits;
    // it is handled explicitly because shifting by 0x20 or 0x40 would be
    // undefined behaviour.
    *codepoint = (type < 8) ? ((0xFFu >> type) & (unsigned char)c) : 0u;
    bool result = true;
    switch (type) {
    case 2: TAIL(); return result;                                  // C2..DF: one trailing byte
    case 3: TAIL(); TAIL(); return result;                          // E1..EC, EE, EF: two trailing bytes
    case 4: COPY(); TRANS(0x50); TAIL(); return result;             // ED: second byte limited to 80..9F
    case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result;     // F4: second byte limited to 80..8F, then two trailing bytes
    case 6: TAIL(); TAIL(); TAIL(); return result;                  // F1..F3: three trailing bytes
    case 10: COPY(); TRANS(0x20); TAIL(); return result;            // E0: second byte limited to A0..BF
    case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result;    // F0: second byte limited to 90..BF, then two trailing bytes
    default: return false;                                          // not a valid leading byte
    }
#undef COPY
#undef TRANS
#undef TAIL
}
template <typename InputStream, typename OutputStream>
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
// Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
// With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
static const unsigned char type[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
};
#define COPY() os.Put(c = is.Take())
#define TRANS(mask) result &= ((type[(unsigned char)c] & mask) != 0)
#define TRANS(mask) result &= ((GetType(c) & mask) != 0)
#define TAIL() COPY(); TRANS(0x70)
Ch c;
COPY();
......@@ -380,7 +395,7 @@ struct UTF8 {
return true;
bool result = true;
switch (type[(unsigned char)c]) {
switch (GetType(c)) {
case 2: TAIL(); return result;
case 3: TAIL(); TAIL(); return result;
case 4: COPY(); TRANS(0x50); TAIL(); return result;
......@@ -394,6 +409,24 @@ struct UTF8 {
#undef TRANS
#undef TAIL
}
//! Look up the decoder class of a single UTF-8 byte.
/*! The table follows the DFA published at
    http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, except that classes 1, 7 and 9
    are stored as the bit flags 0x10, 0x20 and 0x40 so that one AND against a
    mask can test for several classes at once.
    \param c Byte to classify.
    \return Class value stored for \c c.
*/
RAPIDJSON_FORCEINLINE static unsigned char GetType(unsigned char c) {
    static const unsigned char kByteClass[] = {
        // 0x00..0x7F
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        // 0x80..0x8F
        0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
        // 0x90..0x9F
        0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
        // 0xA0..0xBF
        0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
        0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
        // 0xC0..0xDF
        8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        // 0xE0..0xFF
        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
    };
    return kByteClass[c];
}
};
///////////////////////////////////////////////////////////////////////////////
......@@ -464,6 +497,41 @@ struct UTF32 {
}
};
///////////////////////////////////////////////////////////////////////////////
// Transcoder
//! Converts text from one encoding to another, one code point at a time.
/*! \tparam SourceEncoding Encoding providing \c Decode(is, &codepoint).
    \tparam TargetEncoding Encoding providing \c Encode(os, codepoint).
*/
template<typename SourceEncoding, typename TargetEncoding>
struct Transcoder {
    //! Decode one code point from \c is and encode it into \c os.
    /*! \return false, without writing anything, if decoding fails; true otherwise. */
    template<typename InputStream, typename OutputStream>
    static bool Transcode(InputStream& is, OutputStream& os) {
        unsigned decoded;
        const bool ok = SourceEncoding::Decode(is, &decoded);
        if (ok)
            TargetEncoding::Encode(os, decoded);
        return ok;
    }

    //! Validating conversion; here it is the same operation as Transcode().
    template<typename InputStream, typename OutputStream>
    static bool Validate(InputStream& is, OutputStream& os) {
        const bool ok = Transcode(is, os);
        return ok;
    }
};

//! Partial specialization used when source and target encodings are the same type.
template<typename Encoding>
struct Transcoder<Encoding, Encoding> {
    //! Copy a single code unit from \c is to \c os unchanged; always succeeds.
    template<typename InputStream, typename OutputStream>
    static bool Transcode(InputStream& is, OutputStream& os) {
        os.Put(is.Take());
        return true;
    }

    //! Delegate to the encoding's own Validate() and return its result.
    template<typename InputStream, typename OutputStream>
    static bool Validate(InputStream& is, OutputStream& os) {
        const bool ok = Encoding::Validate(is, os);
        return ok;
    }
};
///////////////////////////////////////////////////////////////////////////////
// Stream
......
......@@ -189,10 +189,10 @@ template<> inline void SkipWhitespace(StringStream& stream) {
\tparam SourceEncoding Encoding of the input stream.
\tparam TargetEncoding Encoding of the parse output.
\tparam Allocator Allocator type for stack.
*/
template <typename Encoding, typename Allocator = MemoryPoolAllocator<> >
template <typename SourceEncoding, typename TargetEncoding, typename Allocator = MemoryPoolAllocator<> >
class GenericReader {
public:
typedef typename Encoding::Ch Ch;
typedef typename SourceEncoding::Ch Ch;
//! Constructor.
/*! \param allocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing)
......@@ -365,8 +365,8 @@ private:
struct StackStream {
StackStream(internal::Stack<Allocator>& stack) : stack_(stack), length_(0) {}
void Put(Ch c) {
*stack_.template Push<Ch>() = c;
void Put(typename TargetEncoding::Ch c) {
*stack_.template Push<typename TargetEncoding::Ch>() = c;
++length_;
}
internal::Stack<Allocator>& stack_;
......@@ -382,12 +382,12 @@ private:
ParseStringToStream<parseFlags>(s, s);
size_t length = s.PutEnd(head) - 1;
RAPIDJSON_ASSERT(length <= 0xFFFFFFFF);
handler.String(head, SizeType(length), false);
handler.String((typename TargetEncoding::Ch*)head, SizeType(length), false);
}
else {
StackStream stackStream(stack_);
ParseStringToStream<parseFlags>(s, stackStream);
handler.String(stack_.template Pop<Ch>(stackStream.length_), stackStream.length_ - 1, true);
handler.String(stack_.template Pop<typename TargetEncoding::Ch>(stackStream.length_), stackStream.length_ - 1, true);
}
stream = s; // Restore stream
}
......@@ -427,7 +427,7 @@ private:
RAPIDJSON_PARSE_ERROR("The second \\u in surrogate pair is invalid", input.Tell() - 2);
codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000;
}
Encoding::Encode(output, codepoint);
TargetEncoding::Encode(output, codepoint);
}
else
RAPIDJSON_PARSE_ERROR("Unknown escape character", input.Tell() - 1);
......@@ -441,12 +441,12 @@ private:
RAPIDJSON_PARSE_ERROR("lacks ending quotation before the end of string", input.Tell() - 1);
else if ((unsigned)c < 0x20) // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
RAPIDJSON_PARSE_ERROR("Incorrect unescaped character in string", input.Tell() - 1);
else if (parseFlags & kParseValidateEncodingFlag) {
if (!Encoding::Validate(input, output))
else {
if (parseFlags & kParseValidateEncodingFlag ?
!Transcoder<SourceEncoding, TargetEncoding>::Validate(input, output) :
!Transcoder<SourceEncoding, TargetEncoding>::Transcode(input, output))
RAPIDJSON_PARSE_ERROR("Invalid encoding", input.Tell());
}
else
output.Put(input.Take()); // Normal character, just copy
}
}
......@@ -632,7 +632,7 @@ private:
}; // class GenericReader
//! Reader with UTF8 encoding and default allocator.
typedef GenericReader<UTF8<> > Reader;
typedef GenericReader<UTF8<>, UTF8<> > Reader;
} // namespace rapidjson
......
......@@ -203,14 +203,14 @@ TEST(Reader, ParseString) {
Encoding::Ch* buffer = StrDup(x); \
GenericInsituStringStream<Encoding> is(buffer); \
ParseStringHandler<Encoding> h; \
GenericReader<Encoding> reader; \
GenericReader<Encoding, Encoding> reader; \
reader.ParseString<kParseInsituFlag | kParseValidateEncodingFlag>(is, h); \
EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h.str_)); \
EXPECT_EQ(StrLen(e), h.length_); \
free(buffer); \
GenericStringStream<Encoding> s(x); \
ParseStringHandler<Encoding> h2; \
GenericReader<Encoding> reader2; \
GenericReader<Encoding, Encoding> reader2; \
reader2.ParseString<0>(s, h2); \
EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h2.str_)); \
EXPECT_EQ(StrLen(e), h2.length_); \
......@@ -277,6 +277,17 @@ TEST(Reader, ParseString) {
}
}
// Reads a JSON string literal through a UTF8<> stream with a reader whose target
// encoding is UTF16<>, then compares the handler's result with a wide literal.
TEST(Reader, ParseString_Transcoding) {
    const char* json = "\"Hello\"";
    const wchar_t* expected = L"Hello";

    GenericStringStream<UTF8<> > input(json);
    GenericReader<UTF8<>, UTF16<> > transcodingReader;
    ParseStringHandler<UTF16<> > handler;
    transcodingReader.ParseString<0>(input, handler);

    EXPECT_EQ(0, StrCmp<UTF16<>::Ch>(expected, handler.str_));
    EXPECT_EQ(StrLen(expected), handler.length_);
}
TEST(Reader, ParseString_NonDestructive) {
StringStream s("\"Hello\\nWorld\"");
ParseStringHandler<UTF8<> > h;
......@@ -403,7 +414,7 @@ TEST(Reader, ParseArray_Error) {
strncpy(buffer, str, 1000); \
InsituStringStream s(buffer); \
BaseReaderHandler<> h; \
GenericReader<UTF8<>, CrtAllocator> reader; \
GenericReader<UTF8<>, UTF8<>, CrtAllocator> reader; \
EXPECT_FALSE(reader.Parse<0>(s, h)); \
}
......@@ -507,7 +518,7 @@ TEST(Reader, ParseObject_Error) {
strncpy(buffer, str, 1000); \
InsituStringStream s(buffer); \
BaseReaderHandler<> h; \
GenericReader<UTF8<>, CrtAllocator> reader; \
GenericReader<UTF8<>, UTF8<>, CrtAllocator> reader; \
EXPECT_FALSE(reader.Parse<0>(s, h)); \
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment