Commit a8d631fb authored by miloyip@gmail.com

Added Transcoder for converting Encoding during parsing.

git-svn-id: https://rapidjson.googlecode.com/svn/trunk@39 c5894555-1306-4e8d-425f-1f6f381ee07c
parent bdf6da64
...@@ -697,11 +697,11 @@ public: ...@@ -697,11 +697,11 @@ public:
\param stream Input stream to be parsed. \param stream Input stream to be parsed.
\return The document itself for fluent API. \return The document itself for fluent API.
*/ */
template <unsigned parseFlags, typename Stream> template <unsigned parseFlags, typename SourceEncoding, typename InputStream>
GenericDocument& ParseStream(Stream& stream) { GenericDocument& ParseStream(InputStream& is) {
ValueType::SetNull(); // Remove existing root if exist ValueType::SetNull(); // Remove existing root if exist
GenericReader<Encoding> reader; GenericReader<SourceEncoding, Encoding> reader;
if (reader.Parse<parseFlags>(stream, *this)) { if (reader.Parse<parseFlags>(is, *this)) {
RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object
RawAssign(*stack_.template Pop<ValueType>(1)); RawAssign(*stack_.template Pop<ValueType>(1));
parseError_ = 0; parseError_ = 0;
...@@ -720,21 +720,31 @@ public: ...@@ -720,21 +720,31 @@ public:
\param str Mutable zero-terminated string to be parsed. \param str Mutable zero-terminated string to be parsed.
\return The document itself for fluent API. \return The document itself for fluent API.
*/ */
template <unsigned parseFlags> template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& ParseInsitu(Ch* str) { GenericDocument& ParseInsitu(Ch* str) {
GenericInsituStringStream<Encoding> s(str); GenericInsituStringStream<Encoding> s(str);
return ParseStream<parseFlags | kParseInsituFlag>(s); return ParseStream<parseFlags | kParseInsituFlag, SourceEncoding>(s);
}
template <unsigned parseFlags>
GenericDocument& ParseInsitu(Ch* str) {
return ParseInsitu<parseFlags, Encoding>(str);
} }
//! Parse JSON text from a read-only string. //! Parse JSON text from a read-only string.
/*! \tparam parseFlags Combination of ParseFlag (must not contain kParseInsituFlag). /*! \tparam parseFlags Combination of ParseFlag (must not contain kParseInsituFlag).
\param str Read-only zero-terminated string to be parsed. \param str Read-only zero-terminated string to be parsed.
*/ */
template <unsigned parseFlags> template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& Parse(const Ch* str) { GenericDocument& Parse(const Ch* str) {
RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag)); RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag));
GenericStringStream<Encoding> s(str); GenericStringStream<SourceEncoding> s(str);
return ParseStream<parseFlags>(s); return ParseStream<parseFlags, SourceEncoding>(s);
}
template <unsigned parseFlags>
GenericDocument& Parse(const Ch* str) {
return Parse<parseFlags, Encoding>(str);
} }
//! Whether a parse error has occurred in the last parsing. //! Whether a parse error has occurred in the last parsing.
...@@ -752,8 +762,8 @@ public: ...@@ -752,8 +762,8 @@ public:
//! Get the capacity of stack in bytes. //! Get the capacity of stack in bytes.
size_t GetStackCapacity() const { return stack_.GetCapacity(); } size_t GetStackCapacity() const { return stack_.GetCapacity(); }
private: //private:
friend class GenericReader<Encoding>; // for Reader to call the following private handler functions //friend class GenericReader<Encoding>; // for Reader to call the following private handler functions
// Implementation of Handler // Implementation of Handler
void Null() { new (stack_.template Push<ValueType>()) ValueType(); } void Null() { new (stack_.template Push<ValueType>()) ValueType(); }
...@@ -785,6 +795,7 @@ private: ...@@ -785,6 +795,7 @@ private:
stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator()); stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator());
} }
private:
void ClearStack() { void ClearStack() {
if (Allocator::kNeedFree) if (Allocator::kNeedFree)
while (stack_.GetSize() > 0) // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects) while (stack_.GetSize() > 0) // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects)
......
...@@ -355,24 +355,39 @@ struct UTF8 { ...@@ -355,24 +355,39 @@ struct UTF8 {
} }
} }
//! Decode one UTF-8 sequence from an input stream into a Unicode code point.
/*! \tparam InputStream Stream type providing Take(), which returns the next code unit.
    \param is Input stream positioned at the first byte of a sequence.
    \param codepoint Receives the decoded code point. Only meaningful when the function returns true.
    \return true if the bytes consumed form a well-formed UTF-8 sequence, false otherwise.
    \note For a valid lead byte, all bytes the lead byte calls for are consumed even if one of them
          is invalid (the verdict is accumulated in a flag instead of returning early).
          For an invalid lead byte only that single byte is consumed.
*/
template <typename InputStream>
static bool Decode(InputStream& is, unsigned* codepoint) {
// COPY:  take the next byte and append its low 6 payload bits to the code point.
// TRANS: require the byte just taken to belong to one of the GetType() classes in mask.
// TAIL:  COPY plus a check for any continuation byte 0x80..0xBF (classes 0x10 | 0x20 | 0x40).
#define COPY() c = is.Take(); *codepoint = (*codepoint << 6) | ((unsigned char)c & 0x3Fu)
#define TRANS(mask) result &= ((GetType(c) & mask) != 0)
#define TAIL() COPY(); TRANS(0x70)
    Ch c = is.Take();
    if (!(c & 0x80)) {
        // Single-byte (ASCII) sequence.
        *codepoint = (unsigned char)c;
        return true;
    }

    // The class number doubles as a shift count that strips the length-marker bits
    // from the lead byte. For classes 10 and 11 (0xE0, 0xF0) the payload bits are all zero anyway.
    unsigned char type = GetType(c);
    *codepoint = (0xFF >> type) & (unsigned char)c;
    bool result = true;
    switch (type) {
    case 2:  TAIL(); return result;                                  // 0xC2..0xDF: 2-byte sequence
    case 3:  TAIL(); TAIL(); return result;                          // 0xE1..0xEC, 0xEE, 0xEF: 3-byte sequence
    case 4:  COPY(); TRANS(0x50); TAIL(); return result;             // 0xED: 2nd byte 0x80..0x9F (no surrogates)
    case 5:  COPY(); TRANS(0x10); TAIL(); TAIL(); return result;     // 0xF4: 2nd byte 0x80..0x8F, then two validated tails
    case 6:  TAIL(); TAIL(); TAIL(); return result;                  // 0xF1..0xF3: 4-byte sequence
    case 10: COPY(); TRANS(0x20); TAIL(); return result;             // 0xE0: 2nd byte 0xA0..0xBF (no overlong forms)
    case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result;     // 0xF0: 2nd byte 0x90..0xBF, then two tails (4 bytes total)
    default: return false;                                           // continuation byte or invalid lead byte
    }
#undef COPY
#undef TRANS
#undef TAIL
}
template <typename InputStream, typename OutputStream> template <typename InputStream, typename OutputStream>
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
// Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
// With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
static const unsigned char type[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
};
#define COPY() os.Put(c = is.Take()) #define COPY() os.Put(c = is.Take())
#define TRANS(mask) result &= ((type[(unsigned char)c] & mask) != 0) #define TRANS(mask) result &= ((GetType(c) & mask) != 0)
#define TAIL() COPY(); TRANS(0x70) #define TAIL() COPY(); TRANS(0x70)
Ch c; Ch c;
COPY(); COPY();
...@@ -380,7 +395,7 @@ struct UTF8 { ...@@ -380,7 +395,7 @@ struct UTF8 {
return true; return true;
bool result = true; bool result = true;
switch (type[(unsigned char)c]) { switch (GetType(c)) {
case 2: TAIL(); return result; case 2: TAIL(); return result;
case 3: TAIL(); TAIL(); return result; case 3: TAIL(); TAIL(); return result;
case 4: COPY(); TRANS(0x50); TAIL(); return result; case 4: COPY(); TRANS(0x50); TAIL(); return result;
...@@ -394,6 +409,24 @@ struct UTF8 { ...@@ -394,6 +409,24 @@ struct UTF8 {
#undef TRANS #undef TRANS
#undef TAIL #undef TAIL
} }
//! Classify a byte for UTF-8 decoding and validation.
/*! \param c Byte to classify.
    \return The class of \c c, read from the 256-entry table below.
        Continuation bytes get single-bit classes (0x10, 0x20, 0x40) so that a caller can test
        membership in several classes with one AND; all other bytes get small integer classes.
*/
RAPIDJSON_FORCEINLINE static unsigned char GetType(unsigned char c) {
// Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
// With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
// Table layout, indexed by byte value:
//   0x00..0x7F -> 0      0x80..0x8F -> 0x10    0x90..0x9F -> 0x40    0xA0..0xBF -> 0x20
//   0xC0,0xC1  -> 8      0xC2..0xDF -> 2       0xE0 -> 10            0xE1..0xEC,0xEE,0xEF -> 3
//   0xED -> 4            0xF0 -> 11            0xF1..0xF3 -> 6       0xF4 -> 5       0xF5..0xFF -> 8
static const unsigned char type[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
};
// c is an unsigned char, so it always indexes within the 256 entries.
return type[c];
}
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
...@@ -464,6 +497,41 @@ struct UTF32 { ...@@ -464,6 +497,41 @@ struct UTF32 {
} }
}; };
///////////////////////////////////////////////////////////////////////////////
// Transcoder
//! Converts text from one encoding to another, one code point per call.
/*! \tparam SourceEncoding Encoding to read; must provide Decode(is, &codepoint).
    \tparam TargetEncoding Encoding to write; must provide Encode(os, codepoint).
*/
template<typename SourceEncoding, typename TargetEncoding>
struct Transcoder {
    //! Read one code point from \c is and write it to \c os in the target encoding.
    /*! \return false if SourceEncoding::Decode() fails; nothing is written in that case. */
    template<typename InputStream, typename OutputStream>
    static bool Transcode(InputStream& is, OutputStream& os) {
        unsigned decoded;
        if (SourceEncoding::Decode(is, &decoded)) {
            TargetEncoding::Encode(os, decoded);
            return true;
        }
        return false;
    }

    //! Validating variant; identical to Transcode() because every code point is decoded anyway.
    template<typename InputStream, typename OutputStream>
    static bool Validate(InputStream& is, OutputStream& os) {
        return Transcoder::Transcode(is, os);
    }
};

//! Specialization of Transcoder with same source and target encoding.
/*! No conversion is needed, so Transcode() is a raw copy and Validate() defers to the encoding itself. */
template<typename Encoding>
struct Transcoder<Encoding, Encoding> {
    //! Copy a single code unit from \c is to \c os without inspecting it; always succeeds.
    template<typename InputStream, typename OutputStream>
    static bool Transcode(InputStream& is, OutputStream& os) {
        os.Put(is.Take());
        return true;
    }

    //! Forward to Encoding::Validate() and return its result.
    template<typename InputStream, typename OutputStream>
    static bool Validate(InputStream& is, OutputStream& os) {
        return Encoding::Validate(is, os);
    }
};
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Stream // Stream
......
...@@ -189,10 +189,10 @@ template<> inline void SkipWhitespace(StringStream& stream) { ...@@ -189,10 +189,10 @@ template<> inline void SkipWhitespace(StringStream& stream) {
\tparam Encoding Encoding of both the stream and the parse output. \tparam SourceEncoding Encoding of the input stream. \tparam TargetEncoding Encoding of the parse output passed to the handler.
\tparam Allocator Allocator type for stack. \tparam Allocator Allocator type for stack.
*/ */
template <typename Encoding, typename Allocator = MemoryPoolAllocator<> > template <typename SourceEncoding, typename TargetEncoding, typename Allocator = MemoryPoolAllocator<> >
class GenericReader { class GenericReader {
public: public:
typedef typename Encoding::Ch Ch; typedef typename SourceEncoding::Ch Ch;
//! Constructor. //! Constructor.
/*! \param allocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing) /*! \param allocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing)
...@@ -365,8 +365,8 @@ private: ...@@ -365,8 +365,8 @@ private:
struct StackStream { struct StackStream {
StackStream(internal::Stack<Allocator>& stack) : stack_(stack), length_(0) {} StackStream(internal::Stack<Allocator>& stack) : stack_(stack), length_(0) {}
void Put(Ch c) { void Put(typename TargetEncoding::Ch c) {
*stack_.template Push<Ch>() = c; *stack_.template Push<typename TargetEncoding::Ch>() = c;
++length_; ++length_;
} }
internal::Stack<Allocator>& stack_; internal::Stack<Allocator>& stack_;
...@@ -382,12 +382,12 @@ private: ...@@ -382,12 +382,12 @@ private:
ParseStringToStream<parseFlags>(s, s); ParseStringToStream<parseFlags>(s, s);
size_t length = s.PutEnd(head) - 1; size_t length = s.PutEnd(head) - 1;
RAPIDJSON_ASSERT(length <= 0xFFFFFFFF); RAPIDJSON_ASSERT(length <= 0xFFFFFFFF);
handler.String(head, SizeType(length), false); handler.String((typename TargetEncoding::Ch*)head, SizeType(length), false);
} }
else { else {
StackStream stackStream(stack_); StackStream stackStream(stack_);
ParseStringToStream<parseFlags>(s, stackStream); ParseStringToStream<parseFlags>(s, stackStream);
handler.String(stack_.template Pop<Ch>(stackStream.length_), stackStream.length_ - 1, true); handler.String(stack_.template Pop<typename TargetEncoding::Ch>(stackStream.length_), stackStream.length_ - 1, true);
} }
stream = s; // Restore stream stream = s; // Restore stream
} }
...@@ -427,7 +427,7 @@ private: ...@@ -427,7 +427,7 @@ private:
RAPIDJSON_PARSE_ERROR("The second \\u in surrogate pair is invalid", input.Tell() - 2); RAPIDJSON_PARSE_ERROR("The second \\u in surrogate pair is invalid", input.Tell() - 2);
codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000;
} }
Encoding::Encode(output, codepoint); TargetEncoding::Encode(output, codepoint);
} }
else else
RAPIDJSON_PARSE_ERROR("Unknown escape character", input.Tell() - 1); RAPIDJSON_PARSE_ERROR("Unknown escape character", input.Tell() - 1);
...@@ -441,12 +441,12 @@ private: ...@@ -441,12 +441,12 @@ private:
RAPIDJSON_PARSE_ERROR("lacks ending quotation before the end of string", input.Tell() - 1); RAPIDJSON_PARSE_ERROR("lacks ending quotation before the end of string", input.Tell() - 1);
else if ((unsigned)c < 0x20) // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF else if ((unsigned)c < 0x20) // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
RAPIDJSON_PARSE_ERROR("Incorrect unescaped character in string", input.Tell() - 1); RAPIDJSON_PARSE_ERROR("Incorrect unescaped character in string", input.Tell() - 1);
else if (parseFlags & kParseValidateEncodingFlag) { else {
if (!Encoding::Validate(input, output)) if (parseFlags & kParseValidateEncodingFlag ?
!Transcoder<SourceEncoding, TargetEncoding>::Validate(input, output) :
!Transcoder<SourceEncoding, TargetEncoding>::Transcode(input, output))
RAPIDJSON_PARSE_ERROR("Invalid encoding", input.Tell()); RAPIDJSON_PARSE_ERROR("Invalid encoding", input.Tell());
} }
else
output.Put(input.Take()); // Normal character, just copy
} }
} }
...@@ -632,7 +632,7 @@ private: ...@@ -632,7 +632,7 @@ private:
}; // class GenericReader }; // class GenericReader
//! Reader with UTF8 encoding and default allocator. //! Reader with UTF8 encoding and default allocator.
typedef GenericReader<UTF8<> > Reader; typedef GenericReader<UTF8<>, UTF8<> > Reader;
} // namespace rapidjson } // namespace rapidjson
......
...@@ -203,14 +203,14 @@ TEST(Reader, ParseString) { ...@@ -203,14 +203,14 @@ TEST(Reader, ParseString) {
Encoding::Ch* buffer = StrDup(x); \ Encoding::Ch* buffer = StrDup(x); \
GenericInsituStringStream<Encoding> is(buffer); \ GenericInsituStringStream<Encoding> is(buffer); \
ParseStringHandler<Encoding> h; \ ParseStringHandler<Encoding> h; \
GenericReader<Encoding> reader; \ GenericReader<Encoding, Encoding> reader; \
reader.ParseString<kParseInsituFlag | kParseValidateEncodingFlag>(is, h); \ reader.ParseString<kParseInsituFlag | kParseValidateEncodingFlag>(is, h); \
EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h.str_)); \ EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h.str_)); \
EXPECT_EQ(StrLen(e), h.length_); \ EXPECT_EQ(StrLen(e), h.length_); \
free(buffer); \ free(buffer); \
GenericStringStream<Encoding> s(x); \ GenericStringStream<Encoding> s(x); \
ParseStringHandler<Encoding> h2; \ ParseStringHandler<Encoding> h2; \
GenericReader<Encoding> reader2; \ GenericReader<Encoding, Encoding> reader2; \
reader2.ParseString<0>(s, h2); \ reader2.ParseString<0>(s, h2); \
EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h2.str_)); \ EXPECT_EQ(0, StrCmp<Encoding::Ch>(e, h2.str_)); \
EXPECT_EQ(StrLen(e), h2.length_); \ EXPECT_EQ(StrLen(e), h2.length_); \
...@@ -277,6 +277,17 @@ TEST(Reader, ParseString) { ...@@ -277,6 +277,17 @@ TEST(Reader, ParseString) {
} }
} }
// Parses a JSON string from a UTF-8 stream with a reader whose target encoding is UTF-16,
// then compares the handler's string and length against a wide-character literal.
TEST(Reader, ParseString_Transcoding) {
    const char* json = "\"Hello\"";
    const wchar_t* expected = L"Hello";
    GenericStringStream<UTF8<> > source(json);
    GenericReader<UTF8<>, UTF16<> > transcodingReader;
    ParseStringHandler<UTF16<> > handler;
    transcodingReader.ParseString<0>(source, handler);
    EXPECT_EQ(0, StrCmp<UTF16<>::Ch>(expected, handler.str_));
    EXPECT_EQ(StrLen(expected), handler.length_);
}
TEST(Reader, ParseString_NonDestructive) { TEST(Reader, ParseString_NonDestructive) {
StringStream s("\"Hello\\nWorld\""); StringStream s("\"Hello\\nWorld\"");
ParseStringHandler<UTF8<> > h; ParseStringHandler<UTF8<> > h;
...@@ -403,7 +414,7 @@ TEST(Reader, ParseArray_Error) { ...@@ -403,7 +414,7 @@ TEST(Reader, ParseArray_Error) {
strncpy(buffer, str, 1000); \ strncpy(buffer, str, 1000); \
InsituStringStream s(buffer); \ InsituStringStream s(buffer); \
BaseReaderHandler<> h; \ BaseReaderHandler<> h; \
GenericReader<UTF8<>, CrtAllocator> reader; \ GenericReader<UTF8<>, UTF8<>, CrtAllocator> reader; \
EXPECT_FALSE(reader.Parse<0>(s, h)); \ EXPECT_FALSE(reader.Parse<0>(s, h)); \
} }
...@@ -507,7 +518,7 @@ TEST(Reader, ParseObject_Error) { ...@@ -507,7 +518,7 @@ TEST(Reader, ParseObject_Error) {
strncpy(buffer, str, 1000); \ strncpy(buffer, str, 1000); \
InsituStringStream s(buffer); \ InsituStringStream s(buffer); \
BaseReaderHandler<> h; \ BaseReaderHandler<> h; \
GenericReader<UTF8<>, CrtAllocator> reader; \ GenericReader<UTF8<>, UTF8<>, CrtAllocator> reader; \
EXPECT_FALSE(reader.Parse<0>(s, h)); \ EXPECT_FALSE(reader.Parse<0>(s, h)); \
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment