Commit dd25c965 authored by Milo Yip

Merge pull request #553 from miloyip/issue158_parsestdstring

Issue158 parsestdstring
parents ff12c04a 3595b1f6
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
#include "reader.h" #include "reader.h"
#include "internal/meta.h" #include "internal/meta.h"
#include "internal/strfunc.h" #include "internal/strfunc.h"
#include "memorystream.h"
#include "encodedstream.h"
#include <new> // placement new #include <new> // placement new
#ifdef _MSC_VER #ifdef _MSC_VER
...@@ -2224,6 +2226,42 @@ public: ...@@ -2224,6 +2226,42 @@ public:
GenericDocument& Parse(const Ch* str) { GenericDocument& Parse(const Ch* str) {
return Parse<kParseDefaultFlags>(str); return Parse<kParseDefaultFlags>(str);
} }
//! Parse JSON text from a buffer of explicit length, with an explicit source encoding.
/*! Exactly \c length characters are read through a MemoryStream, so the buffer
    does not have to be null-terminated.
    \tparam parseFlags Combination of parse flags; must not contain kParseInsituFlag.
    \tparam SourceEncoding Encoding of the input text.
    \param str Read-only source buffer.
    \param length Number of SourceEncoding::Ch characters in \c str.
    \return Reference to this document.
*/
template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& Parse(const typename SourceEncoding::Ch* str, size_t length) {
    RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag));
    // MemoryStream is a byte stream. reinterpret_cast (not static_cast) is required so that
    // this also compiles when SourceEncoding::Ch is not char (e.g. wchar_t for UTF16<>).
    MemoryStream ms(reinterpret_cast<const char*>(str), length * sizeof(typename SourceEncoding::Ch));
    EncodedInputStream<SourceEncoding, MemoryStream> is(ms);
    ParseStream<parseFlags, SourceEncoding>(is);
    return *this;
}
//! Parse JSON text from a buffer of explicit length, using the document's own Encoding as source encoding.
/*! Forwards to Parse<parseFlags, Encoding>(str, length).
    \tparam parseFlags Combination of parse flags.
    \param str Read-only source buffer.
    \param length Number of characters in \c str.
*/
template <unsigned parseFlags>
GenericDocument& Parse(const Ch* str, size_t length) {
return Parse<parseFlags, Encoding>(str, length);
}
//! Parse JSON text from a buffer of explicit length, with kParseDefaultFlags.
/*! \param str Read-only source buffer.
    \param length Number of characters in \c str.
*/
GenericDocument& Parse(const Ch* str, size_t length) {
return Parse<kParseDefaultFlags>(str, length);
}
#if RAPIDJSON_HAS_STDSTRING
//! Parse JSON text held in a std::basic_string, with an explicit source encoding.
/*! Forwards the string's c_str() to the pointer-only Parse overload.
    NOTE(review): that overload receives no length, so text after an embedded
    null character in \c str is presumably not parsed -- confirm.
    \tparam parseFlags Combination of parse flags.
    \tparam SourceEncoding Encoding of the input text.
    \param str Source string.
*/
template <unsigned parseFlags, typename SourceEncoding>
GenericDocument& Parse(const std::basic_string<typename SourceEncoding::Ch>& str) {
// c_str() is constant complexity according to standard. Should be faster than Parse(const char*, size_t)
return Parse<parseFlags, SourceEncoding>(str.c_str());
}
//! Parse JSON text held in a std::basic_string, using the document's own Encoding as source encoding.
/*! Forwards to Parse<parseFlags, Encoding>(str).
    \tparam parseFlags Combination of parse flags.
    \param str Source string.
*/
template <unsigned parseFlags>
GenericDocument& Parse(const std::basic_string<Ch>& str) {
return Parse<parseFlags, Encoding>(str);
}
//! Parse JSON text held in a std::basic_string, with kParseDefaultFlags.
/*! \param str Source string. */
GenericDocument& Parse(const std::basic_string<Ch>& str) {
return Parse<kParseDefaultFlags>(str);
}
#endif // RAPIDJSON_HAS_STDSTRING
//!@} //!@}
//!@name Handling parse errors //!@name Handling parse errors
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#define RAPIDJSON_ENCODEDSTREAM_H_ #define RAPIDJSON_ENCODEDSTREAM_H_
#include "stream.h" #include "stream.h"
#include "memorystream.h"
#ifdef __GNUC__ #ifdef __GNUC__
RAPIDJSON_DIAG_PUSH RAPIDJSON_DIAG_PUSH
...@@ -62,6 +63,30 @@ private: ...@@ -62,6 +63,30 @@ private:
Ch current_; Ch current_;
}; };
//! Specialized for UTF8 MemoryStream.
/*!
    Reads are forwarded directly to the wrapped MemoryStream once a leading
    UTF-8 byte order mark has been skipped. The write-side members are
    no-op stubs.
*/
template <>
class EncodedInputStream<UTF8<>, MemoryStream> {
public:
    typedef UTF8<>::Ch Ch;

    //! Wrap \c is and skip a leading UTF-8 BOM (EF BB BF).
    EncodedInputStream(MemoryStream& is) : is_(is) {
        // Each BOM byte is only examined after the previous one matched, so a
        // stray leading 0xBB or 0xBF that is not part of a BOM is left in the stream.
        if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) {
            is_.Take();
            if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) {
                is_.Take();
                if (static_cast<unsigned char>(is_.Peek()) == 0xBFu)
                    is_.Take();
            }
        }
    }

    //! Current character, without consuming it.
    Ch Peek() const { return is_.Peek(); }
    //! Current character, consuming it.
    Ch Take() { return is_.Take(); }
    //! Read position as reported by the wrapped stream.
    size_t Tell() const { return is_.Tell(); }

    // Not implemented
    void Put(Ch) {}
    void Flush() {}
    Ch* PutBegin() { return 0; }
    size_t PutEnd(Ch*) { return 0; }

    // Accessible from outside the class: the SIMD SkipWhitespace specialization
    // reads and writes is_.src_ / is_.end_ directly.
    MemoryStream& is_;
};
//! Output byte stream wrapper with statically bound encoding. //! Output byte stream wrapper with statically bound encoding.
/*! /*!
\tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
......
...@@ -42,8 +42,8 @@ struct MemoryStream { ...@@ -42,8 +42,8 @@ struct MemoryStream {
MemoryStream(const Ch *src, size_t size) : src_(src), begin_(src), end_(src + size), size_(size) {} MemoryStream(const Ch *src, size_t size) : src_(src), begin_(src), end_(src + size), size_(size) {}
Ch Peek() const { return (src_ == end_) ? '\0' : *src_; } Ch Peek() const { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_; }
Ch Take() { return (src_ == end_) ? '\0' : *src_++; } Ch Take() { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_++; }
size_t Tell() const { return static_cast<size_t>(src_ - begin_); } size_t Tell() const { return static_cast<size_t>(src_ - begin_); }
Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "allocators.h" #include "allocators.h"
#include "stream.h" #include "stream.h"
#include "encodedstream.h"
#include "internal/meta.h" #include "internal/meta.h"
#include "internal/stack.h" #include "internal/stack.h"
#include "internal/strtod.h" #include "internal/strtod.h"
...@@ -259,6 +260,12 @@ void SkipWhitespace(InputStream& is) { ...@@ -259,6 +260,12 @@ void SkipWhitespace(InputStream& is) {
s.Take(); s.Take();
} }
//! Skip JSON whitespace in the half-open range [p, end).
/*!
    \param p   First character to examine.
    \param end One past the last character that may be examined.
    \return Pointer to the first character that is not ' ', '\n', '\r' or '\t',
            or \c end when the whole range is whitespace.
*/
inline const char* SkipWhitespace(const char* p, const char* end) {
    for (; p != end; ++p) {
        const char ch = *p;
        const bool isWhitespace = (ch == ' ') || (ch == '\n') || (ch == '\r') || (ch == '\t');
        if (!isWhitespace)
            break;
    }
    return p;
}
#ifdef RAPIDJSON_SSE42 #ifdef RAPIDJSON_SSE42
//! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-byte characters at once. //! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-byte characters at once.
inline const char *SkipWhitespace_SIMD(const char* p) { inline const char *SkipWhitespace_SIMD(const char* p) {
...@@ -295,6 +302,34 @@ inline const char *SkipWhitespace_SIMD(const char* p) { ...@@ -295,6 +302,34 @@ inline const char *SkipWhitespace_SIMD(const char* p) {
} }
} }
//! Skip whitespace in [p, end) with the SSE 4.2 pcmpistrm instruction, 16 bytes per step.
/*!
    \param p   First character to examine.
    \param end One past the last character that may be read.
    \return Pointer to the first non-whitespace character, or \c end if there is none.
*/
inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
    // Fast return for single non-whitespace
    if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
        ++p;
    else
        return p;

    // The middle of string using SIMD
    static const char whitespace[16] = " \n\r\t";
    const __m128i w = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespace[0]));

    // Compare the remaining length rather than computing "end - 16": for inputs shorter
    // than 16 bytes that pointer would lie before the buffer, which is undefined behaviour.
    for (; end - p >= 16; p += 16) {
        const __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
        const int r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY));
        if (r != 0) {   // some of characters is non-whitespace
#ifdef _MSC_VER         // Find the index of first non-whitespace
            unsigned long offset;
            _BitScanForward(&offset, r);
            return p + offset;
#else
            return p + __builtin_ffs(r) - 1;
#endif
        }
    }

    // Fewer than 16 bytes left: finish with the scalar, bounds-checked loop.
    return SkipWhitespace(p, end);
}
#elif defined(RAPIDJSON_SSE2) #elif defined(RAPIDJSON_SSE2)
//! Skip whitespace with SSE2 instructions, testing 16 8-byte characters at once. //! Skip whitespace with SSE2 instructions, testing 16 8-byte characters at once.
...@@ -342,6 +377,44 @@ inline const char *SkipWhitespace_SIMD(const char* p) { ...@@ -342,6 +377,44 @@ inline const char *SkipWhitespace_SIMD(const char* p) {
} }
} }
//! Skip whitespace in [p, end) with SSE2 instructions, 16 bytes per step.
/*!
    \param p   First character to examine.
    \param end One past the last character that may be read.
    \return Pointer to the first non-whitespace character, or \c end if there is none.
*/
inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
    // Fast return for single non-whitespace
    if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
        ++p;
    else
        return p;

    // The rest of string
    #define C16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
    static const char whitespaces[4][16] = { C16(' '), C16('\n'), C16('\r'), C16('\t') };
    #undef C16

    const __m128i w0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[0][0]));
    const __m128i w1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[1][0]));
    const __m128i w2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[2][0]));
    const __m128i w3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&whitespaces[3][0]));

    // Compare the remaining length rather than computing "end - 16": for inputs shorter
    // than 16 bytes that pointer would lie before the buffer, which is undefined behaviour.
    for (; end - p >= 16; p += 16) {
        const __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
        // x has a byte set to 0xFF wherever s holds one of the four whitespace characters.
        __m128i x = _mm_cmpeq_epi8(s, w0);
        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1));
        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2));
        x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3));
        // Inverted mask: one bit per byte, set where the byte is NOT whitespace.
        unsigned short r = static_cast<unsigned short>(~_mm_movemask_epi8(x));
        if (r != 0) {   // some of characters may be non-whitespace
#ifdef _MSC_VER         // Find the index of first non-whitespace
            unsigned long offset;
            _BitScanForward(&offset, r);
            return p + offset;
#else
            return p + __builtin_ffs(r) - 1;
#endif
        }
    }

    // Fewer than 16 bytes left: finish with the scalar, bounds-checked loop.
    return SkipWhitespace(p, end);
}
#endif // RAPIDJSON_SSE2 #endif // RAPIDJSON_SSE2
#ifdef RAPIDJSON_SIMD #ifdef RAPIDJSON_SIMD
...@@ -354,6 +427,10 @@ template<> inline void SkipWhitespace(InsituStringStream& is) { ...@@ -354,6 +427,10 @@ template<> inline void SkipWhitespace(InsituStringStream& is) {
template<> inline void SkipWhitespace(StringStream& is) { template<> inline void SkipWhitespace(StringStream& is) {
is.src_ = SkipWhitespace_SIMD(is.src_); is.src_ = SkipWhitespace_SIMD(is.src_);
} }
//! SIMD whitespace skipping for a UTF-8 EncodedInputStream over a MemoryStream.
/*! Advances the wrapped MemoryStream's read pointer (src_) in place, passing its
    end_ pointer as the upper bound for the length-aware SkipWhitespace_SIMD overload.
*/
template<> inline void SkipWhitespace(EncodedInputStream<UTF8<>, MemoryStream>& is) {
is.is_.src_ = SkipWhitespace_SIMD(is.is_.src_, is.is_.end_);
}
#endif // RAPIDJSON_SIMD #endif // RAPIDJSON_SIMD
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
......
...@@ -30,6 +30,8 @@ ...@@ -30,6 +30,8 @@
# define RAPIDJSON_SSE2 # define RAPIDJSON_SSE2
#endif #endif
#define RAPIDJSON_HAS_STDSTRING 1
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Google Test // Google Test
......
...@@ -187,6 +187,25 @@ TEST_F(RapidJson, SIMD_SUFFIX(DocumentParse_MemoryPoolAllocator)) { ...@@ -187,6 +187,25 @@ TEST_F(RapidJson, SIMD_SUFFIX(DocumentParse_MemoryPoolAllocator)) {
} }
} }
// Repeatedly parses the fixture JSON through Parse(const Ch*, size_t),
// building a fresh Document on every trial.
TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseLength_MemoryPoolAllocator)) {
    for (size_t trial = 0; trial < kTrialCount; ++trial) {
        Document document;
        document.Parse(json_, length_);
        ASSERT_TRUE(document.IsObject());
    }
}
#if RAPIDJSON_HAS_STDSTRING
// Repeatedly parses the fixture JSON through Parse(const std::string&).
// The std::string is built once, outside the timed loop.
TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseStdString_MemoryPoolAllocator)) {
    const std::string text(json_, length_);
    for (size_t trial = 0; trial < kTrialCount; ++trial) {
        Document document;
        document.Parse(text);
        ASSERT_TRUE(document.IsObject());
    }
}
#endif
TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseIterative_MemoryPoolAllocator)) { TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseIterative_MemoryPoolAllocator)) {
for (size_t i = 0; i < kTrialCount; i++) { for (size_t i = 0; i < kTrialCount; i++) {
Document doc; Document doc;
......
...@@ -34,6 +34,8 @@ void ParseCheck(DocumentType& doc) { ...@@ -34,6 +34,8 @@ void ParseCheck(DocumentType& doc) {
typedef typename DocumentType::ValueType ValueType; typedef typename DocumentType::ValueType ValueType;
EXPECT_FALSE(doc.HasParseError()); EXPECT_FALSE(doc.HasParseError());
if (doc.HasParseError())
printf("Error: %d at %zu\n", static_cast<int>(doc.GetParseError()), doc.GetErrorOffset());
EXPECT_TRUE(static_cast<ParseResult>(doc)); EXPECT_TRUE(static_cast<ParseResult>(doc));
EXPECT_TRUE(doc.IsObject()); EXPECT_TRUE(doc.IsObject());
...@@ -93,6 +95,26 @@ void ParseTest() { ...@@ -93,6 +95,26 @@ void ParseTest() {
doc.ParseInsitu(buffer); doc.ParseInsitu(buffer);
ParseCheck(doc); ParseCheck(doc);
free(buffer); free(buffer);
// Parse(const Ch*, size_t)
size_t length = strlen(json);
buffer = reinterpret_cast<char*>(malloc(length * 2));
memcpy(buffer, json, length);
memset(buffer + length, 'X', length);
#if RAPIDJSON_HAS_STDSTRING
std::string s2(buffer, length); // backup buffer
#endif
doc.SetNull();
doc.Parse(buffer, length);
free(buffer);
ParseCheck(doc);
#if RAPIDJSON_HAS_STDSTRING
// Parse(std::string)
doc.SetNull();
doc.Parse(s2);
ParseCheck(doc);
#endif
} }
TEST(Document, Parse) { TEST(Document, Parse) {
...@@ -140,6 +162,42 @@ static FILE* OpenEncodedFile(const char* filename) { ...@@ -140,6 +162,42 @@ static FILE* OpenEncodedFile(const char* filename) {
return 0; return 0;
} }
// Parses UTF-8 source text into a UTF-16 document through the
// Parse<flags, SourceEncoding> overloads (explicit length and std::string).
TEST(Document, Parse_Encoding) {
const char* json = " { \"hello\" : \"world\", \"t\" : true , \"f\" : false, \"n\": null, \"i\":123, \"pi\": 3.1416, \"a\":[1, 2, 3, 4] } ";
typedef GenericDocument<UTF16<> > DocumentType;
DocumentType doc;
// Parse<unsigned, SourceEncoding>(const SourceEncoding::Ch*)
// (this case is currently disabled)
// doc.Parse<kParseDefaultFlags, UTF8<> >(json);
// EXPECT_FALSE(doc.HasParseError());
// EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world"));
// Parse<unsigned, SourceEncoding>(const SourceEncoding::Ch*, size_t)
size_t length = strlen(json);
// The buffer is twice the JSON length: the first half holds the JSON (without a
// terminating null), the second half is filled with 'X' so that reading past
// `length` would hit non-JSON characters.
char* buffer = reinterpret_cast<char*>(malloc(length * 2));
memcpy(buffer, json, length);
memset(buffer + length, 'X', length);
#if RAPIDJSON_HAS_STDSTRING
std::string s2(buffer, length); // backup buffer
#endif
doc.SetNull();
doc.Parse<kParseDefaultFlags, UTF8<> >(buffer, length);
// The buffer is released before the document is inspected.
free(buffer);
EXPECT_FALSE(doc.HasParseError());
// Print the error code and offset to help diagnose a failure.
if (doc.HasParseError())
printf("Error: %d at %zu\n", static_cast<int>(doc.GetParseError()), doc.GetErrorOffset());
EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world"));
#if RAPIDJSON_HAS_STDSTRING
// Parse<unsigned, SourceEncoding>(std::string)
doc.SetNull();
doc.Parse<kParseDefaultFlags, UTF8<> >(s2);
EXPECT_FALSE(doc.HasParseError());
EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world"));
#endif
}
TEST(Document, ParseStream_EncodedInputStream) { TEST(Document, ParseStream_EncodedInputStream) {
// UTF8 -> UTF16 // UTF8 -> UTF16
FILE* fp = OpenEncodedFile("utf8.json"); FILE* fp = OpenEncodedFile("utf8.json");
......
...@@ -73,6 +73,28 @@ TEST(SIMD, SIMD_SUFFIX(SkipWhitespace)) { ...@@ -73,6 +73,28 @@ TEST(SIMD, SIMD_SUFFIX(SkipWhitespace)) {
TestSkipWhitespace<InsituStringStream>(); TestSkipWhitespace<InsituStringStream>();
} }
// Exercises SkipWhitespace() on a UTF-8 EncodedInputStream over a MemoryStream:
// for each step in [1, 32) the buffer is whitespace with an 'X' marker at every
// multiple of the step, and every non-whitespace character found must be 'X'.
TEST(SIMD, SIMD_SUFFIX(SkipWhitespace_EncodedMemoryStream)) {
    const char kWhitespace[] = " \t\r\n";
    for (size_t step = 1; step < 32; ++step) {
        char buffer[1024];
        for (size_t pos = 0; pos < sizeof(buffer); ++pos)
            buffer[pos] = (pos % step == 0) ? 'X' : kWhitespace[pos % 4];

        MemoryStream ms(buffer, sizeof(buffer));
        EncodedInputStream<UTF8<>, MemoryStream> s(ms);

        size_t expectedOffset = 0;
        for (;;) {
            SkipWhitespace(s);
            if (s.Peek() == '\0')   // end of buffer reached
                break;
            //EXPECT_EQ(expectedOffset, s.Tell());
            EXPECT_EQ('X', s.Take());
            expectedOffset += step;
        }
    }
}
struct ScanCopyUnescapedStringHandler : BaseReaderHandler<UTF8<>, ScanCopyUnescapedStringHandler> { struct ScanCopyUnescapedStringHandler : BaseReaderHandler<UTF8<>, ScanCopyUnescapedStringHandler> {
bool String(const char* str, size_t length, bool) { bool String(const char* str, size_t length, bool) {
memcpy(buffer, str, length + 1); memcpy(buffer, str, length + 1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment