Commit b45c5408 authored by Milo Yip's avatar Milo Yip Committed by GitHub

Merge pull request #932 from JunHe77/master

Added ARM-Neon support for SIMD.SkipWhitespace*
parents d2fce924 2291258b
......@@ -256,7 +256,7 @@ Alternatively, if we don't want to explicitly refer to the root value of `addres
3. What is SIMD? How it is applied in RapidJSON?
[SIMD](http://en.wikipedia.org/wiki/SIMD) instructions can perform parallel computation in modern CPUs. RapidJSON support Intel's SSE2/SSE4.2 to accelerate whitespace skipping. This improves performance of parsing indent formatted JSON. Define `RAPIDJSON_SSE2` or `RAPIDJSON_SSE42` macro to enable this feature. However, running the executable on a machine without such instruction set support will make it crash.
[SIMD](http://en.wikipedia.org/wiki/SIMD) instructions can perform parallel computation in modern CPUs. RapidJSON support Intel's SSE2/SSE4.2 and ARM's Neon to accelerate whitespace/tabspace/carriage-return/line-feed skipping. This improves performance of parsing indent formatted JSON. Define `RAPIDJSON_SSE2`, `RAPIDJSON_SSE42` or `RAPIDJSON_NEON` macro to enable this feature. However, running the executable on a machine without such instruction set support will make it crash.
4. Does it consume a lot of memory?
......
......@@ -257,7 +257,7 @@
3. 什是是 SIMD?它如何用于 RapidJSON?
[SIMD](http://en.wikipedia.org/wiki/SIMD) 指令可以在现代 CPU 中执行并行运算。RapidJSON 支持了 Intel 的 SSE2/SSE4.2 去加速跳过空白字符。在解析含缩进的 JSON 时,这能提升性能。只要定义名为 `RAPIDJSON_SSE2``RAPIDJSON_SSE42` 的宏,就能启动这个功能。然而,若在不支持这些指令集的机器上执行这些可执行文件,会导致崩溃。
[SIMD](http://en.wikipedia.org/wiki/SIMD) 指令可以在现代 CPU 中执行并行运算。RapidJSON 支持使用 Intel 的 SSE2/SSE4.2 和 ARM 的 Neon 来加速对空白符、制表符、回车符和换行符的过滤处理。在解析含缩进的 JSON 时,这能提升性能。只要定义名为 `RAPIDJSON_SSE2``RAPIDJSON_SSE42``RAPIDJSON_NEON` 的宏,就能启动这个功能。然而,若在不支持这些指令集的机器上执行这些可执行文件,会导致崩溃。
4. 它会消耗许多内存么?
......
......@@ -183,17 +183,20 @@ void SkipWhitespace(InputStream& s) {
However, this requires 4 comparisons and a few branching for each character. This was found to be a hot spot.
To accelerate this process, SIMD was applied to compare 16 characters with 4 white spaces for each iteration. Currently RapidJSON only supports SSE2 and SSE4.2 instructions for this. And it is only activated for UTF-8 memory streams, including string stream or *in situ* parsing.
To accelerate this process, SIMD was applied to compare 16 characters with 4 white spaces for each iteration. Currently RapidJSON supports SSE2, SSE4.2 and ARM Neon instructions for this. And it is only activated for UTF-8 memory streams, including string stream or *in situ* parsing.
To enable this optimization, need to define `RAPIDJSON_SSE2` or `RAPIDJSON_SSE42` before including `rapidjson.h`. Some compilers can detect the setting, as in `perftest.h`:
To enable this optimization, need to define `RAPIDJSON_SSE2`, `RAPIDJSON_SSE42` or `RAPIDJSON_NEON` before including `rapidjson.h`. Some compilers can detect the setting, as in `perftest.h`:
~~~cpp
// __SSE2__ and __SSE4_2__ are recognized by gcc, clang, and the Intel compiler.
// We use -march=native with gmake to enable -msse2 and -msse4.2, if supported.
// Likewise, __ARM_NEON is used to detect Neon.
#if defined(__SSE4_2__)
# define RAPIDJSON_SSE42
#elif defined(__SSE2__)
# define RAPIDJSON_SSE2
#elif defined(__ARM_NEON)
# define RAPIDJSON_NEON
#endif
~~~
......
......@@ -183,17 +183,20 @@ void SkipWhitespace(InputStream& s) {
但是,这需要对每个字符进行4次比较以及一些分支。这被发现是一个热点。
为了加速这一处理,RapidJSON 使用 SIMD 来在一次迭代中比较16个字符和4个空格。目前 RapidJSON 只支持 SSE2 和 SSE4.2 指令。同时它也只会对 UTF-8 内存流启用,包括字符串流或 *原位* 解析。
为了加速这一处理,RapidJSON 使用 SIMD 来在一次迭代中比较16个字符和4个空格。目前 RapidJSON 支持 SSE2 , SSE4.2 和 ARM Neon 指令。同时它也只会对 UTF-8 内存流启用,包括字符串流或 *原位* 解析。
你可以通过在包含 `rapidjson.h` 之前定义 `RAPIDJSON_SSE2` `RAPIDJSON_SSE42` 来启用这个优化。一些编译器可以检测这个设置,如 `perftest.h`
你可以通过在包含 `rapidjson.h` 之前定义 `RAPIDJSON_SSE2` `RAPIDJSON_SSE42``RAPIDJSON_NEON` 来启用这个优化。一些编译器可以检测这个设置,如 `perftest.h`
~~~cpp
// __SSE2__ 和 __SSE4_2__ 可被 gcc、clang 和 Intel 编译器识别:
// 如果支持的话,我们在 gmake 中使用了 -march=native 来启用 -msse2 和 -msse4.2
// 同样的, __ARM_NEON 被用于识别Neon
#if defined(__SSE4_2__)
# define RAPIDJSON_SSE42
#elif defined(__SSE2__)
# define RAPIDJSON_SSE2
#elif defined(__ARM_NEON)
# define RAPIDJSON_NEON
#endif
~~~
......
......@@ -325,17 +325,17 @@
#endif
///////////////////////////////////////////////////////////////////////////////
// RAPIDJSON_SSE2/RAPIDJSON_SSE42/RAPIDJSON_SIMD
// RAPIDJSON_SSE2/RAPIDJSON_SSE42/RAPIDJSON_NEON/RAPIDJSON_SIMD
/*! \def RAPIDJSON_SIMD
\ingroup RAPIDJSON_CONFIG
\brief Enable SSE2/SSE4.2 optimization.
\brief Enable SSE2/SSE4.2/Neon optimization.
RapidJSON supports optimized implementations for some parsing operations
based on the SSE2 or SSE4.2 SIMD extensions on modern Intel-compatible
processors.
based on the SSE2, SSE4.2 or NEon SIMD extensions on modern Intel
or ARM compatible processors.
To enable these optimizations, two different symbols can be defined;
To enable these optimizations, three different symbols can be defined;
\code
// Enable SSE2 optimization.
#define RAPIDJSON_SSE2
......@@ -344,13 +344,17 @@
#define RAPIDJSON_SSE42
\endcode
\c RAPIDJSON_SSE42 takes precedence, if both are defined.
// Enable ARM Neon optimization.
#define RAPIDJSON_NEON
\endcode
\c RAPIDJSON_SSE42 takes precedence over SSE2, if both are defined.
If any of these symbols is defined, RapidJSON defines the macro
\c RAPIDJSON_SIMD to indicate the availability of the optimized code.
*/
#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) \
|| defined(RAPIDJSON_DOXYGEN_RUNNING)
|| defined(RAPIDJSON_NEON) || defined(RAPIDJSON_DOXYGEN_RUNNING)
#define RAPIDJSON_SIMD
#endif
......
......@@ -33,6 +33,8 @@
#include <nmmintrin.h>
#elif defined(RAPIDJSON_SSE2)
#include <emmintrin.h>
#elif defined(RAPIDJSON_NEON)
#include <arm_neon.h>
#endif
#ifdef _MSC_VER
......@@ -411,7 +413,92 @@ inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
return SkipWhitespace(p, end);
}
#endif // RAPIDJSON_SSE2
#elif defined(RAPIDJSON_NEON)
//! Skip whitespace with ARM Neon instructions, testing 16 8-byte characters at once.
inline const char *SkipWhitespace_SIMD(const char* p) {
// Fast return for single non-whitespace
if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')
++p;
else
return p;
// 16-byte align to the next boundary
const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
while (p != nextAligned)
if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')
++p;
else
return p;
const uint8x16_t w0 = vmovq_n_u8(' ');
const uint8x16_t w1 = vmovq_n_u8('\n');
const uint8x16_t w2 = vmovq_n_u8('\r');
const uint8x16_t w3 = vmovq_n_u8('\t');
for (;; p += 16) {
const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
uint8x16_t x = vceqq_u8(s, w0);
x = vorrq_u8(x, vceqq_u8(s, w1));
x = vorrq_u8(x, vceqq_u8(s, w2));
x = vorrq_u8(x, vceqq_u8(s, w3));
x = vmvnq_u8(x); // Negate
x = vrev64q_u8(x); // Rev in 64
uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract
uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract
if (low == 0) {
if (high != 0) {
int lz =__builtin_clzll(high);;
return p + 8 + (lz >> 3);
}
} else {
int lz = __builtin_clzll(low);;
return p + (lz >> 3);
}
}
}
inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
// Fast return for single non-whitespace
if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
++p;
else
return p;
const uint8x16_t w0 = vmovq_n_u8(' ');
const uint8x16_t w1 = vmovq_n_u8('\n');
const uint8x16_t w2 = vmovq_n_u8('\r');
const uint8x16_t w3 = vmovq_n_u8('\t');
for (; p <= end - 16; p += 16) {
const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
uint8x16_t x = vceqq_u8(s, w0);
x = vorrq_u8(x, vceqq_u8(s, w1));
x = vorrq_u8(x, vceqq_u8(s, w2));
x = vorrq_u8(x, vceqq_u8(s, w3));
x = vmvnq_u8(x); // Negate
x = vrev64q_u8(x); // Rev in 64
uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract
uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract
if (low == 0) {
if (high != 0) {
int lz = __builtin_clzll(high);
return p + 8 + (lz >> 3);
}
} else {
int lz = __builtin_clzll(low);
return p + (lz >> 3);
}
}
return SkipWhitespace(p, end);
}
#endif // RAPIDJSON_NEON
#ifdef RAPIDJSON_SIMD
//! Template function specialization for InsituStringStream
......@@ -1129,7 +1216,180 @@ private:
is.src_ = is.dst_ = p;
}
#endif
#elif defined(RAPIDJSON_NEON)
// StringStream -> StackStream<char>
static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(StringStream& is, StackStream<char>& os) {
const char* p = is.src_;
// Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
while (p != nextAligned)
if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
is.src_ = p;
return;
}
else
os.Put(*p++);
// The rest of string using SIMD
const uint8x16_t s0 = vmovq_n_u8('"');
const uint8x16_t s1 = vmovq_n_u8('\\');
const uint8x16_t s2 = vmovq_n_u8('\b');
const uint8x16_t s3 = vmovq_n_u8(32);
for (;; p += 16) {
const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
uint8x16_t x = vceqq_u8(s, s0);
x = vorrq_u8(x, vceqq_u8(s, s1));
x = vorrq_u8(x, vceqq_u8(s, s2));
x = vorrq_u8(x, vcltq_u8(s, s3));
x = vrev64q_u8(x); // Rev in 64
uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract
uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract
SizeType length = 0;
bool escaped = false;
if (low == 0) {
if (high != 0) {
unsigned lz = (unsigned)__builtin_clzll(high);;
length = 8 + (lz >> 3);
escaped = true;
}
} else {
unsigned lz = (unsigned)__builtin_clzll(low);;
length = lz >> 3;
escaped = true;
}
if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped
if (length != 0) {
char* q = reinterpret_cast<char*>(os.Push(length));
for (size_t i = 0; i < length; i++)
q[i] = p[i];
p += length;
}
break;
}
vst1q_u8(reinterpret_cast<uint8_t *>(os.Push(16)), s);
}
is.src_ = p;
}
// InsituStringStream -> InsituStringStream
static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InsituStringStream& is, InsituStringStream& os) {
RAPIDJSON_ASSERT(&is == &os);
(void)os;
if (is.src_ == is.dst_) {
SkipUnescapedString(is);
return;
}
char* p = is.src_;
char *q = is.dst_;
// Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
while (p != nextAligned)
if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
is.src_ = p;
is.dst_ = q;
return;
}
else
*q++ = *p++;
// The rest of string using SIMD
const uint8x16_t s0 = vmovq_n_u8('"');
const uint8x16_t s1 = vmovq_n_u8('\\');
const uint8x16_t s2 = vmovq_n_u8('\b');
const uint8x16_t s3 = vmovq_n_u8(32);
for (;; p += 16, q += 16) {
const uint8x16_t s = vld1q_u8(reinterpret_cast<uint8_t *>(p));
uint8x16_t x = vceqq_u8(s, s0);
x = vorrq_u8(x, vceqq_u8(s, s1));
x = vorrq_u8(x, vceqq_u8(s, s2));
x = vorrq_u8(x, vcltq_u8(s, s3));
x = vrev64q_u8(x); // Rev in 64
uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract
uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract
SizeType length = 0;
bool escaped = false;
if (low == 0) {
if (high != 0) {
unsigned lz = (unsigned)__builtin_clzll(high);
length = 8 + (lz >> 3);
escaped = true;
}
} else {
unsigned lz = (unsigned)__builtin_clzll(low);
length = lz >> 3;
escaped = true;
}
if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped
for (const char* pend = p + length; p != pend; ) {
*q++ = *p++;
}
break;
}
vst1q_u8(reinterpret_cast<uint8_t *>(q), s);
}
is.src_ = p;
is.dst_ = q;
}
// When read/write pointers are the same for insitu stream, just skip unescaped characters
static RAPIDJSON_FORCEINLINE void SkipUnescapedString(InsituStringStream& is) {
RAPIDJSON_ASSERT(is.src_ == is.dst_);
char* p = is.src_;
// Scan one by one until alignment (unaligned load may cross page boundary and cause crash)
const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
for (; p != nextAligned; p++)
if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) {
is.src_ = is.dst_ = p;
return;
}
// The rest of string using SIMD
const uint8x16_t s0 = vmovq_n_u8('"');
const uint8x16_t s1 = vmovq_n_u8('\\');
const uint8x16_t s2 = vmovq_n_u8('\b');
const uint8x16_t s3 = vmovq_n_u8(32);
for (;; p += 16) {
const uint8x16_t s = vld1q_u8(reinterpret_cast<uint8_t *>(p));
uint8x16_t x = vceqq_u8(s, s0);
x = vorrq_u8(x, vceqq_u8(s, s1));
x = vorrq_u8(x, vceqq_u8(s, s2));
x = vorrq_u8(x, vcltq_u8(s, s3));
x = vrev64q_u8(x); // Rev in 64
uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract
uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract
if (low == 0) {
if (high != 0) {
int lz = __builtin_clzll(high);
p += 8 + (lz >> 3);
break;
}
} else {
int lz = __builtin_clzll(low);
p += lz >> 3;
break;
}
}
is.src_ = is.dst_ = p;
}
#endif // RAPIDJSON_NEON
template<typename InputStream, bool backup, bool pushOnTake>
class NumberStream;
......
......@@ -32,6 +32,8 @@
#include <nmmintrin.h>
#elif defined(RAPIDJSON_SSE2)
#include <emmintrin.h>
#elif defined(RAPIDJSON_NEON)
#include <arm_neon.h>
#endif
#ifdef _MSC_VER
......@@ -619,7 +621,75 @@ inline bool Writer<StringBuffer>::ScanWriteUnescapedString(StringStream& is, siz
is.src_ = p;
return RAPIDJSON_LIKELY(is.Tell() < length);
}
#endif // defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42)
#elif defined(RAPIDJSON_NEON)
template<>
inline bool Writer<StringBuffer>::ScanWriteUnescapedString(StringStream& is, size_t length) {
if (length < 16)
return RAPIDJSON_LIKELY(is.Tell() < length);
if (!RAPIDJSON_LIKELY(is.Tell() < length))
return false;
const char* p = is.src_;
const char* end = is.head_ + length;
const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
const char* endAligned = reinterpret_cast<const char*>(reinterpret_cast<size_t>(end) & static_cast<size_t>(~15));
if (nextAligned > end)
return true;
while (p != nextAligned)
if (*p < 0x20 || *p == '\"' || *p == '\\') {
is.src_ = p;
return RAPIDJSON_LIKELY(is.Tell() < length);
}
else
os_->PutUnsafe(*p++);
// The rest of string using SIMD
const uint8x16_t s0 = vmovq_n_u8('"');
const uint8x16_t s1 = vmovq_n_u8('\\');
const uint8x16_t s2 = vmovq_n_u8('\b');
const uint8x16_t s3 = vmovq_n_u8(32);
for (; p != endAligned; p += 16) {
const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
uint8x16_t x = vceqq_u8(s, s0);
x = vorrq_u8(x, vceqq_u8(s, s1));
x = vorrq_u8(x, vceqq_u8(s, s2));
x = vorrq_u8(x, vcltq_u8(s, s3));
x = vrev64q_u8(x); // Rev in 64
uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract
uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract
SizeType len = 0;
bool escaped = false;
if (low == 0) {
if (high != 0) {
unsigned lz = (unsigned)__builtin_clzll(high);
len = 8 + (lz >> 3);
escaped = true;
}
} else {
unsigned lz = (unsigned)__builtin_clzll(low);
len = lz >> 3;
escaped = true;
}
if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped
char* q = reinterpret_cast<char*>(os_->PushUnsafe(len));
for (size_t i = 0; i < len; i++)
q[i] = p[i];
p += len;
break;
}
vst1q_u8(reinterpret_cast<uint8_t *>(os_->PushUnsafe(16)), s);
}
is.src_ = p;
return RAPIDJSON_LIKELY(is.Tell() < length);
}
#endif // RAPIDJSON_NEON
RAPIDJSON_NAMESPACE_END
......
......@@ -24,10 +24,13 @@
// __SSE2__ and __SSE4_2__ are recognized by gcc, clang, and the Intel compiler.
// We use -march=native with gmake to enable -msse2 and -msse4.2, if supported.
// Likewise, __ARM_NEON is used to detect Neon.
#if defined(__SSE4_2__)
# define RAPIDJSON_SSE42
#elif defined(__SSE2__)
# define RAPIDJSON_SSE2
#elif defined(__ARM_NEON)
# define RAPIDJSON_NEON
#endif
#define RAPIDJSON_HAS_STDSTRING 1
......
......@@ -28,6 +28,8 @@
#define SIMD_SUFFIX(name) name##_SSE2
#elif defined(RAPIDJSON_SSE42)
#define SIMD_SUFFIX(name) name##_SSE42
#elif defined(RAPIDJSON_NEON)
#define SIMD_SUFFIX(name) name##_NEON
#else
#define SIMD_SUFFIX(name) name
#endif
......
......@@ -21,6 +21,8 @@
# define RAPIDJSON_SSE42
#elif defined(__SSE2__)
# define RAPIDJSON_SSE2
#elif defined(__ARM_NEON)
# define RAPIDJSON_NEON
#endif
#define RAPIDJSON_NAMESPACE rapidjson_simd
......@@ -41,6 +43,8 @@ using namespace rapidjson_simd;
#define SIMD_SUFFIX(name) name##_SSE2
#elif defined(RAPIDJSON_SSE42)
#define SIMD_SUFFIX(name) name##_SSE42
#elif defined(RAPIDJSON_NEON)
#define SIMD_SUFFIX(name) name##_NEON
#else
#define SIMD_SUFFIX(name) name
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment