Commit f3ad618d authored by fbarchard@google.com

Sum of Square Error ported to AVX2

BUG=187
TEST=compare_unittest
Review URL: https://webrtc-codereview.appspot.com/1099009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@572 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 5f885866
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 571
Version: 572
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 571
#define LIBYUV_VERSION 572
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -71,12 +71,19 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#elif !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
#endif
#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
// Visual C 2012 required for AVX2.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) && _MSC_VER >= 1700
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
int count) {
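For reference, every SumSquareError_* row function declared above implements the same contract as the portable C version that lives elsewhere in the library. A minimal sketch of that reference path, using the same libyuv typedefs (illustrative, not part of this diff):

// Sketch of the portable reference: sum of squared byte differences
// over count bytes, returned as a 32-bit total.
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sse = 0u;
  for (int i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}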
@@ -86,16 +93,24 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
if (TestCpuFlag(kCpuHasNEON)) {
SumSquareError = SumSquareError_NEON;
}
#elif defined(HAS_SUMSQUAREERROR_SSE2)
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
#endif
// 32K values will fit a 32bit int return value from SumSquareError.
// After each block of 32K, accumulate into 64 bit int.
const int kBlockSize = 1 << 15; // 32768;
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
// Note only used for multiples of 32 so count is not checked.
SumSquareError = SumSquareError_AVX2;
}
#endif
// SumSquareError returns values 0 to 65535 for each squared difference.
// Up to 65536 of those can be summed and remain within a uint32.
// After each block of 65536 pixels, accumulate into a uint64.
const int kBlockSize = 65536;
uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
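The new 65536 block size is the largest power of two that keeps the per-block uint32 accumulator from overflowing: even with the comment's conservative 65535 per squared difference, 65535 * 65536 = 4,294,901,760, which is below UINT32_MAX (4,294,967,295); the true worst case per byte is 255 * 255 = 65025. A tiny check of that bound, as a standalone sketch:

// Overflow bound for the 32-bit per-block accumulator (illustrative only).
#include <cassert>
#include <cstdint>
int main() {
  const uint64_t kWorstSquaredDiff = 255ull * 255ull;    // 65025 per byte.
  const uint64_t kBlockSize = 65536;                     // bytes per block.
  assert(kWorstSquaredDiff * kBlockSize <= UINT32_MAX);  // 4,261,478,400 fits.
  return 0;
}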
@@ -105,13 +120,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
src_a += count & ~(kBlockSize - 1);
src_b += count & ~(kBlockSize - 1);
int remainder = count & (kBlockSize - 1) & ~15;
int remainder = count & (kBlockSize - 1) & ~31;
if (remainder) {
sse += SumSquareError(src_a, src_b, remainder);
src_a += remainder;
src_b += remainder;
}
remainder = count & 15;
remainder = count & 31;
if (remainder) {
sse += SumSquareError_C(src_a, src_b, remainder);
}
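The three masks above split count into whole 64 KB blocks, a SIMD-sized tail (now a multiple of 32 bytes to match the AVX2 row width, previously 16 for SSE2), and a final scalar tail handled by SumSquareError_C. An illustration with a made-up count (the values below are hypothetical, not from the diff):

// How a hypothetical count of 100003 bytes is partitioned (illustrative).
#include <cstdio>
int main() {
  const int kBlockSize = 65536;
  int count = 100003;
  int blocks    = count & ~(kBlockSize - 1);       // 65536: bytes in whole blocks.
  int simd_tail = count & (kBlockSize - 1) & ~31;  // 34464: multiple of 32 bytes.
  int c_tail    = count & 31;                      // 3: scalar C path.
  printf("%d = %d + %d + %d\n", count, blocks, simd_tail, c_tail);
  return 0;
}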
@@ -122,20 +137,30 @@ LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
if (stride_a == width && stride_b == width) {
return ComputeSumSquareError(src_a, src_b, width * height);
}
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SumSquareError = SumSquareError_NEON;
}
#elif defined(HAS_SUMSQUAREERROR_SSE2)
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
SumSquareError = SumSquareError_SSE2;
}
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
SumSquareError = SumSquareError_AVX2;
}
#endif
uint64 sse = 0;
for (int h = 0; h < height; ++h) {
sse += SumSquareError(src_a, src_b, width);
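As a usage sketch only (caller code assumed, not part of this change): the plane-level sum of squared error is typically turned into MSE or PSNR by the caller, e.g.:

// Hypothetical caller: PSNR of one 8-bit plane from its sum of squared error.
#include <cmath>
#include "libyuv/compare.h"
double PlanePsnr(const uint8* a, int stride_a,
                 const uint8* b, int stride_b,
                 int width, int height) {
  uint64 sse = libyuv::ComputeSumSquareErrorPlane(a, stride_a,
                                                  b, stride_b, width, height);
  if (sse == 0) return 128.0;  // arbitrary cap chosen here for identical planes.
  double samples = static_cast<double>(width) * height;
  return 10.0 * std::log10(255.0 * 255.0 * samples / static_cast<double>(sse));
}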
@@ -56,6 +56,50 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
}
}
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)
__declspec(naked) __declspec(align(16))
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // for unpack.
sub edx, eax
align 16
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
vpunpckhbw ymm1, ymm1, ymm5
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm4, ymm0, ymm1
vzeroupper // TODO(fbarchard): Remove.
movd eax, xmm4
ret
}
}
#endif // _MSC_VER >= 1700
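A brief note on the kernel above: the vpsubusb/vpsubusb/vpor sequence computes the per-byte absolute difference via saturating subtraction in both directions, vpunpcklbw/vpunpckhbw widen the bytes to 16 bits, and vpmaddwd both squares each difference and adds adjacent pairs into 32-bit lanes that vpaddd accumulates. A hedged scalar equivalent of one 32-byte loop iteration:

// Scalar sketch (illustrative only) of one 32-byte AVX2 loop iteration.
uint32 SumSquareError32Bytes(const uint8* a, const uint8* b) {
  uint32 sum = 0;
  for (int i = 0; i < 32; ++i) {
    // vpsubusb both ways + vpor: max(a-b, 0) | max(b-a, 0) == |a - b| for bytes.
    uint8 d = (a[i] > b[i]) ? (a[i] - b[i]) : (b[i] - a[i]);
    // vpmaddwd squares the widened differences and pairwise-adds into 32 bits.
    sum += static_cast<uint32>(d) * d;
  }
  return sum;
}

The final vpshufd/vpermq/vpaddd sequence then reduces the eight 32-bit lanes of ymm0 to the single value returned in eax.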
#define HAS_HASHDJB2_SSE41
static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static const uvec32 kHashMul0 = {
@@ -140,8 +184,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
ret
}
}
#endif // _M_IX86
#endif // !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"