Commit f3ad618d authored by fbarchard@google.com

Sum of Square Error ported to AVX2

BUG=187
TEST=compare_unittest
Review URL: https://webrtc-codereview.appspot.com/1099009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@572 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 5f885866
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 571
Version: 572
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 571
#define LIBYUV_VERSION 572
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -71,12 +71,19 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#elif !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
#endif
#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
// Visual C 2012 required for AVX2.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) && _MSC_VER >= 1700
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
int count) {
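For reference, every SumSquareError_* row function declared above implements the same contract as the portable C version that lives elsewhere in the library. A minimal sketch of that reference path, using the same libyuv typedefs (illustrative, not part of this diff):

// Sketch of the portable reference: sum of squared byte differences
// over count bytes, returned as a 32-bit total.
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sse = 0u;
  for (int i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}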
@@ -86,16 +93,24 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
if (TestCpuFlag(kCpuHasNEON)) {
SumSquareError = SumSquareError_NEON;
}
#elif defined(HAS_SUMSQUAREERROR_SSE2)
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
#endif
// 32K values will fit a 32bit int return value from SumSquareError.
// After each block of 32K, accumulate into 64 bit int.
const int kBlockSize = 1 << 15; // 32768;
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
// Note only used for multiples of 32 so count is not checked.
SumSquareError = SumSquareError_AVX2;
}
#endif
// SumSquareError returns values 0 to 65535 for each squared difference.
// Up to 65536 of those can be summed and remain within a uint32.
// After each block of 65536 pixels, accumulate into a uint64.
const int kBlockSize = 65536;
uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
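The new 65536 block size is the largest power of two that keeps the per-block uint32 accumulator from overflowing: even with the comment's conservative 65535 per squared difference, 65535 * 65536 = 4,294,901,760, which is below UINT32_MAX (4,294,967,295); the true worst case per byte is 255 * 255 = 65025. A tiny check of that bound, as a standalone sketch:

// Overflow bound for the 32-bit per-block accumulator (illustrative only).
#include <cassert>
#include <cstdint>
int main() {
  const uint64_t kWorstSquaredDiff = 255ull * 255ull;    // 65025 per byte.
  const uint64_t kBlockSize = 65536;                     // bytes per block.
  assert(kWorstSquaredDiff * kBlockSize <= UINT32_MAX);  // 4,261,478,400 fits.
  return 0;
}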
@@ -105,13 +120,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
src_a += count & ~(kBlockSize - 1);
src_b += count & ~(kBlockSize - 1);
int remainder = count & (kBlockSize - 1) & ~15;
int remainder = count & (kBlockSize - 1) & ~31;
if (remainder) {
sse += SumSquareError(src_a, src_b, remainder);
src_a += remainder;
src_b += remainder;
}
remainder = count & 15;
remainder = count & 31;
if (remainder) {
sse += SumSquareError_C(src_a, src_b, remainder);
}
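The three masks above split count into whole 64 KB blocks, a SIMD-sized tail (now a multiple of 32 bytes to match the AVX2 row width, previously 16 for SSE2), and a final scalar tail handled by SumSquareError_C. An illustration with a made-up count (the values below are hypothetical, not from the diff):

// How a hypothetical count of 100003 bytes is partitioned (illustrative).
#include <cstdio>
int main() {
  const int kBlockSize = 65536;
  int count = 100003;
  int blocks    = count & ~(kBlockSize - 1);       // 65536: bytes in whole blocks.
  int simd_tail = count & (kBlockSize - 1) & ~31;  // 34464: multiple of 32 bytes.
  int c_tail    = count & 31;                      // 3: scalar C path.
  printf("%d = %d + %d + %d\n", count, blocks, simd_tail, c_tail);
  return 0;
}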
@@ -122,20 +137,30 @@ LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
if (stride_a == width && stride_b == width) {
return ComputeSumSquareError(src_a, src_b, width * height);
}
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SumSquareError = SumSquareError_NEON;
}
#elif defined(HAS_SUMSQUAREERROR_SSE2)
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
SumSquareError = SumSquareError_SSE2;
}
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
SumSquareError = SumSquareError_AVX2;
}
#endif
uint64 sse = 0;
for (int h = 0; h < height; ++h) {
sse += SumSquareError(src_a, src_b, width);
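As a usage sketch only (caller code assumed, not part of this change): the plane-level sum of squared error is typically turned into MSE or PSNR by the caller, e.g.:

// Hypothetical caller: PSNR of one 8-bit plane from its sum of squared error.
#include <cmath>
#include "libyuv/compare.h"
double PlanePsnr(const uint8* a, int stride_a,
                 const uint8* b, int stride_b,
                 int width, int height) {
  uint64 sse = libyuv::ComputeSumSquareErrorPlane(a, stride_a,
                                                  b, stride_b, width, height);
  if (sse == 0) return 128.0;  // arbitrary cap chosen here for identical planes.
  double samples = static_cast<double>(width) * height;
  return 10.0 * std::log10(255.0 * 255.0 * samples / static_cast<double>(sse));
}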
@@ -56,6 +56,50 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
}
}
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)
__declspec(naked) __declspec(align(16))
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // for unpack.
sub edx, eax
align 16
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
vpunpckhbw ymm1, ymm1, ymm5
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm4, ymm0, ymm1
vzeroupper // TODO(fbarchard): Remove.
movd eax, xmm4
ret
}
}
#endif // _MSC_VER >= 1700
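A brief note on the kernel above: the vpsubusb/vpsubusb/vpor sequence computes the per-byte absolute difference via saturating subtraction in both directions, vpunpcklbw/vpunpckhbw widen the bytes to 16 bits, and vpmaddwd both squares each difference and adds adjacent pairs into 32-bit lanes that vpaddd accumulates. A hedged scalar equivalent of one 32-byte loop iteration:

// Scalar sketch (illustrative only) of one 32-byte AVX2 loop iteration.
uint32 SumSquareError32Bytes(const uint8* a, const uint8* b) {
  uint32 sum = 0;
  for (int i = 0; i < 32; ++i) {
    // vpsubusb both ways + vpor: max(a-b, 0) | max(b-a, 0) == |a - b| for bytes.
    uint8 d = (a[i] > b[i]) ? (a[i] - b[i]) : (b[i] - a[i]);
    // vpmaddwd squares the widened differences and pairwise-adds into 32 bits.
    sum += static_cast<uint32>(d) * d;
  }
  return sum;
}

The final vpshufd/vpermq/vpaddd sequence then reduces the eight 32-bit lanes of ymm0 to the single value returned in eax.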
#define HAS_HASHDJB2_SSE41
static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static const uvec32 kHashMul0 = {
@@ -140,8 +184,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
ret
}
}
#endif // _M_IX86
#endif // !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"