Commit 408e5743 authored by fbarchard@google.com

Use vmovd to avoid switch to sse mode

BUG=none
TEST=c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=*Psnr*
Review URL: https://webrtc-codereview.appspot.com/1097013

git-svn-id: http://libyuv.googlecode.com/svn/trunk@573 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f3ad618d
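The "switch to sse mode" in the title refers to the AVX-SSE transition penalty: once a 256-bit AVX instruction has executed, the upper halves of the ymm registers hold live state, and running a legacy-SSE-encoded instruction such as movd in that state makes Haswell-class CPUs save and later restore that upper state. The VEX-encoded vmovd avoids the transition, so the per-kernel vzeroupper can be dropped and a single vzeroupper issued by the caller before any SSE2 fallback runs. The sketch below is illustrative only (not part of the commit), written in the style of the MSVC x86 inline assembly used in compare_win.cc:

// Illustrative only: a 256-bit AVX load followed by a VEX-encoded scalar read.
// src must point to at least 32 readable bytes.  Every instruction here is
// VEX-encoded, so no AVX<->SSE transition occurs; the caller is expected to
// run vzeroupper before executing legacy SSE code.
__declspec(naked)
unsigned int ReadLowDword_AVX(const unsigned char* src) {
  __asm {
    mov        eax, [esp + 4]   // src
    vmovdqu    ymm0, [eax]      // 256-bit load: upper ymm state is now live
    vmovd      eax, xmm0        // VEX move; a legacy "movd eax, xmm0" here
                                // would pay the transition penalty
    ret
  }
}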
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 572
+Version: 573
 License: BSD
 License File: LICENSE
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 572
+#define LIBYUV_VERSION 573
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -102,7 +102,9 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_AVX2)
+  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2)) {
+    clear = true;
     // Note only used for multiples of 32 so count is not checked.
     SumSquareError = SumSquareError_AVX2;
   }
@@ -130,6 +132,12 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   if (remainder) {
     sse += SumSquareError_C(src_a, src_b, remainder);
   }
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return sse;
 }
@@ -157,7 +165,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_AVX2)
+  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    clear = true;
     SumSquareError = SumSquareError_AVX2;
   }
 #endif
@@ -168,6 +178,11 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
     src_b += stride_b;
   }
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return sse;
 }
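Both ComputeSumSquareError paths above follow the same shape: record whether the AVX2 kernel was selected, and if it was, issue one vzeroupper after the loops, before returning to code that may still use legacy SSE encodings. Below is a condensed sketch of that shape using the _mm256_zeroupper() intrinsic instead of MSVC's __asm vzeroupper; HasAVX2 and the single kernel call are illustrative stand-ins (the real code accumulates block by block into a 64-bit total), and the file must be compiled with AVX support where the compiler requires it:

#include <immintrin.h>
#include <stdint.h>

// Stand-ins for the C and AVX2 kernels selected by the real dispatch code.
uint32_t SumSquareError_C(const uint8_t* a, const uint8_t* b, int count);
uint32_t SumSquareError_AVX2(const uint8_t* a, const uint8_t* b, int count);
bool HasAVX2();  // e.g. a cpuid check like TestCpuFlag(kCpuHasAVX2)

uint64_t ComputeSSE(const uint8_t* a, const uint8_t* b, int count) {
  uint32_t (*kernel)(const uint8_t*, const uint8_t*, int) = SumSquareError_C;
  bool clear = false;                  // did a ymm-dirtying AVX2 path run?
  if (HasAVX2() && (count % 32) == 0) {
    kernel = SumSquareError_AVX2;
    clear = true;
  }
  uint64_t sse = kernel(a, b, count);
  if (clear) {
    _mm256_zeroupper();                // one vzeroupper before any SSE code runs
  }
  return sse;
}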
@@ -47,9 +47,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     paddd      xmm0, xmm2
     jg         wloop
-    pshufd     xmm1, xmm0, 0EEh
+    pshufd     xmm1, xmm0, 0xee
     paddd      xmm0, xmm1
-    pshufd     xmm1, xmm0, 01h
+    pshufd     xmm1, xmm0, 0x01
     paddd      xmm0, xmm1
     movd       eax, xmm0
     ret
......@@ -67,7 +67,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // for unpack.
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
align 16
@@ -92,9 +92,8 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
     vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
     vpaddd     ymm0, ymm0, ymm1
     vpermq     ymm1, ymm0, 0x02  // high + low lane.
-    vpaddd     ymm4, ymm0, ymm1
-    vzeroupper  // TODO(fbarchard): Remove.
-    movd       eax, xmm4
+    vpaddd     ymm0, ymm0, ymm1
+    vmovd      eax, xmm0
     ret
   }
 }
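The epilogue above folds the eight dword partial sums in ymm0 into one scalar and, after this change, reads it out with the VEX-encoded vmovd, which removes the need for both the ymm4 temporary and the in-function vzeroupper. The same reduction expressed with intrinsics, as a sketch rather than libyuv code; it combines the two 128-bit lanes with an extract instead of vpermq but produces the same sum:

#include <immintrin.h>
#include <stdint.h>

// Sum the eight 32-bit partial sums held in a ymm register.
static inline uint32_t HorizontalSumU32_AVX2(__m256i v) {
  __m128i lo = _mm256_castsi256_si128(v);            // dwords 0..3
  __m128i hi = _mm256_extracti128_si256(v, 1);       // dwords 4..7
  __m128i s  = _mm_add_epi32(lo, hi);                // high lane + low lane
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0xee));  // fold dwords 3,2 onto 1,0
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0x01));  // fold dword 1 onto 0
  return (uint32_t)_mm_cvtsi128_si32(s);             // with AVX codegen this is vmovd eax, xmm
}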
@@ -173,14 +172,14 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     sub        ecx, 16
     paddd      xmm1, xmm3
-    pshufd     xmm2, xmm1, 14    // upper 2 dwords
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
     paddd      xmm1, xmm2
-    pshufd     xmm2, xmm1, 1
+    pshufd     xmm2, xmm1, 0x01
     paddd      xmm1, xmm2
     paddd      xmm0, xmm1
     jg         wloop
-    movd       eax, xmm0  // return hash
+    movd       eax, xmm0  // return hash
     ret
   }
 }