ARGBMirror: use the SSE2 pshufd instruction instead of the SSSE3 pshufb instruction.

BUG=269 TESTED=local benchmark for ARGBMirror R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/32509004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1176 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGBMirror: use the SSE2 pshufd instruction instead of the SSSE3 pshufb instruction.
BUG=269 TESTED=local benchmark for ARGBMirror R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/32509004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1176 16f28f9a-4ce2-e073-06de-1de4eb20be90
ef67597b · fbarchard@google.com · 91f240c5 · ef67597b · ef67597b · ef67597b
Commit ef67597b authored Nov 21, 2014 by fbarchard@google.com
8 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1175
+Version: 1176
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -68,7 +68,7 @@ extern "C" {
 #define HAS_ARGBCOPYYTOALPHAROW_SSE2
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
 #define HAS_ARGBMULTIPLYROW_SSE2
 #define HAS_ARGBPOLYNOMIALROW_SSE2
 #define HAS_ARGBQUANTIZEROW_SSE2
@@ -836,11 +836,11 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                   int width);
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
 void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1175
+#define LIBYUV_VERSION 1176
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -502,11 +502,11 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
    }
  }
 #endif
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
+#if defined(HAS_ARGBMIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBMirrorRow = ARGBMirrorRow_Any_SSSE3;
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
    if (IS_ALIGNED(width, 4)) {
-      ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
    }
  }
 #endif

--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -109,11 +109,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
    }
  }
 #endif
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
+#if defined(HAS_ARGBMIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBMirrorRow = ARGBMirrorRow_Any_SSSE3;
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
    if (IS_ALIGNED(width, 4)) {
-      ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
    }
  }
 #endif

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -646,9 +646,6 @@ MANY(MirrorRow_Any_NEON, MirrorRow_NEON, MirrorRow_C, 1, 15)
 #ifdef HAS_ARGBMIRRORROW_AVX2
 MANY(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, ARGBMirrorRow_C, 4, 7)
 #endif
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-MANY(ARGBMirrorRow_Any_SSSE3, ARGBMirrorRow_SSSE3, ARGBMirrorRow_C, 4, 3)
-#endif
 #ifdef HAS_ARGBMIRRORROW_SSE2
 MANY(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, ARGBMirrorRow_C, 4, 3)
 #endif

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2306,21 +2306,16 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_MIRRORROW_UV_SSSE3
-#ifdef HAS_ARGBMIRRORROW_SSSE3
+#ifdef HAS_ARGBMIRRORROW_SSE2
-// Shuffle table for reversing the bytes.
-static uvec8 kARGBShuffleMirror = {
-  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
-    "movdqa    %3,%%xmm5                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
@@ -2332,11 +2327,11 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  : "m"(kARGBShuffleMirror)  // %3
  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm5"
+    , "xmm0"
 #endif
  );
 }
-#endif  // HAS_ARGBMIRRORROW_SSSE3
+#endif  // HAS_ARGBMIRRORROW_SSE2
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
@@ -2351,9 +2346,9 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  "1:                                          \n"
    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1           \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                        \n"
+    "sub        $0x20,%2                       \n"
-    "jg         1b                              \n"
+    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2507,26 +2507,20 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_MIRRORROW_UV_SSSE3
-#ifdef HAS_ARGBMIRRORROW_SSSE3
+#ifdef HAS_ARGBMIRRORROW_SSE2
-// Shuffle table for reversing the bytes.
-static const uvec8 kARGBShuffleMirror = {
-  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
 __declspec(naked) __declspec(align(16))
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
-    movdqa    xmm5, kARGBShuffleMirror
    align      4
 convertloop:
    movdqu    xmm0, [eax]
    lea       eax, [eax - 16]
-    pshufb    xmm0, xmm5
+    pshufd    xmm0, xmm0, 0x1b
    movdqu    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 4
@@ -2534,7 +2528,7 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
    ret
  }
 }
-#endif  // HAS_ARGBMIRRORROW_SSSE3
+#endif  // HAS_ARGBMIRRORROW_SSE2
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.