Commit 373cdbdc authored by fbarchard@google.com

reorder stores for FastConvertYUVToABGRRow_SSSE3 and…

Reorder stores for FastConvertYUVToABGRRow_SSSE3 and FastConvertYUVToBGRARow_SSSE3 so the two 16-byte stores are issued back to back. Add ReverseRow_SSE2. Allow CPU detection to be overridden via environment variables for testing: set LIBYUV_DISABLE_SSSE3=1 or set LIBYUV_DISABLE_SSE2=1. Also reorder the stores in rotate's ReverseRowUV for Core 2.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/317010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@107 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8b9759c4
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 106
+Version: 107
 License: BSD
 License File: LICENSE
...
@@ -10,6 +10,7 @@
 #include "libyuv/cpu_id.h"
+#include <stdlib.h>  // for getenv
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
@@ -55,6 +56,15 @@ int InitCpuFlags() {
   cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
               (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
               kCpuInitialized;
+  // environment variable overrides for testing.
+  if (getenv("LIBYUV_DISABLE_SSE2")) {
+    cpu_info_ &= ~kCpuHasSSE2;
+  }
+  // environment variable overrides for testing.
+  if (getenv("LIBYUV_DISABLE_SSSE3")) {
+    cpu_info_ &= ~kCpuHasSSSE3;
+  }
 #elif defined(__ANDROID__) && defined(__ARM_NEON__)
   uint64_t features = android_getCpuFeatures();
   cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
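The override above is read inside InitCpuFlags() and cached in cpu_info_, so the variable has to be in the environment before the first CPU-flag query. A minimal sketch of exercising it from C++ (hypothetical test code, not part of this commit; assumes cpu_id.h declares InitCpuFlags() and TestCpuFlag() in namespace libyuv, and that setenv is available — on Windows _putenv would be used instead):

#include <stdlib.h>
#include "libyuv/cpu_id.h"

int main() {
  // Must happen before the first InitCpuFlags()/TestCpuFlag() call,
  // because the flags are computed once and cached in cpu_info_.
  setenv("LIBYUV_DISABLE_SSSE3", "1", 1);
  libyuv::InitCpuFlags();
  // With the override in place, callers such as I420Mirror and
  // RotatePlane180 fall back to the SSE2 or C row functions.
  return libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) ? 1 : 0;
}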
...
@@ -340,6 +340,18 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
     ReverseRow = ReverseRow_SSSE3;
   } else
+#endif
+#if defined(HAS_REVERSE_ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16) &&
+      IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+      IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+    ReverseRow = ReverseRow_SSE2;
+  } else
 #endif
   {
     ReverseRow = ReverseRow_C;
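The SIMD path above is only taken when every pointer and stride is 16-byte aligned and width is a multiple of 32, presumably so that the half-width U and V rows of the I420 buffers are still a multiple of 16 bytes, matching the 16-byte blocks the movdqa-based row functions process. A rough sketch of what an alignment test like IS_ALIGNED amounts to (the library's actual macro may differ in detail):

#include <stdint.h>

// Hypothetical restatement of IS_ALIGNED: the value (pointer or integer)
// must be a multiple of 'a', where 'a' is a power of two.
#define IS_ALIGNED_SKETCH(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

// e.g. IS_ALIGNED_SKETCH(width, 32) or IS_ALIGNED_SKETCH(src_y, 16)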
...
@@ -867,6 +867,14 @@ void RotatePlane180(const uint8* src, int src_stride,
       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
     ReverseRow = ReverseRow_SSSE3;
   } else
+#endif
+#if defined(HAS_REVERSE_ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ReverseRow = ReverseRow_SSE2;
+  } else
 #endif
   {
     ReverseRow = ReverseRow_C;
@@ -1019,8 +1027,8 @@ __asm {
     lea       eax, [eax - 16]
     pshufb    xmm0, xmm5
     movlpd    qword ptr [edx], xmm0
-    lea       edx, [edx + 8]
     movhpd    qword ptr [edi], xmm0
+    lea       edx, [edx + 8]
     lea       edi, [edi + 8]
     sub       ecx, 8
     ja        convertloop
@@ -1044,8 +1052,8 @@ void ReverseRowUV_SSSE3(const uint8* src,
   "lea -16(%0),%0 \n"
   "pshufb %%xmm5,%%xmm0 \n"
   "movlpd %%xmm0,(%1) \n"
-  "lea 8(%1),%1 \n"
   "movhpd %%xmm0,(%2) \n"
+  "lea 8(%1),%1 \n"
   "lea 8(%2),%2 \n"
   "sub $8,%3 \n"
   "ja 1b \n"
...
@@ -65,6 +65,7 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
 #define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
 #define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
 #define HAS_REVERSE_ROW_SSSE3
+#define HAS_REVERSE_ROW_SSE2
 #endif
 // The following are available on Neon platforms
@@ -102,6 +103,9 @@ void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 #ifdef HAS_REVERSE_ROW_SSSE3
 void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
 #endif
+#ifdef HAS_REVERSE_ROW_SSE2
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
+#endif
 #ifdef HAS_REVERSE_ROW_NEON
 void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
 #endif
...
@@ -17,16 +17,22 @@ namespace libyuv {
 extern "C" {
 #endif
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
-vec8 kARGBToU = {
+CONST vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
-uvec8 kARGBToV = {
+CONST uvec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
 };
-uvec8 kAddUV128 = {
+CONST uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
@@ -35,31 +41,31 @@ uvec8 kAddUV128 = {
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Constant multiplication table for converting ARGB to I400.
-vec8 kARGBToY = {
+CONST vec8 kARGBToY = {
   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
-uvec8 kAddY16 = {
+CONST uvec8 kAddY16 = {
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 // Shuffle table for converting BG24 to ARGB.
-uvec8 kShuffleMaskBG24ToARGB = {
+CONST uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 // Shuffle table for converting RAW to ARGB.
-uvec8 kShuffleMaskRAWToARGB = {
+CONST uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 // Shuffle table for converting ABGR to ARGB.
-uvec8 kShuffleMaskABGRToARGB = {
+CONST uvec8 kShuffleMaskABGRToARGB = {
   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 // Shuffle table for converting BGRA to ARGB.
-uvec8 kShuffleMaskBGRAToARGB = {
+CONST uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
@@ -352,7 +358,7 @@ struct {
   vec16 kUVBiasR;
   vec16 kYSub16;
   vec16 kYToRgb;
-} SIMD_ALIGNED(kYuvConstants) = {
+} CONST SIMD_ALIGNED(kYuvConstants) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
   { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
@@ -445,8 +451,8 @@ void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,  // rdi
   "punpcklbw %%xmm2,%%xmm5 \n"
   "movdqa %%xmm5,%%xmm0 \n"
   "punpcklwd %%xmm1,%%xmm5 \n"
-  "movdqa %%xmm5,(%3) \n"
   "punpckhwd %%xmm1,%%xmm0 \n"
+  "movdqa %%xmm5,(%3) \n"
   "movdqa %%xmm0,0x10(%3) \n"
   "lea 0x20(%3),%3 \n"
   "sub $0x8,%4 \n"
@@ -480,8 +486,8 @@ void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,  // rdi
   "punpcklbw %%xmm5,%%xmm0 \n"
   "movdqa %%xmm2,%%xmm1 \n"
   "punpcklwd %%xmm0,%%xmm2 \n"
-  "movdqa %%xmm2,(%3) \n"
   "punpckhwd %%xmm0,%%xmm1 \n"
+  "movdqa %%xmm2,(%3) \n"
   "movdqa %%xmm1,0x10(%3) \n"
   "lea 0x20(%3),%3 \n"
   "sub $0x8,%4 \n"
@@ -640,11 +646,8 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
 #ifdef HAS_REVERSE_ROW_SSSE3
-// TODO(fbarchard): define CONST macro that is static const for linux, but
-// does nothing for gcc on OSX (which has an internal compiler fault)
 // Shuffle table for reversing the bytes.
-uvec8 kShuffleReverse = {
+CONST uvec8 kShuffleReverse = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
@@ -653,14 +656,14 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
   asm volatile (
   "movdqa %3,%%xmm5 \n"
   "lea -0x10(%0,%2,1),%0 \n"
 "1: \n"
   "movdqa (%0),%%xmm0 \n"
   "lea -0x10(%0),%0 \n"
   "pshufb %%xmm5,%%xmm0 \n"
   "movdqa %%xmm0,(%1) \n"
   "lea 0x10(%1),%1 \n"
   "sub $0x10,%2 \n"
   "ja 1b \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(temp_width)  // %2
@@ -673,6 +676,38 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
 }
 #endif
+#ifdef HAS_REVERSE_ROW_SSE2
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = static_cast<intptr_t>(width);
+  asm volatile (
+  "lea -0x10(%0,%2,1),%0 \n"
+"1: \n"
+  "movdqa (%0),%%xmm0 \n"
+  "lea -0x10(%0),%0 \n"
+  "movdqa %%xmm0,%%xmm1 \n"
+  "psllw $0x8,%%xmm0 \n"
+  "psrlw $0x8,%%xmm1 \n"
+  "por %%xmm1,%%xmm0 \n"
+  "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+  "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+  "pshufd $0x4e,%%xmm0,%%xmm0 \n"
+  "movdqa %%xmm0,(%1) \n"
+  "lea 0x10(%1),%1 \n"
+  "sub $0x10,%2 \n"
+  "ja 1b \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(temp_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
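The SSE2 path added above reverses 16 bytes per iteration without pshufb: psllw/psrlw/por swap the two bytes inside each 16-bit word, pshuflw/pshufhw with imm 0x1b reverse the four words in each 64-bit half, and pshufd with imm 0x4e swaps the two halves. A scalar model of one iteration, showing that the three stages compose to a full byte reversal (illustrative helper only, not part of libyuv):

#include <stdint.h>

static void Reverse16Bytes(const uint8_t src[16], uint8_t dst[16]) {
  uint8_t stage1[16], stage2[16];
  // Stage 1 (psllw/psrlw/por): swap the two bytes within each 16-bit word.
  for (int i = 0; i < 16; i += 2) {
    stage1[i] = src[i + 1];
    stage1[i + 1] = src[i];
  }
  // Stage 2 (pshuflw/pshufhw, imm 0x1b): reverse the four words in each half.
  for (int half = 0; half < 16; half += 8) {
    for (int w = 0; w < 4; ++w) {
      stage2[half + 2 * w] = stage1[half + 2 * (3 - w)];
      stage2[half + 2 * w + 1] = stage1[half + 2 * (3 - w) + 1];
    }
  }
  // Stage 3 (pshufd, imm 0x4e): swap the two 64-bit halves.
  for (int i = 0; i < 8; ++i) {
    dst[i] = stage2[i + 8];
    dst[i + 8] = stage2[i];
  }
  // Net effect: dst[i] == src[15 - i] for every i.
}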
...
@@ -654,8 +654,8 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
     punpcklbw  xmm5, xmm2          // AR
     movdqa     xmm0, xmm5
     punpcklwd  xmm5, xmm1          // BGRA first 4 pixels
-    movdqa     [edx], xmm5
     punpckhwd  xmm0, xmm1          // BGRA next 4 pixels
+    movdqa     [edx], xmm5
     movdqa     [edx + 16], xmm0
     lea        edx, [edx + 32]
@@ -694,8 +694,8 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
     punpcklbw  xmm0, xmm5          // BA
     movdqa     xmm1, xmm2
     punpcklwd  xmm2, xmm0          // RGBA first 4 pixels
-    movdqa     [edx], xmm2
     punpckhwd  xmm1, xmm0          // RGBA next 4 pixels
+    movdqa     [edx], xmm2
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
@@ -794,7 +794,7 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    movq       xmm0, [eax]
+    movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm0          // Y.Y
     psubusw    xmm0, xmm3
@@ -849,6 +849,33 @@ __asm {
 }
 #endif
+#ifdef HAS_REVERSE_ROW_SSE2
+__declspec(naked)
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
+__asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    lea        eax, [eax + ecx - 16]
+ convertloop:
+    movdqa     xmm0, [eax]
+    lea        eax, [eax - 16]
+    movdqa     xmm1, xmm0       // swap bytes
+    psllw      xmm0, 8
+    psrlw      xmm1, 8
+    por        xmm0, xmm1
+    pshuflw    xmm0, xmm0, 0x1b // swap words
+    pshufhw    xmm0, xmm0, 0x1b
+    pshufd     xmm0, xmm0, 0x4e
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         convertloop
+    ret
+  }
+}
+#endif
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
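Since the commit ships with TEST=none, here is a throwaway check of the new ReverseRow_SSE2 (hypothetical code, not part of this commit). It assumes the function is linked in with C linkage as declared in row.h, that the machine supports SSE2, and that both buffers are 16-byte aligned with width a multiple of 16, as the movdqa loads and stores require:

#include <stdint.h>
#include <stdio.h>

typedef uint8_t uint8;  // stand-in for libyuv's uint8 typedef

extern "C" void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);

int main() {
  // alignas needs C++11; libyuv itself uses its SIMD_ALIGNED macro instead.
  alignas(16) static uint8 src[64];
  alignas(16) static uint8 dst[64];
  for (int i = 0; i < 64; ++i) src[i] = static_cast<uint8>(i);
  ReverseRow_SSE2(src, dst, 64);
  for (int i = 0; i < 64; ++i) {
    if (dst[i] != src[63 - i]) {
      printf("mismatch at %d\n", i);
      return 1;
    }
  }
  printf("ReverseRow_SSE2 ok\n");
  return 0;
}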