SSSE3 version using pshufb for ARGBAttenuateRow_SSSE3

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/490011
git-svn-id: http://libyuv.googlecode.com/svn/trunk@243 16f28f9a-4ce2-e073-06de-1de4eb20be90

SSSE3 version using pshufb for ARGBAttenuateRow_SSSE3
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/490011
git-svn-id: http://libyuv.googlecode.com/svn/trunk@243 16f28f9a-4ce2-e073-06de-1de4eb20be90
f2c86d01 · fbarchard@google.com · 8ed54222 · f2c86d01 · f2c86d01 · f2c86d01
Commit f2c86d01 authored Apr 18, 2012 by fbarchard@google.com
Showing with 66 additions and 3 deletions

README.chromium README.chromium +1 -1

version.h include/libyuv/version.h +1 -1

planar_functions.cc source/planar_functions.cc +7 -0

row.h source/row.h +6 -0

row_win.cc source/row_win.cc +51 -1

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 242
+Version: 243
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 242
+#define LIBYUV_VERSION 243
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -893,6 +893,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
    ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
  }
 #endif
+#if defined(HAS_ARGBATTENUATE_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+  }
+#endif
  for (int y = 0; y < height; ++y) {
    ARGBAttenuateRow(src_argb, dst_argb, width);

--- a/source/row.h
+++ b/source/row.h
@@ -69,6 +69,11 @@ extern "C" {
 #define HAS_ARGBATTENUATE_SSE2
 #endif
+// The following are available on Windows 32 bit
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_ARGBATTENUATE_SSSE3
+#endif
 // The following are available on Neon platforms
 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_MIRRORROW_NEON
@@ -363,6 +368,7 @@ void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
 #ifdef __cplusplus
 }  // extern "C"

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2334,8 +2334,58 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
    ret
  }
 }
 #endif  // HAS_ARGBATTENUATE_SSE2
+#ifdef HAS_ARGBATTENUATE_SSSE3
+// Shuffle table duplicating alpha: pshufb control for pixels 0 and 1 of a 4 pixel (16 byte) load.
+static const uvec8 kShuffleAlpha0 = {  // source bytes 3 and 7 are the alphas of pixels 0 and 1
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,  // 128 = high bit set: pshufb writes 0 there
+};
+static const uvec8 kShuffleAlpha1 = {  // pshufb control for pixels 2 and 3: spreads alpha bytes 11 and 15
+  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,  // byte 11 -> result bytes 0-5; bytes 6-7 zeroed (128 = high bit set)
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,  // byte 15 -> result bytes 8-13; bytes 14-15 zeroed
+};
+__declspec(naked) __declspec(align(16))  // Multiplies the 3 color bytes of each 4 byte pixel by that pixel's alpha; alpha is copied unchanged.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {  // width is in pixels; 4 are consumed per iteration
+  __asm {  // naked function: no compiler prologue, so arguments are fetched from the stack by hand
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst while only eax advances
+    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000: start from all ones...
+    pslld      xmm3, 24         // ...and keep only the top (alpha) byte of each dword
+    movdqa     xmm4, kShuffleAlpha0  // pshufb control: alphas of pixels 0 and 1
+    movdqa     xmm5, kShuffleAlpha1  // pshufb control: alphas of pixels 2 and 3
+    align      16               // align the loop entry
+ convertloop:
+    movdqa     xmm0, [eax]      // read 4 pixels (aligned load: src must be 16 byte aligned)
+    pshufb     xmm0, xmm4       // isolate first 2 alphas: words become a*0x101, alpha word slot = 0
+    movdqa     xmm1, [eax]      // read 4 pixels
+    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs: each byte c widened to the word c*0x101
+    pmulhuw    xmm0, xmm1       // rgb * a: high 16 bits of (a*0x101) * (c*0x101); alpha slot stays 0
+    movdqa     xmm1, [eax]      // read 4 pixels
+    pshufb     xmm1, xmm5       // isolate next 2 alphas
+    movdqa     xmm2, [eax]      // read 4 pixels
+    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
+    pmulhuw    xmm1, xmm2       // rgb * a
+    movdqa     xmm2, [eax]      // mask original alpha
+    pand       xmm2, xmm3       // xmm2 = alpha bytes only, color bytes cleared
+    psrlw      xmm0, 8          // keep the high byte of each product: roughly c * a / 255
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1       // narrow the 8 + 8 words back to 16 bytes (4 pixels)
+    por        xmm0, xmm2       // copy original alpha
+    sub        ecx, 4           // 4 pixels done; this sets the flags tested by jg below
+    movdqa     [eax + edx], xmm0  // aligned store to dst (movdqa does not modify flags)
+    lea        eax, [eax + 16]  // advance src by 16 bytes; lea leaves the flags untouched
+    jg         convertloop      // loop while pixels remain; the body always runs at least once
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATE_SSSE3
 #endif  // _M_IX86
 #ifdef __cplusplus