I422ToRGB24Row_SSSE3 in 1 pass. Internally converts to ARGB then packs down to RGB.

BUG=116 TEST=libyuv unittest Review URL: https://webrtc-codereview.appspot.com/863013 git-svn-id: http://libyuv.googlecode.com/svn/trunk@399 16f28f9a-4ce2-e073-06de-1de4eb20be90

I422ToRGB24Row_SSSE3 in 1 pass. Internally converts to ARGB then packs down to RGB.
BUG=116 TEST=libyuv unittest Review URL: https://webrtc-codereview.appspot.com/863013 git-svn-id: http://libyuv.googlecode.com/svn/trunk@399 16f28f9a-4ce2-e073-06de-1de4eb20be90
827de16b · fbarchard@google.com · 6b5a8eff · 827de16b · 827de16b · 827de16b
Commit 827de16b authored Oct 09, 2012 by fbarchard@google.com
6 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 398
+Version: 399
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -100,6 +100,8 @@ extern "C" {
 #define HAS_RGBATOARGBROW_SSSE3
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORAWROW_SSSE3
 #endif

 // The following are disabled when SSSE3 is available:
@@ -436,6 +438,19 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         uint8* rgba_buf,
                         int width);

+// RGB24/RAW row functions take unaligned destination buffers.
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width);
+
+void I422ToRAWRow_SSSE3(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width);
+
 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
@@ -528,6 +543,19 @@ void I422ToRGBARow_Any_SSSE3(const uint8* y_buf,
                             uint8* rgba_buf,
                             int width);

+// RGB24/RAW row functions take unaligned destination buffers.
+void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+
+void I422ToRAWRow_Any_SSSE3(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width);
+
 void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* argb_buf,
                     int width);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 398
+#define LIBYUV_VERSION 399

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -928,12 +928,9 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
    }
  }
-  }
 #endif

  for (int y = 0; y < height; ++y) {
@@ -982,12 +979,9 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
      I422ToRAWRow = I422ToRAWRow_SSSE3;
    }
  }
-  }
 #endif

  for (int y = 0; y < height; ++y) {

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1023,9 +1023,9 @@ YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
 YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
 #endif
 #ifdef HAS_I422TORGB24ROW_SSSE3
-YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3,                 \
-     I422ToRGB24Row_C, 1)
-YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
+// I422ToRGB24Row_SSSE3 is unaligned.
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1)
 #endif
 #ifdef HAS_I422TORGBAROW_SSSE3
 YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -122,6 +122,16 @@ static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
 };

+// Shuffle table packing 4 ARGB pixels to RGB24: 8 bytes low, last 4 bytes high.
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table packing 4 ARGB pixels to RAW: 8 bytes low, last 4 bytes high.
+static const uvec8 kShuffleMaskARGBToRAW_0 = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+
 __declspec(naked) __declspec(align(16))
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
@@ -1654,6 +1664,100 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
  }
 }

+// 8 pixels per loop; dest may be unaligned (written with movq + movdqu).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb24_buf,
+                          int width) {
+  __asm {
+    push       esi                  // 2 pushes: args start at esp + 8 + 4.
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // rgb24
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf
+    pxor       xmm4, xmm4           // xmm4 = 0
+    movdqa     xmm5, kShuffleMaskARGBToRGB24_0  // Shuffle for pixels 0..3.
+    movdqa     xmm6, kShuffleMaskARGBToRGB24    // Shuffle for pixels 4..7.
+
+    align      16
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into BGRR (expects B, G, R in xmm0, xmm1, xmm2).
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm2           // RR
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
+    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
+    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
+    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
+    movq       qword ptr [edx], xmm0  // First 8 bytes
+    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
+    lea        edx,  [edx + 24]
+    sub        ecx, 8               // 8 pixels done per iteration.
+    jg         convertloop          // Loop while pixels remain.
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels per loop; dest may be unaligned (written with movq + movdqu).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRAWRow_SSSE3(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* raw_buf,
+                        int width) {
+  __asm {
+    push       esi                  // 2 pushes: args start at esp + 8 + 4.
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // raw
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf
+    pxor       xmm4, xmm4           // xmm4 = 0
+    movdqa     xmm5, kShuffleMaskARGBToRAW_0  // Shuffle for pixels 0..3.
+    movdqa     xmm6, kShuffleMaskARGBToRAW    // Shuffle for pixels 4..7.
+
+    align      16
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into BGRR (expects B, G, R in xmm0, xmm1, xmm2).
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm2           // RR
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
+    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
+    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
+    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
+    movq       qword ptr [edx], xmm0  // First 8 bytes
+    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RAW pixels.
+    lea        edx,  [edx + 24]
+    sub        ecx, 8               // 8 pixels done per iteration.
+    jg         convertloop          // Loop while pixels remain.
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 __declspec(naked) __declspec(align(16))