Commit 1dafd444 authored by fbarchard@google.com

MergeUV backport to SSE2

BUG=135
TESTED=unittest I420ToNV12
Review URL: https://webrtc-codereview.appspot.com/943006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@447 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6bb9f53f
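For reference, MergeUV is the inverse of SplitUV: it interleaves a row of planar U samples and a row of planar V samples into the packed UV layout that NV12 expects. Below is a minimal scalar sketch of the operation, written against the same signature as libyuv's reference MergeUV_C; this standalone version is for illustration only, not the library's code.

    #include <stdint.h>

    // Interleave 'width' U/V pairs: dst_uv = U0 V0 U1 V1 ...
    static void MergeUV_Reference(const uint8_t* src_u, const uint8_t* src_v,
                                  uint8_t* dst_uv, int width) {
      for (int x = 0; x < width; ++x) {
        dst_uv[0] = src_u[x];  // U sample for this chroma position
        dst_uv[1] = src_v[x];  // V sample for this chroma position
        dst_uv += 2;
      }
    }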
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 446
+Version: 447
 License: BSD
 License File: LICENSE
...
@@ -86,6 +86,7 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I422TOUYVYROW_SSE2
+#define HAS_MERGEUV_SSE2
 // Effects
 #define HAS_ARGBAFFINEROW_SSE2
@@ -120,6 +121,7 @@ extern "C" {
 #define HAS_UYVYTOYROW_AVX2
 #define HAS_YUY2TOYROW_MMX
 #define HAS_UYVYTOYROW_MMX
+#define HAS_MERGEUV_SSE2
 #endif
 // The following are disabled when SSSE3 is available:
@@ -311,6 +313,8 @@ void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                int width);
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
 void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width);
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 446
+#define LIBYUV_VERSION 447
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -521,7 +521,14 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width) = MergeUV_C;
-#if defined(HAS_SPLITUV_NEON)
+#if defined(HAS_MERGEUV_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
+      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+      IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+    MergeUV = MergeUV_SSE2;
+  }
+#elif defined(HAS_MERGEUV_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
     MergeUV = MergeUV_NEON;
   }
@@ -529,7 +536,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   int halfheight = (height + 1) >> 1;
   for (int y = 0; y < halfheight; ++y) {
-    // Copy a row of UV.
+    // Merge a row of U and V into a row of UV.
     MergeUV(src_u, src_v, dst_uv, halfwidth);
     src_u += src_stride_u;
     src_v += src_stride_v;
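The SSE2 row function uses aligned movdqa loads and stores and handles 16 UV pairs per iteration, so the dispatch above only selects it when halfwidth is a multiple of 16 and every pointer and stride is 16-byte aligned; any other input falls back to MergeUV_C. The IS_ALIGNED macro comes from libyuv's headers; the definition below is an assumed equivalent, shown for illustration:

    #include <stdint.h>

    // Assumed equivalent of libyuv's IS_ALIGNED (basic_types.h): true when
    // 'p' (a pointer or integer) is a multiple of the power-of-two 'a'.
    #define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

    // Example: width 1280 gives halfwidth 640 (a multiple of 16), so aligned
    // buffers take the SSE2 path; width 1282 gives halfwidth 641 and stays
    // on MergeUV_C.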
...
@@ -2547,6 +2547,37 @@ void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_SPLITUV_SSE2
+#ifdef HAS_MERGEUV_SSE2
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  asm volatile (
+    "sub        %0,%1                        \n"  // src_v becomes an offset
+    ".p2align  4                             \n"
+  "1:                                        \n"
+    "movdqa    (%0),%%xmm0                   \n"  // read 16 U's
+    "movdqa    (%0,%1,1),%%xmm1              \n"  // and 16 V's
+    "lea       0x10(%0),%0                   \n"
+    "movdqa    %%xmm0,%%xmm2                 \n"
+    "punpcklbw %%xmm1,%%xmm0                 \n"  // first 8 UV pairs
+    "punpckhbw %%xmm1,%%xmm2                 \n"  // next 8 UV pairs
+    "movdqa    %%xmm0,(%2)                   \n"
+    "movdqa    %%xmm2,0x10(%2)               \n"
+    "lea       0x20(%2),%2                   \n"
+    "sub       $0x10,%3                      \n"
+    "jg        1b                            \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+#endif  // HAS_MERGEUV_SSE2
 #ifdef HAS_COPYROW_SSE2
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   asm volatile (
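For readers who prefer intrinsics to AT&T-syntax inline assembly, the MergeUV_SSE2 loop above maps directly onto _mm_unpacklo_epi8/_mm_unpackhi_epi8. This rendering is not part of the commit, just a sketch of the same algorithm under the same contract (16-byte-aligned pointers, width a multiple of 16):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    static void MergeUV_SSE2_Intrinsics(const uint8_t* src_u,
                                        const uint8_t* src_v,
                                        uint8_t* dst_uv, int width) {
      for (int x = 0; x < width; x += 16) {
        __m128i u = _mm_load_si128((const __m128i*)(src_u + x));  // 16 U's
        __m128i v = _mm_load_si128((const __m128i*)(src_v + x));  // 16 V's
        // punpcklbw / punpckhbw: byte-interleave the low and high halves.
        __m128i uv_lo = _mm_unpacklo_epi8(u, v);  // U0V0 .. U7V7
        __m128i uv_hi = _mm_unpackhi_epi8(u, v);  // U8V8 .. U15V15
        _mm_store_si128((__m128i*)(dst_uv + 2 * x), uv_lo);
        _mm_store_si128((__m128i*)(dst_uv + 2 * x + 16), uv_hi);
      }
    }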
...
@@ -2620,6 +2620,38 @@ void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_SPLITUV_SSE2
+#ifdef HAS_MERGEUV_SSE2
+__declspec(naked) __declspec(align(16))
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax              // src_v becomes an offset from src_u
+    align      16
+  convertloop:
+    movdqa     xmm0, [eax]           // read 16 U's
+    movdqa     xmm1, [eax + edx]     // and 16 V's
+    lea        eax, [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1            // first 8 UV pairs
+    punpckhbw  xmm2, xmm1            // next 8 UV pairs
+    movdqa     [edi], xmm0
+    movdqa     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_MERGEUV_SSE2
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
 __declspec(naked) __declspec(align(16))
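The Windows block above mirrors the GCC version instruction for instruction. A quick way to sanity-check either variant, in the spirit of the I420ToNV12 unittest cited in the commit message (this harness is hypothetical, not the project's test), is to compare it against a scalar reference on 16-byte-aligned buffers:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    // Hypothetical harness: MergeUV_SSE2 is declared in the row.h diff above,
    // and MergeUV_Reference is the scalar sketch shown earlier. Width 64 is a
    // multiple of 16 and the arrays are 16-byte aligned, so the SSE2 contract
    // is satisfied.
    void CheckMergeUV(void) {
      _Alignas(16) uint8_t u[64], v[64];
      _Alignas(16) uint8_t got[128], want[128];
      for (int i = 0; i < 64; ++i) {
        u[i] = (uint8_t)i;
        v[i] = (uint8_t)(255 - i);
      }
      MergeUV_Reference(u, v, want, 64);
      MergeUV_SSE2(u, v, got, 64);
      assert(memcmp(got, want, sizeof(got)) == 0);
    }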
...
;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
@@ -17,6 +17,8 @@
 SECTION .text
+; cglobal numeric constants are parameters, gpr regs, mm regs
 ; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 %macro YUY2TOYROW 2-3
@@ -63,8 +65,7 @@ YUY2TOYROW YUY2,u,_Unaligned
 YUY2TOYROW UYVY,a,
 YUY2TOYROW UYVY,u,_Unaligned
-; void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-;                             int pix) {
+; void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 %macro SPLITUV 1-2
 cglobal SplitUV%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
@@ -103,3 +104,36 @@ INIT_YMM AVX2
 SPLITUV a,
 SPLITUV u,_Unaligned
+; void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+;                   int width);
+%macro MERGEUV 1-2
+cglobal MergeUV%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+    sub    src_vq, src_uq             ; src_v becomes an offset from src_u
+    ALIGN  16
+.convertloop:
+    mov%1  m0, [src_uq]               ; read 16 U's
+    mov%1  m1, [src_uq + src_vq]      ; and 16 V's
+    lea    src_uq, [src_uq + mmsize]
+    mova   m2, m0
+    punpcklbw m0, m0, m1              ; first 8 UV pairs
+    punpckhbw m2, m2, m1              ; next 8 UV pairs
+    mov%1  [dst_uvq], m0
+    mov%1  [dst_uvq + mmsize], m2
+    lea    dst_uvq, [dst_uvq + mmsize * 2]
+    sub    pixd, mmsize
+    jg     .convertloop
+    REP_RET
+%endmacro
+INIT_MMX MMX
+MERGEUV a,
+MERGEUV u,_Unaligned
+INIT_XMM SSE2
+MERGEUV a,
+MERGEUV u,_Unaligned
+INIT_YMM AVX2
+MERGEUV a,
+MERGEUV u,_Unaligned