Commit e0d8648b authored by fbarchard@google.com

MergeUV aligned and SplitUV cpu detect combined with width check.

BUG=none
TEST=libyuvTest.I420ToNV12_Any
Review URL: https://webrtc-codereview.appspot.com/937005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@451 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 818b7102
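In outline, the change gives each SIMD path a three-tier selection — an "Any" wrapper for arbitrary widths, an "Unaligned" kernel for exact multiples of the vector width, and the fully aligned kernel — and folds the CPU-feature test and the minimum-width test into a single condition. A minimal C sketch of that shape, lifted from the I420ToNV12 MergeUV selection in the diff below (SSE2 case only; the preprocessor guard and the AVX2/NEON branches are omitted here):

  // Sketch of the dispatch pattern this commit applies; names are the
  // libyuv ones, the surrounding #if guards are left out for brevity.
  void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                  int width) = MergeUV_C;        // scalar fallback
  // CPU detect combined with the minimum-width check in one condition:
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUV = MergeUV_Any_SSE2;                  // any width >= 16
    if (IS_ALIGNED(halfwidth, 16)) {
      MergeUV = MergeUV_Unaligned_SSE2;          // multiple of 16 pixels
      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
        MergeUV = MergeUV_SSE2;                  // 16-byte aligned fast path
      }
    }
  }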
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 449
+Version: 451
 License: BSD
 License File: LICENSE
...
@@ -121,7 +121,8 @@ extern "C" {
 #define HAS_UYVYTOYROW_AVX2
 #define HAS_YUY2TOYROW_MMX
 #define HAS_UYVYTOYROW_MMX
-#define HAS_MERGEUV_SSE2
+#define HAS_MERGEUV_AVX2
+#define HAS_MERGEUV_MMX
 #endif
 // The following are disabled when SSSE3 is available:
@@ -324,8 +325,22 @@ void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                int width);
 void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width);
+void MergeUV_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
 void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width);
+void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width);
+void MergeUV_Unaligned_AVX2(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width);
+void MergeUV_Unaligned_NEON(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width);
+void MergeUV_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width);
+void MergeUV_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width);
+void MergeUV_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width);
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_X86(const uint8* src, uint8* dst, int count);
@@ -720,6 +735,24 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
                               uint8* rgb_buf,
                               int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix);
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 449
+#define LIBYUV_VERSION 451
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -368,8 +368,7 @@ static int X420ToI420(const uint8* src_y,
   void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
       SplitUV_C;
 #if defined(HAS_SPLITUV_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     SplitUV = SplitUV_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
       SplitUV = SplitUV_Unaligned_SSE2;
@@ -380,11 +379,9 @@ static int X420ToI420(const uint8* src_y,
       }
     }
   }
-  }
 #endif
 #if defined(HAS_SPLITUV_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    if (halfwidth >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
     SplitUV = SplitUV_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
       SplitUV = SplitUV_Unaligned_AVX2;
@@ -395,11 +392,9 @@ static int X420ToI420(const uint8* src_y,
       }
     }
   }
-  }
 #endif
 #if defined(HAS_SPLITUV_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
     SplitUV = SplitUV_Any_NEON;
     if (IS_ALIGNED(halfwidth, 16)) {
       SplitUV = SplitUV_Unaligned_NEON;
@@ -410,11 +405,9 @@ static int X420ToI420(const uint8* src_y,
       }
     }
   }
-  }
 #endif
 #if defined(HAS_SPLITUV_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
-    if (halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
     SplitUV = SplitUV_Any_MIPS_DSPR2;
     if (IS_ALIGNED(halfwidth, 16)) {
       SplitUV = SplitUV_Unaligned_MIPS_DSPR2;
@@ -425,7 +418,6 @@ static int X420ToI420(const uint8* src_y,
       }
     }
   }
-  }
 #endif
   if (dst_y) {
...
@@ -521,18 +521,46 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width) = MergeUV_C;
-#if defined(HAS_MERGEUV_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
-      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-      IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-    MergeUV = MergeUV_SSE2;
-  }
-#elif defined(HAS_MERGEUV_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
-    MergeUV = MergeUV_NEON;
-  }
-#endif
+#if defined(HAS_SPLITUV_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUV = MergeUV_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUV = MergeUV_Unaligned_SSE2;
+      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUV = MergeUV_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SPLITUV_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUV = MergeUV_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUV = MergeUV_Unaligned_AVX2;
+      if (IS_ALIGNED(src_u, 32) && IS_ALIGNED(src_stride_u, 32) &&
+          IS_ALIGNED(src_v, 32) && IS_ALIGNED(src_stride_v, 32) &&
+          IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
+        MergeUV = MergeUV_AVX2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SPLITUV_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUV = MergeUV_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUV = MergeUV_Unaligned_NEON;
+      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUV = MergeUV_NEON;
+      }
+    }
+  }
+#endif
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   int halfheight = (height + 1) >> 1;
   for (int y = 0; y < halfheight; ++y) {
...
@@ -1190,32 +1190,53 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
 #endif
 #undef UV422ANY
 
-#define SPLITUVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
+#define SPLITUVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
     void NAMEANY(const uint8* src_uv, \
                  uint8* dst_u, uint8* dst_v, int width) { \
       int n = width & ~MASK; \
       ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
-      ANYTOUV_C(src_uv + n * BPP, \
+      ANYTOUV_C(src_uv + n * 2, \
                 dst_u + n, \
                 dst_v + n, \
                 width & MASK); \
     }
 
 #ifdef HAS_SPLITUV_SSE2
-SPLITUVANY(SplitUV_Any_SSE2, SplitUV_Unaligned_SSE2, SplitUV_C, 2, 15)
+SPLITUVANY(SplitUV_Any_SSE2, SplitUV_Unaligned_SSE2, SplitUV_C, 15)
 #endif
 #ifdef HAS_SPLITUV_AVX2
-SPLITUVANY(SplitUV_Any_AVX2, SplitUV_Unaligned_AVX2, SplitUV_C, 2, 31)
+SPLITUVANY(SplitUV_Any_AVX2, SplitUV_Unaligned_AVX2, SplitUV_C, 31)
 #endif
 #ifdef HAS_SPLITUV_NEON
-SPLITUVANY(SplitUV_Any_NEON, SplitUV_Unaligned_NEON, SplitUV_C, 2, 15)
+SPLITUVANY(SplitUV_Any_NEON, SplitUV_Unaligned_NEON, SplitUV_C, 15)
 #endif
 #ifdef HAS_SPLITUV_MIPS_DSPR2
-SPLITUVANY(SplitUV_Any_MIPS_DSPR2, SplitUV_Unaligned_MIPS_DSPR2, SplitUV_C,
-           2, 15)
+SPLITUVANY(SplitUV_Any_MIPS_DSPR2, SplitUV_Unaligned_MIPS_DSPR2, SplitUV_C, 15)
 #endif
 #undef SPLITUVANY
 
+#define MERGEUVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
+    void NAMEANY(const uint8* src_u, const uint8* src_v, \
+                 uint8* dst_uv, int width) { \
+      int n = width & ~MASK; \
+      ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
+      ANYTOUV_C(src_u + n, \
+                src_v + n, \
+                dst_uv + n * 2, \
+                width & MASK); \
+    }
+
+#ifdef HAS_MERGEUV_SSE2
+MERGEUVANY(MergeUV_Any_SSE2, MergeUV_Unaligned_SSE2, MergeUV_C, 15)
+#endif
+#ifdef HAS_MERGEUV_AVX2
+MERGEUVANY(MergeUV_Any_AVX2, MergeUV_Unaligned_AVX2, MergeUV_C, 31)
+#endif
+#ifdef HAS_MERGEUV_NEON
+MERGEUVANY(MergeUV_Any_NEON, MergeUV_Unaligned_NEON, MergeUV_C, 15)
+#endif
+#undef MERGEUVANY
+
 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
                                const int32* previous_cumsum, int width) {
   int32 row_sum[4] = {0, 0, 0, 0};
...
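For orientation, the SSE2 instantiation above expands (apart from whitespace) to a small wrapper: the SIMD kernel handles the largest multiple of 16 pixels and MergeUV_C finishes the remainder. A hand expansion of MERGEUVANY(MergeUV_Any_SSE2, MergeUV_Unaligned_SSE2, MergeUV_C, 15):

// Illustrative expansion of the MERGEUVANY macro for the SSE2 case.
void MergeUV_Any_SSE2(const uint8* src_u, const uint8* src_v,
                      uint8* dst_uv, int width) {
  int n = width & ~15;                        // largest multiple of 16
  MergeUV_Unaligned_SSE2(src_u, src_v, dst_uv, n);
  MergeUV_C(src_u + n,                        // finish the 0..15 leftover pixels
            src_v + n,
            dst_uv + n * 2,                   // 2 bytes of UV per output pixel
            width & 15);
}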
@@ -380,10 +380,32 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 #ifdef HAS_MERGEUV_NEON
 // Reads 16 U's and V's and writes out 16 pairs of UV.
+// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
 void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width) {
   asm volatile (
     ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0:128]!                \n"  // load U
+    "vld1.u8    {q1}, [%1:128]!                \n"  // load V
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vst2.u8    {q0, q1}, [%2:128]!            \n"  // store 16 pairs of UV
+    "bgt        1b                             \n"
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "memory", "cc", "q0", "q1"  // Clobber List
+  );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUV_Unaligned_NEON(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width) {
+  asm volatile (
+    ".p2align  2                               \n"
   "1:                                          \n"
     "vld1.u8    {q0}, [%0]!                    \n"  // load U
     "vld1.u8    {q1}, [%1]!                    \n"  // load V
...
@@ -2576,6 +2576,35 @@ void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #endif
   );
 }
+
+void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width) {
+  asm volatile (
+    "sub        %0,%1                          \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "movdqu     (%0,%1,1),%%xmm1               \n"
+    "lea        0x10(%0),%0                    \n"
+    "movdqa     %%xmm0,%%xmm2                  \n"
+    "punpcklbw  %%xmm1,%%xmm0                  \n"
+    "punpckhbw  %%xmm1,%%xmm2                  \n"
+    "movdqu     %%xmm0,(%2)                    \n"
+    "movdqu     %%xmm2,0x10(%2)                \n"
+    "lea        0x20(%2),%2                    \n"
+    "sub        $0x10,%3                       \n"
+    "jg         1b                             \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
 #endif  // HAS_MERGEUV_SSE2
 #ifdef HAS_COPYROW_SSE2
...
@@ -2650,6 +2650,36 @@ void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
     ret
   }
 }
+
+__declspec(naked) __declspec(align(16))
+void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax
+
+    align      16
+  convertloop:
+    movdqu     xmm0, [eax]           // read 16 U's
+    movdqu     xmm1, [eax + edx]     // and 16 V's
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1            // first 8 UV pairs
+    punpckhbw  xmm2, xmm1            // next 8 UV pairs
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
 #endif  // HAS_MERGEUV_SSE2
 #ifdef HAS_COPYROW_SSE2
...
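Both new MergeUV_Unaligned_SSE2 bodies (the GCC inline-asm version and the Visual C++ version above) compute the same thing: punpcklbw/punpckhbw interleave 16 U bytes with 16 V bytes into 16 UV pairs per iteration, using unaligned loads and stores. A plain-C sketch of one iteration, for reference only (the helper name is made up for illustration; MergeUV_C declared in row.h is the real scalar path):

typedef unsigned char uint8;  // stand-in for libyuv's uint8

// One 16-pixel iteration of the unaligned SSE2 kernel, in scalar form.
static void MergeUV16_C(const uint8* src_u, const uint8* src_v,
                        uint8* dst_uv) {
  for (int i = 0; i < 16; ++i) {
    dst_uv[i * 2 + 0] = src_u[i];  // low byte of each pair: U
    dst_uv[i * 2 + 1] = src_v[i];  // high byte of each pair: V
  }
}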