Add MSA optimized I422ToYUY2Row, I422ToUYVYRow functions

R=fbarchard@google.com BUG=libyuv:634 Performance gains :- I422ToYUY2Row_MSA - ~12x I422ToYUY2Row_Any_MSA - ~7x I422ToUYVYRow_MSA - ~12x I422ToUYVYRow_Any_MSA - ~7x Review URL: https://codereview.chromium.org/2378753004 .

Add MSA optimized I422ToYUY2Row, I422ToUYVYRow functions
R=fbarchard@google.com BUG=libyuv:634 Performance gains :- I422ToYUY2Row_MSA - ~12x I422ToYUY2Row_Any_MSA - ~7x I422ToUYVYRow_MSA - ~12x I422ToUYVYRow_Any_MSA - ~7x Review URL: https://codereview.chromium.org/2378753004 .
7018f5be · Frank Barchard · aa197ee1 · 7018f5be · 7018f5be · 7018f5be
Commit 7018f5be authored Oct 04, 2016 by Frank Barchard
6 changed files
--- a/include/libyuv/macros_msa.h
+++ b/include/libyuv/macros_msa.h
@@ -71,6 +71,19 @@
 }
 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {           \
+  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
+  out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
+}
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
 #endif  /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
 #endif  // INCLUDE_LIBYUV_MACROS_MSA_H_
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -370,6 +370,8 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_MIRRORROW_MSA
 #define HAS_ARGBMIRRORROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_I422TOUYVYROW_MSA
 #endif
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -1769,6 +1771,22 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_yuy2, int width);
+void I422ToUYVYRow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_MSA(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_MSA(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_uyvy, int width);
 // Effects related row functions.
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -237,6 +237,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
    }
  }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
  for (y = 0; y < height - 1; y += 2) {
    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -298,6 +306,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
    }
  }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
  for (y = 0; y < height; ++y) {
    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -345,6 +361,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
    }
  }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
  for (y = 0; y < height - 1; y += 2) {
    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);

--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -553,6 +553,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
    }
  }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
  {
    // Allocate a rows of yuv.
@@ -655,6 +663,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
    }
  }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
  {
    // Allocate a rows of yuv.

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -80,9 +80,15 @@ ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
 #ifdef HAS_I422TOYUY2ROW_NEON
 ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_I422TOUYVYROW_NEON
 ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_BLENDPLANEROW_AVX2
 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
 #endif

--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -53,6 +53,54 @@ void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
  }
 }
+void I422ToYUY2Row_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_yuy2,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
+    ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
+    ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_yuy2 += 64;
+  }
+}
+void I422ToUYVYRow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_uyvy,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
+    ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
+    ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_uyvy += 64;
+  }
+}
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv