Add MSA optimized I422AlphaToARGBRow_MSA and I422ToRGB24Row_MSA functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C vectorized)
I422AlphaToARGBRow_MSA     : ~1.4x
I422AlphaToARGBRow_Any_MSA : ~1.4x
I422ToRGB24Row_MSA         : ~4.8x
I422ToRGB24Row_Any_MSA     : ~4.8x

Performance Gain (vs C non-vectorized)
I422AlphaToARGBRow_MSA     : ~7.0x
I422AlphaToARGBRow_Any_MSA : ~7.0x
I422ToRGB24Row_MSA         : ~7.9x
I422ToRGB24Row_Any_MSA     : ~7.7x

Review URL: https://codereview.chromium.org/2454433003 .

Add MSA optimized I422AlphaToARGBRow_MSA and I422ToRGB24Row_MSA functions
R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C vectorized)
I422AlphaToARGBRow_MSA     : ~1.4x
I422AlphaToARGBRow_Any_MSA : ~1.4x
I422ToRGB24Row_MSA         : ~4.8x
I422ToRGB24Row_Any_MSA     : ~4.8x

Performance Gain (vs C non-vectorized)
I422AlphaToARGBRow_MSA     : ~7.0x
I422AlphaToARGBRow_Any_MSA : ~7.0x
I422ToRGB24Row_MSA         : ~7.9x
I422ToRGB24Row_Any_MSA     : ~7.7x

Review URL: https://codereview.chromium.org/2454433003 .
532f5708 · Frank Barchard · 02ae8b60 · 532f5708 · 532f5708 · 532f5708
Commit 532f5708 authored Oct 26, 2016 by Frank Barchard
8 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1631
+Version: 1632
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -383,6 +383,8 @@ extern "C" {
 #define HAS_ARGBTOUVROW_MSA
 #define HAS_I422TOARGBROW_MSA
 #define HAS_I422TORGBAROW_MSA
+#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TORGB24ROW_MSA
 #endif

 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -663,6 +665,19 @@ void I422ToRGBARow_MSA(const uint8* src_y,
                       uint8* dst_rgba,
                       const struct YuvConstants* yuvconstants,
                       int width);
+void I422AlphaToARGBRow_MSA(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            const uint8* a_buf,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB24Row_MSA(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);

 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
 void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
@@ -1653,6 +1668,19 @@ void I422ToRGBARow_Any_MSA(const uint8* src_y,
                           uint8* dst_argb,
                           const struct YuvConstants* yuvconstants,
                           int width);
+void I422AlphaToARGBRow_Any_MSA(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                const uint8* src_a,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB24Row_Any_MSA(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_rgb24,
+                            const struct YuvConstants* yuvconstants,
+                            int width);

 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1631
+#define LIBYUV_VERSION 1632

 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -564,6 +564,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
    I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
  }
 #endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -558,6 +558,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
    }
  }
 #endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_MSA;
+    }
+  }
+#endif

  for (y = 0; y < height; ++y) {
    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -53,6 +53,9 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
 #ifdef HAS_I422ALPHATOARGBROW_NEON
 ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
 #endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
 #undef ANY41C

 // Any 3 planes to 1.
@@ -168,6 +171,7 @@ ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
 #ifdef HAS_I422TOARGBROW_MSA
 ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
 ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
 #endif
 #undef ANY31C


--- a/source/row_msa.cc
+++ b/source/row_msa.cc
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2711,7 +2711,6 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
  );
 }

-
 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
  "1:                                          \n"
@@ -2735,31 +2734,6 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  );
 }

-void HalfFloatRow_NEON2(const uint16* src, uint16* dst, float scale, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
-    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
-    "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
-    "uxtl2      v1.4s, v1.8h                   \n"
-    "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
-    "scvtf      v1.4s, v1.4s                   \n"
-    "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
-    "fmul       v1.4s, v1.4s, %3.s[0]          \n"
-    "uqshrn     v4.4h, v2.4s, #13              \n"  // isolate halffloat
-    "uqshrn2    v4.8h, v1.4s, #13              \n"
-   MEMACCESS(1)
-    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts
-    "b.gt       1b                             \n"
-  : "+r"(src),    // %0
-    "+r"(dst),    // %1
-    "+r"(width)   // %2
-  : "w"(scale * 1.9259299444e-34f)    // %3
-  : "cc", "memory", "v1", "v2", "v4"
-  );
-}
-
 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
  "1:                                          \n"