Add MIPS SIMD Arch (MSA) optimized ARGBMirrorRow function

This patch adds an MSA-optimized ARGBMirrorRow function to the libyuv project. Performance gain: ~3x. R=fbarchard@google.com BUG=libyuv:634 Review URL: https://codereview.chromium.org/2368313003

Add MIPS SIMD Arch (MSA) optimized ARGBMirrorRow function
This patch adds an MSA-optimized ARGBMirrorRow function to the libyuv project. Performance gain: ~3x. R=fbarchard@google.com BUG=libyuv:634 Review URL: https://codereview.chromium.org/2368313003
61814908 · Frank Barchard · feaff94b · 61814908 · 61814908 · 61814908
Commit 61814908 authored Sep 26, 2016 by Frank Barchard
6 changed files
--- a/include/libyuv/macros_msa.h
+++ b/include/libyuv/macros_msa.h
@@ -74,5 +74,20 @@
  out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
 }
 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+
+#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {   \
+  out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
+  out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
+}  /* Two __msa_vshf_w calls: (in0, in1, mask0) -> out0, (in2, in3, mask1) -> out1; results cast to RTYPE. */
+#define VSHF_W2_UB(...) VSHF_W2(v16u8, __VA_ARGS__)  /* RTYPE = v16u8 */
+#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)  /* RTYPE = v16i8 */
+#define VSHF_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                mask0, mask1, mask2, mask3,                     \
+                out0, out1, out2, out3) {                       \
+  VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
+  VSHF_W2(RTYPE, in4, in5, in6, in7, mask2, mask3, out2, out3)  \
+}  /* Four shuffles built from two VSHF_W2 expansions: inputs 0-3 -> out0/out1, inputs 4-7 -> out2/out3. */
+#define VSHF_W4_UB(...) VSHF_W4(v16u8, __VA_ARGS__)  /* RTYPE = v16u8 */
+
 #endif  /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
 #endif  /* __MACROS_MSA_H__ */
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -374,6 +374,7 @@ extern "C" {

 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_MIRRORROW_MSA
+#define HAS_ARGBMIRRORROW_MSA
 #endif

 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -832,10 +833,12 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_MSA(const uint8* src, uint8* dst, int width);

 void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -659,6 +659,14 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
    }
  }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif

  // Mirror plane
  for (y = 0; y < height; ++y) {

--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -118,6 +118,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
    }
  }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -643,6 +643,9 @@ ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
 #ifdef HAS_ARGBMIRRORROW_NEON
 ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
 #endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
 #undef ANY11M

 // Any 1 plane. (memset)

--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -10,36 +10,52 @@

 #include "libyuv/row.h"

+// This module is for GCC MSA
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #include "libyuv/macros_msa.h"
-#endif

 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif

-#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
-  int count;
+  int x;  // Byte counter; the loop below advances it by 64 per iteration.
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1, dst2, dst3;
-  v16i8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-
+  v16i8 shuffler = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };  // Byte indices 15..0, i.e. reversed order.
  src += width - 64;

-  for (count = 0; count < width; count += 64) {
+  for (x = 0; x < width; x += 64) {  // NOTE(review): assumes width is a positive multiple of 64 - confirm at the call sites.
    LD_UB4(src, 16, src3, src2, src1, src0);
-    VSHF_B2_UB(src3, src3, src2, src2, mask, mask, dst3, dst2);
-    VSHF_B2_UB(src1, src1, src0, src0, mask, mask, dst1, dst0);
+    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);  // Each vector is shuffled against itself.
+    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);  // Listed dst0..dst3, the opposite of the load's src3..src0.
+    dst += 64;  // Destination advances forwards...
+    src -= 64;  // ...while the source pointer steps backwards.
+  }
+}
+
+void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+  int x;  // Pixel counter; 16 pixels (64 bytes) are handled per iteration.
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 shuffler = { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 };  // Reverses the four 4-byte groups; bytes inside a group keep their order.
+  src += width * 4 - 64;  // 4 bytes per pixel: point at the last 64 bytes of the row.
+
+  for (x = 0; x < width; x += 16) {  // NOTE(review): assumes width is a positive multiple of 16 (the ARGBMirror dispatch checks IS_ALIGNED(width, 16)).
+    LD_UB4(src, 16, src3, src2, src1, src0);  // Listed src3..src0; the ST_UB4 below lists dst0..dst3.
+    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);  // Each vector is shuffled against itself.
+    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
    dst += 64;
    src -= 64;
  }
 }
-#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)