Add MIPS SIMD Arch (MSA) optimized MirrorRow function

As per the preparation patch added to the Chromium sources (2150943003: Add MIPS SIMD Arch (MSA) build flags for GYP/GN builds), this patch adds the first MSA optimized function to the libYUV project. BUG=libyuv:634 R=fbarchard@google.com Review URL: https://codereview.chromium.org/2285683002 .

Add MIPS SIMD Arch (MSA) optimized MirrorRow function
As per the preparation patch added to the Chromium sources (2150943003: Add MIPS SIMD Arch (MSA) build flags for GYP/GN builds), this patch adds the first MSA optimized function to the libYUV project. BUG=libyuv:634 R=fbarchard@google.com Review URL: https://codereview.chromium.org/2285683002 .
c5323b0f · Frank Barchard · 5da918b4 · c5323b0f · c5323b0f · c5323b0f
Commit c5323b0f authored Sep 22, 2016 by Frank Barchard
16 changed files
--- a/Android.mk
+++ b/Android.mk
@@ -53,6 +53,12 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
        source/scale_neon.cc.neon
 endif
+ifeq ($(TARGET_ARCH_ABI),mips)
+    LOCAL_CFLAGS += -DLIBYUV_MSA
+    LOCAL_SRC_FILES += \
+        source/row_msa.cc
+endif
 LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/include

--- a/BUILD.gn
+++ b/BUILD.gn
@@ -94,6 +94,10 @@ static_library("libyuv") {
    deps += [ ":libyuv_neon" ]
  }
+  if (libyuv_use_msa) {
+    deps += [ ":libyuv_msa" ]
+  }
  if (is_nacl) {
    # Always enable optimization under NaCl to workaround crbug.com/538243 .
    configs -= [ "//build/config/compiler:default_optimization" ]
@@ -124,6 +128,17 @@ if (libyuv_use_neon) {
  }
 }
+if (libyuv_use_msa) {
+  static_library("libyuv_msa") {
+    sources = [
+      # MSA Source Files
+      "source/row_msa.cc",
+    ]
+    public_configs = [ ":libyuv_config" ]
+  }
+}
 if (libyuv_include_tests) {
  config("libyuv_unittest_warnings_config") {
    if (!is_win) {

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,6 +40,7 @@ set(ly_source_files
  ${ly_src_dir}/row_any.cc
  ${ly_src_dir}/row_common.cc
  ${ly_src_dir}/row_mips.cc
+  ${ly_src_dir}/row_msa.cc
  ${ly_src_dir}/row_neon.cc
  ${ly_src_dir}/row_neon64.cc
  ${ly_src_dir}/row_gcc.cc
@@ -80,6 +81,7 @@ set(ly_header_files
  ${ly_inc_dir}/libyuv/convert_from.h
  ${ly_inc_dir}/libyuv/convert_from_argb.h
  ${ly_inc_dir}/libyuv/cpu_id.h
+  ${ly_inc_dir}/libyuv/macros_msa.h
  ${ly_inc_dir}/libyuv/planar_functions.h
  ${ly_inc_dir}/libyuv/rotate.h
  ${ly_inc_dir}/libyuv/rotate_argb.h

--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -195,6 +195,16 @@ Running test with C code:
    gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true"
    ninja -C out/Official
+#### Building mips with GN
+mipsel
+    gn gen out/Default "--args=is_debug=false target_cpu=\"mipsel\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false"
+    ninja -C out/Default
+mips64el
+    gn gen out/Default "--args=is_debug=false target_cpu=\"mips64el\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false"
+    ninja -C out/Default
 ### Linux
    GYP_DEFINES="target_arch=x64" ./gyp_libyuv

--- a/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -42,6 +42,7 @@ static const int kCpuHasAVX3 = 0x2000;
 // These flags are only valid on MIPS processors.
 static const int kCpuHasMIPS = 0x10000;
 static const int kCpuHasDSPR2 = 0x20000;
+static const int kCpuHasMSA = 0x40000;
 // Internal function used to auto-init.
 LIBYUV_API

--- a/include/libyuv/macros_msa.h
+++ b/include/libyuv/macros_msa.h
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef __MACROS_MSA_H__
+#define __MACROS_MSA_H__
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include <stdint.h>
+#include <msa.h>
+/* Description : Load / store one vector through a pointer cast
+   Arguments   : LD_B - Inputs - psrc;     Return Type - as per RTYPE
+                 ST_B - Inputs - in, pdst
+   Details     : 'psrc' / 'pdst' is cast to (RTYPE*) and dereferenced.
+                 Both expansions are wrapped in parentheses so that they
+                 can be used safely inside a larger expression.
+*/
+#define LD_B(RTYPE, psrc) (*((RTYPE*)(psrc)))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define ST_B(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+/* Description : Load two vectors with 16 'byte' sized elements
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+                 'stride' is parenthesized in the expansion, so it may be
+                 an arbitrary expression. 'psrc' is evaluated twice.
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
+  out0 = LD_B(RTYPE, (psrc));                     \
+  out1 = LD_B(RTYPE, (psrc) + (stride));          \
+}
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+/* Description : Load four vectors with 16 'byte' sized elements
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1, out2, out3
+                 Return Type - as per RTYPE
+   Details     : Load 'out0'..'out3' from (psrc), (psrc + stride),
+                 (psrc + 2 * stride) and (psrc + 3 * stride).
+                 'stride' is parenthesized in the expansion, so it may be
+                 an arbitrary expression. 'psrc' and 'stride' are evaluated
+                 more than once.
+*/
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) {    \
+  LD_B2(RTYPE, (psrc), (stride), out0, out1);                   \
+  LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);    \
+}
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+/* Description : Store two vectors with stride each having 16 'byte' sized
+                 elements
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+                 'stride' is parenthesized in the expansion, so it may be
+                 an arbitrary expression. 'pdst' is evaluated twice.
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
+  ST_B(RTYPE, in0, (pdst));                     \
+  ST_B(RTYPE, in1, (pdst) + (stride));          \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
+/* Description : Store four vectors with stride each having 16 'byte' sized
+                 elements
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store 'in0'..'in3' to (pdst), (pdst + stride),
+                 (pdst + 2 * stride) and (pdst + 3 * stride).
+                 'stride' is parenthesized in the expansion, so it may be
+                 an arbitrary expression. 'pdst' and 'stride' are evaluated
+                 more than once.
+*/
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {      \
+  ST_B2(RTYPE, in0, in1, (pdst), (stride));                   \
+  ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));    \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+                 Byte elements from 'in2' & 'in3' are copied selectively to
+                 'out1' as per control vector 'mask1'
+                 Every input is parenthesized before it is cast, so the
+                 cast applies to the whole argument expression.
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \
+  out0 = (RTYPE) __msa_vshf_b((v16i8) (mask0), (v16i8) (in1),           \
+                              (v16i8) (in0));                           \
+  out1 = (RTYPE) __msa_vshf_b((v16i8) (mask1), (v16i8) (in3),           \
+                              (v16i8) (in2));                           \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#endif  /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
+#endif  /* __MACROS_MSA_H__ */
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -372,6 +372,10 @@ extern "C" {
 #endif
 #endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_MIRRORROW_MSA
+#endif
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
 #if defined(VISUALC_HAS_AVX2)
 #define SIMD_ALIGNED(var) __declspec(align(32)) var
@@ -809,11 +813,13 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
 void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_MSA(const uint8* src, uint8* dst, int width);
 void MirrorRow_C(const uint8* src, uint8* dst, int width);
 void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
 void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
 void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
 void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_MSA(const uint8* src, uint8* dst, int width);
 void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                       int width);

--- a/libyuv.gni
+++ b/libyuv.gni
@@ -8,10 +8,13 @@
 import("//build_overrides/build.gni")
 import("//build/config/arm.gni")
+import("//build/config/mips.gni")
 declare_args() {
  libyuv_include_tests = !build_with_chromium
  libyuv_disable_jpeg = false
  libyuv_use_neon = (current_cpu == "arm64" ||
      (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)))
+  libyuv_use_msa = (current_cpu == "mips64el" || current_cpu == "mipsel") &&
+    mips_use_msa
 }
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -26,12 +26,18 @@
    # Link-Time Optimizations.
    'use_lto%': 0,
    'build_neon': 0,
+    'build_msa': 0,
    'conditions': [
       ['(target_arch == "armv7" or target_arch == "armv7s" or \
       (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
       and (arm_neon == 1 or arm_neon_optional == 1)', {
         'build_neon': 1,
       }],
+       ['(target_arch == "mipsel" or target_arch == "mips64el")\
+       and (mips_msa == 1)',
+       {
+         'build_msa': 1,
+       }],
    ],
  },
@@ -79,6 +85,11 @@
            }],
          ],
        }],
+        ['build_msa != 0', {
+          'defines': [
+            'LIBYUV_MSA',
+          ],
+        }],
        ['OS != "ios" and libyuv_disable_jpeg != 1', {
          'defines': [
            'HAVE_JPEG'

--- a/libyuv.gypi
+++ b/libyuv.gypi
@@ -18,6 +18,7 @@
      'include/libyuv/convert_from.h',
      'include/libyuv/convert_from_argb.h',
      'include/libyuv/cpu_id.h',
+      'include/libyuv/macros_msa.h',
      'include/libyuv/mjpeg_decoder.h',
      'include/libyuv/planar_functions.h',
      'include/libyuv/rotate.h',
@@ -61,6 +62,7 @@
      'source/row_common.cc',
      'source/row_gcc.cc',
      'source/row_mips.cc',
+      'source/row_msa.cc',
      'source/row_neon.cc',
      'source/row_neon64.cc',
      'source/row_win.cc',

--- a/libyuv_test.gyp
+++ b/libyuv_test.gyp
@@ -86,6 +86,12 @@
            'LIBYUV_NEON'
          ],
        }],
+        [ '(target_arch == "mipsel" or target_arch == "mips64el") \
+          and (mips_msa == 1)', {
+          'defines': [
+            'LIBYUV_MSA'
+          ],
+        }],
      ], # conditions
      'defines': [
        # Enable the following 3 macros to turn off assembly for specified CPU.

--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -161,6 +161,38 @@ int ArmCpuCaps(const char* cpuinfo_name) {
  return 0;
 }
+// Reports whether the MIPS ASE named by |ase| is listed in a cpuinfo file.
+//   cpuinfo_name: path of the file to scan (InitCpuFlags passes
+//                 "/proc/cpuinfo").
+//   ase:          ASE token including its leading space; " msa" and " dspr2"
+//                 are the recognised values.
+// Returns kCpuHasMSA or kCpuHasDSPR2 when the matching ASE appears on the
+// "ASEs implemented" line, or when the file cannot be opened (the ASE is then
+// assumed to be present). Returns 0 when the ASE is not listed or |ase| is not
+// a recognised token.
+LIBYUV_API SAFEBUFFERS
+int MipsCpuCaps(const char* cpuinfo_name, const char ase[]) {
+  char cpuinfo_line[512];
+  int len = (int)strlen(ase);
+  int flag = 0;
+  FILE* f;
+  // Resolve the flag once so every exit path returns a defined value.
+  if (strcmp(ase, " msa") == 0) {
+    flag = kCpuHasMSA;
+  } else if (strcmp(ase, " dspr2") == 0) {
+    flag = kCpuHasDSPR2;
+  }
+  if (!flag) {
+    // Unknown ASE: nothing to report. Returning here avoids using a NULL or
+    // already closed FILE* further down.
+    return 0;
+  }
+  f = fopen(cpuinfo_name, "r");
+  if (!f) {
+    // ase enabled if /proc/cpuinfo is unavailable.
+    return flag;
+  }
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    // strncmp stops at the terminating NUL, so a line shorter than the prefix
+    // never causes a read of uninitialized buffer bytes.
+    if (strncmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+      char* p = strstr(cpuinfo_line, ase);
+      // Require a delimiter after the token so a longer name that merely
+      // starts with |ase| does not match.
+      if (p && (p[len] == ' ' || p[len] == '\n')) {
+        fclose(f);
+        return flag;
+      }
+    }
+  }
+  fclose(f);
+  return 0;
+}
 // CPU detect function for SIMD instruction sets.
 LIBYUV_API
 int cpu_info_ = 0;  // cpu_info is not initialized yet.
@@ -253,11 +285,17 @@ int InitCpuFlags(void) {
 #if defined(__mips__) && defined(__linux__)
 #if defined(__mips_dspr2)
  cpu_info |= kCpuHasDSPR2;
+#endif
+#if defined(__mips_msa)
+  cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
 #endif
  cpu_info |= kCpuHasMIPS;
  if (getenv("LIBYUV_DISABLE_DSPR2")) {
    cpu_info &= ~kCpuHasDSPR2;
  }
+  if (getenv("LIBYUV_DISABLE_MSA")) {
+    cpu_info &= ~kCpuHasMSA;
+  }
 #endif
 #if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -401,6 +401,14 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
    MirrorRow = MirrorRow_DSPR2;
  }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
+}
 #endif
  // Mirror plane

--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -141,6 +141,14 @@ void RotatePlane180(const uint8* src, int src_stride,
    MirrorRow = MirrorRow_DSPR2;
  }
 #endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
+}
+#endif
 #if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -631,6 +631,9 @@ ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
 #ifdef HAS_MIRRORROW_NEON
 ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
 #endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
 #ifdef HAS_ARGBMIRRORROW_AVX2
 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 #endif

--- a/source/row_msa.cc
+++ b/source/row_msa.cc
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "libyuv/row.h"
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+#endif
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+// Writes the bytes of |src| to |dst| in reverse order, 64 bytes per pass.
+// Each pass loads the last not-yet-mirrored 64 source bytes as four 16-byte
+// vectors, reverses the bytes inside every vector, and stores the vectors in
+// the opposite order at the current |dst| position.
+// A full 64 bytes are read and written on every pass regardless of |width|.
+// NOTE(review): presumably |width| is a multiple of 64 - confirm with callers.
+void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+  // Shuffle control selecting bytes 15..0, i.e. a byte reversal of one vector.
+  v16i8 flip = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  // blkN holds source bytes [16 * N, 16 * N + 15] of the current 64-byte chunk.
+  v16u8 blk0, blk1, blk2, blk3;
+  // revN is blkN with its bytes reversed.
+  v16u8 rev0, rev1, rev2, rev3;
+  // Start at the final 64-byte chunk of the row and walk backwards.
+  const uint8* tail = src + (width - 64);
+  int remaining = width;
+  while (remaining > 0) {
+    LD_UB4(tail, 16, blk0, blk1, blk2, blk3);
+    VSHF_B2_UB(blk3, blk3, blk2, blk2, flip, flip, rev3, rev2);
+    VSHF_B2_UB(blk1, blk1, blk0, blk0, flip, flip, rev1, rev0);
+    // The highest source block becomes the lowest destination block.
+    ST_UB4(rev3, rev2, rev1, rev0, dst, 16);
+    tail -= 64;
+    dst += 64;
+    remaining -= 64;
+  }
+}
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif