Style fixes for mips version of SplitUV for nv12/21

BUG=126
TEST=lint passes and rotate_test and cpu_test on try bot pass.
Review URL: https://webrtc-codereview.appspot.com/884004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@418 16f28f9a-4ce2-e073-06de-1de4eb20be90

Style fixes for mips version of SplitUV for nv12/21
BUG=126
TEST=lint passes and rotate_test and cpu_test on try bot pass.
Review URL: https://webrtc-codereview.appspot.com/884004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@418 16f28f9a-4ce2-e073-06de-1de4eb20be90
bb6bddc9 · fbarchard@google.com · c4163acb · bb6bddc9 · bb6bddc9 · bb6bddc9
Commit bb6bddc9 authored Oct 14, 2012 by fbarchard@google.com
7 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 417
+Version: 418
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -271,7 +271,8 @@ void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                        int pix);
 void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 417
+#define LIBYUV_VERSION 418
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -380,7 +380,7 @@ static int X420ToI420(const uint8* src_y,
    SplitUV = SplitUV_SSE2;
  }
 #elif defined(HAS_SPLITUV_MIPS_DSPR2)
-if (TestCpuFlag(kCpuHasMIPS) && TestCpuFlag(kCpuHasMIPS_DSPR2)){
+if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
    SplitUV = SplitUV_MIPS_DSPR2;
  }
 #endif

--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -101,41 +101,38 @@ static const int kXCR_XFEATURE_ENABLED_MASK = 0;
 // For Arm, but public to allow testing on any CPU
 LIBYUV_API
 int ArmCpuCaps(const char* cpuinfo_name) {
-  int flags = 0;
+  FILE* f = fopen(cpuinfo_name, "r");
-  FILE* fin = fopen(cpuinfo_name, "r");
+  if (f) {
-  if (fin) {
    char buf[512];
-    while (fgets(buf, 511, fin)) {
+    while (fgets(buf, 511, f)) {
      if (memcmp(buf, "Features", 8) == 0) {
        char* p = strstr(buf, " neon");
        if (p && (p[5] == ' ' || p[5] == '\n')) {
-          flags |= kCpuHasNEON;
+          fclose(f);
-          break;
+          return kCpuHasNEON;
        }
      }
    }
-    fclose(fin);
+    fclose(f);
  }
-  return flags;
+  return 0;
 }
-static int MipsCpuCaps(const char *search_string) {
+static int MipsCpuCaps(const char* search_string) {
-  int flags = 0;
+  const char* file_name = "/proc/cpuinfo";
-  const char *file_name = "/proc/cpuinfo";
  char cpuinfo_line[256];
-  FILE *f = NULL;
+  FILE* f = NULL;
+  if ((f = fopen(file_name, "r")) != NULL) {
-  if ((f = fopen (file_name, "r")) != NULL) {
+    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
-    while (fgets (cpuinfo_line, sizeof (cpuinfo_line), f) != NULL) {
+      if (strstr(cpuinfo_line, search_string) != NULL) {
-      if (strstr (cpuinfo_line, search_string) != NULL) {
+        fclose(f);
-        flags |= kCpuHasMIPS_DSP;
+        return kCpuHasMIPS_DSP;
-        fclose (f);
-        return flags;
      }
    }
+    fclose(f);
  }
  /* Did not find string in the proc file, or not Linux ELF. */
-  return flags;
+  return 0;
 }
 // CPU detect function for SIMD instruction sets.
@@ -197,9 +194,9 @@ int InitCpuFlags(void) {
  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
    cpu_info_ &= ~kCpuHasAVX2;
  }
-#elif defined (__mips__) && defined(__linux__)
+#elif defined(__mips__) && defined(__linux__)
  // linux mips parse text file for dsp detect.
-  cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP
+  cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
 #if defined(__mips_dspr2)
  cpu_info_ |= kCpuHasMIPS_DSPR2;
 #endif
@@ -214,7 +211,6 @@ int InitCpuFlags(void) {
  if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
    cpu_info_ &= ~kCpuHasMIPS_DSPR2;
  }
 #elif defined(__arm__)
 #if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
  // linux arm parse text file for neon detect.

--- a/source/row_mips.cc
+++ b/source/row_mips.cc
 /*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -9,6 +9,7 @@
 */
 #include "libyuv/row.h"
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -16,191 +17,141 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && defined(__mips__)
 #ifdef HAS_SPLITUV_MIPS_DSPR2
-void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                        int width) {
-    __asm__ __volatile__(
+  __asm__ __volatile__ (
-        ".set push                                     \n\t"
+    ".set push                                     \n"
-        ".set noreorder                                \n\t"
+    ".set noreorder                                \n"
-        "srl             $t4, %[width], 4              \n\t"  // how many multiplies of 16 8bits
+    "srl             $t4, %[width], 4              \n"  // multiplies of 16
-        "blez            $t4, 2f                       \n\t"
+    "blez            $t4, 2f                       \n"
-        " andi           %[width], %[width], 0xf       \n\t"  // residual
+    " andi           %[width], %[width], 0xf       \n"  // residual
-        "andi            $t0, %[src_uv], 0x3           \n\t"
+    "andi            $t0, %[src_uv], 0x3           \n"
-        "andi            $t1, %[dst_u], 0x3            \n\t"
+    "andi            $t1, %[dst_u], 0x3            \n"
-        "andi            $t2, %[dst_v], 0x3            \n\t"
+    "andi            $t2, %[dst_v], 0x3            \n"
-        "or              $t0, $t0, $t1                 \n\t"
+    "or              $t0, $t0, $t1                 \n"
-        "or              $t0, $t0, $t2                 \n\t"
+    "or              $t0, $t0, $t2                 \n"
-        "beqz            $t0, 12f                      \n\t"  // if src and dsts are aligned
+    "beqz            $t0, 12f                      \n"  // test if aligned
-        " nop                                          \n\t"
+    " nop                                          \n"
    // src and dst are unaligned
-        "1:                                            \n\t"
-        "addiu           $t4, $t4, -1                  \n\t"
-        "lwr             $t0, 0(%[src_uv])             \n\t"
-        "lwl             $t0, 3(%[src_uv])             \n\t"  // t0 = V1 | U1 | V0 | U0
-        "lwr             $t1, 4(%[src_uv])             \n\t"
-        "lwl             $t1, 7(%[src_uv])             \n\t"  // t1 = V3 | U3 | V2 | U2
-        "lwr             $t2, 8(%[src_uv])             \n\t"
-        "lwl             $t2, 11(%[src_uv])            \n\t"  // t2 = V5 | U5 | V4 | U4
-        "lwr             $t3, 12(%[src_uv])            \n\t"
-        "lwl             $t3, 15(%[src_uv])            \n\t"  // t3 = V7 | U7 | V6 | U6
-        "lwr             $t5, 16(%[src_uv])            \n\t"
-        "lwl             $t5, 19(%[src_uv])            \n\t"  // t5 = V9 | U9 | V8 | U8
-        "lwr             $t6, 20(%[src_uv])            \n\t"
-        "lwl             $t6, 23(%[src_uv])            \n\t"  // t6 = V11 | U11 | V10 | U10
-        "lwr             $t7, 24(%[src_uv])            \n\t"
-        "lwl             $t7, 27(%[src_uv])            \n\t"  // t7 = V13 | U13 | V12 | U12
-        "lwr             $t8, 28(%[src_uv])            \n\t"
-        "lwl             $t8, 31(%[src_uv])            \n\t"  // t8 = V15 | U15 | V14 | U14
-        "precrq.qb.ph    $t9, $t1, $t0                 \n\t"  // t9 = V3 | V2 | V1 | V0
-        "precr.qb.ph     $t0, $t1, $t0                 \n\t"  // t0 = U3 | U2 | U1 | U0
-        "precrq.qb.ph    $t1, $t3, $t2                 \n\t"  // t1 = V7 | V6 | V5 | V4
-        "precr.qb.ph     $t2, $t3, $t2                 \n\t"  // t2 = U7 | U6 | U5 | U4
-        "precrq.qb.ph    $t3, $t6, $t5                 \n\t"  // t3 = V11 | V10 | V9 | V8
-        "precr.qb.ph     $t5, $t6, $t5                 \n\t"  // t5 = U11 | U10 | U9 | U8
-        "precrq.qb.ph    $t6, $t8, $t7                 \n\t"  // t6 = V15 | V14 | V13 | V12
-        "precr.qb.ph     $t7, $t8, $t7                 \n\t"  // t7 = U15 | U14 | U13 | U12
-        "addiu           %[src_uv], %[src_uv], 32      \n\t"
-        "swr             $t9, 0(%[dst_v])              \n\t"
-        "swl             $t9, 3(%[dst_v])              \n\t"
-        "swr             $t0, 0(%[dst_u])              \n\t"
-        "swl             $t0, 3(%[dst_u])              \n\t"
-        "swr             $t1, 4(%[dst_v])              \n\t"
-        "swl             $t1, 7(%[dst_v])              \n\t"
-        "swr             $t2, 4(%[dst_u])              \n\t"
-        "swl             $t2, 7(%[dst_u])              \n\t"
-        "swr             $t3, 8(%[dst_v])              \n\t"
-        "swl             $t3, 11(%[dst_v])             \n\t"
-        "swr             $t5, 8(%[dst_u])              \n\t"
-        "swl             $t5, 11(%[dst_u])             \n\t"
-        "swr             $t6, 12(%[dst_v])             \n\t"
-        "swl             $t6, 15(%[dst_v])             \n\t"
-        "swr             $t7, 12(%[dst_u])             \n\t"
-        "swl             $t7, 15(%[dst_u])             \n\t"
-        "addiu           %[dst_u], %[dst_u], 16        \n\t"
-        "bgtz            $t4, 1b                       \n\t"
-        " addiu          %[dst_v], %[dst_v], 16        \n\t"
-        "beqz            %[width], 3f                  \n\t"
-        " nop                                          \n\t"
-        "b               2f                            \n\t"
-        " nop                                          \n\t"
-        // src and dst are aligned
-        "12:                                           \n\t"
-        "addiu           $t4, $t4, -1                  \n\t"
-        "lw              $t0, 0(%[src_uv])             \n\t"  // t0 = V1 | U1 | V0 | U0
-        "lw              $t1, 4(%[src_uv])             \n\t"  // t1 = V3 | U3 | V2 | U2
-        "lw              $t2, 8(%[src_uv])             \n\t"  // t2 = V5 | U5 | V4 | U4
-        "lw              $t3, 12(%[src_uv])            \n\t"  // t3 = V7 | U7 | V6 | U6
-        "lw              $t5, 16(%[src_uv])            \n\t"  // t5 = V9 | U9 | V8 | U8
-        "lw              $t6, 20(%[src_uv])            \n\t"  // t6 = V11 | U11 | V10 | U10
-        "lw              $t7, 24(%[src_uv])            \n\t"  // t7 = V13 | U13 | V12 | U12
-        "lw              $t8, 28(%[src_uv])            \n\t"  // t8 = V15 | U15 | V14 | U14
-        "addiu           %[src_uv], %[src_uv], 32      \n\t"
-        "precrq.qb.ph    $t9, $t1, $t0                 \n\t"  // t9 = V3 | V2 | V1 | V0
-        "precr.qb.ph     $t0, $t1, $t0                 \n\t"  // t0 = U3 | U2 | U1 | U0
-        "precrq.qb.ph    $t1, $t3, $t2                 \n\t"  // t1 = V7 | V6 | V5 | V4
-        "precr.qb.ph     $t2, $t3, $t2                 \n\t"  // t2 = U7 | U6 | U5 | U4
-        "precrq.qb.ph    $t3, $t6, $t5                 \n\t"  // t3 = V11 | V10 | V9 | V8
-        "precr.qb.ph     $t5, $t6, $t5                 \n\t"  // t5 = U11 | U10 | U9 | U8
-        "precrq.qb.ph    $t6, $t8, $t7                 \n\t"  // t6 = V15 | V14 | V13 | V12
-        "precr.qb.ph     $t7, $t8, $t7                 \n\t"  // t7 = U15 | U14 | U13 | U12
-        "sw              $t9, 0(%[dst_v])              \n\t"
-        "sw              $t0, 0(%[dst_u])              \n\t"
-        "sw              $t1, 4(%[dst_v])              \n\t"
-        "sw              $t2, 4(%[dst_u])              \n\t"
-        "sw              $t3, 8(%[dst_v])              \n\t"
-        "sw              $t5, 8(%[dst_u])              \n\t"
-        "sw              $t6, 12(%[dst_v])             \n\t"
-        "sw              $t7, 12(%[dst_u])             \n\t"
-        "addiu           %[dst_v], %[dst_v], 16        \n\t"
-        "bgtz            $t4, 12b                      \n\t"
-        " addiu          %[dst_u], %[dst_u], 16        \n\t"
-        "beqz            %[width], 3f                  \n\t"
-        " nop                                          \n\t"
-        "2:                                            \n\t"
-        "lbu             $t0, 0(%[src_uv])             \n\t"
-        "lbu             $t1, 1(%[src_uv])             \n\t"
-        "addiu           %[src_uv], %[src_uv], 2       \n\t"
-        "addiu           %[width], %[width], -1        \n\t"
-        "sb              $t0, 0(%[dst_u])              \n\t"
-        "sb              $t1, 0(%[dst_v])              \n\t"
-        "addiu           %[dst_u], %[dst_u], 1         \n\t"
-        "bgtz            %[width], 2b                  \n\t"
-        " addiu          %[dst_v], %[dst_v], 1         \n\t"
-        "3:                                            \n\t"
-        ".set pop                                      \n\t"
-         : [src_uv] "+r" (src_uv), [width] "+r" (width),
-           [dst_u] "+r" (dst_u), [dst_v] "+r" (dst_v)
-         :
-         : "t0", "t1","t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"
-    );
-}
-#endif  // HAS_SPLITUV_MIPS_DSPR2
-#ifdef HAS_SPLITUV_MIPS_DSPR2
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
-// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
-void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    ".p2align  2                               \n"
    "1:                                            \n"
-    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    "addiu           $t4, $t4, -1                  \n"
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "lwr             $t0, 0(%[src_uv])             \n"
-    "vst1.u8    {q0}, [%1]!                    \n"  // store U
+    "lwl             $t0, 3(%[src_uv])             \n"  // V1 | U1 | V0 | U0
-    "vst1.u8    {q1}, [%2]!                    \n"  // Store V
+    "lwr             $t1, 4(%[src_uv])             \n"
-    "bgt        1b                             \n"
+    "lwl             $t1, 7(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    : "+r"(src_uv),  // %0
+    "lwr             $t2, 8(%[src_uv])             \n"
-      "+r"(dst_u),   // %1
+    "lwl             $t2, 11(%[src_uv])            \n"  // V5 | U5 | V4 | U4
-      "+r"(dst_v),   // %2
+    "lwr             $t3, 12(%[src_uv])            \n"
-      "+r"(width)    // %3  // Output registers
+    "lwl             $t3, 15(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    :                       // Input registers
+    "lwr             $t5, 16(%[src_uv])            \n"
-    : "memory", "cc", "q0", "q1"  // Clobber List
+    "lwl             $t5, 19(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-  );
+    "lwr             $t6, 20(%[src_uv])            \n"
-}
+    "lwl             $t6, 23(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-#endif  // HAS_SPLITUV_MIPS_DSPR2
+    "lwr             $t7, 24(%[src_uv])            \n"
+    "lwl             $t7, 27(%[src_uv])            \n"  // V13 | U13 | V12 | U12
+    "lwr             $t8, 28(%[src_uv])            \n"
+    "lwl             $t8, 31(%[src_uv])            \n"  // V15 | U15 | V14 | U14
+    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
+    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
+    "addiu           %[src_uv], %[src_uv], 32      \n"
+    "swr             $t9, 0(%[dst_v])              \n"
+    "swl             $t9, 3(%[dst_v])              \n"
+    "swr             $t0, 0(%[dst_u])              \n"
+    "swl             $t0, 3(%[dst_u])              \n"
+    "swr             $t1, 4(%[dst_v])              \n"
+    "swl             $t1, 7(%[dst_v])              \n"
+    "swr             $t2, 4(%[dst_u])              \n"
+    "swl             $t2, 7(%[dst_u])              \n"
+    "swr             $t3, 8(%[dst_v])              \n"
+    "swl             $t3, 11(%[dst_v])             \n"
+    "swr             $t5, 8(%[dst_u])              \n"
+    "swl             $t5, 11(%[dst_u])             \n"
+    "swr             $t6, 12(%[dst_v])             \n"
+    "swl             $t6, 15(%[dst_v])             \n"
+    "swr             $t7, 12(%[dst_u])             \n"
+    "swl             $t7, 15(%[dst_u])             \n"
+    "addiu           %[dst_u], %[dst_u], 16        \n"
+    "bgtz            $t4, 1b                       \n"
+    " addiu          %[dst_v], %[dst_v], 16        \n"
+    "beqz            %[width], 3f                  \n"
+    " nop                                          \n"
+    "b               2f                            \n"
+    " nop                                          \n"
-#ifdef HAS_SPLITUV_MIPS_DSPR2
+    // src and dst are aligned
-// Reads 4 pairs of UV and write even values to dst_u and odd to dst_v
+    "12:                                           \n"
-// Alignment requirement: 4 bytes for pointers, and multiple of 4 pixels.
+    "addiu           $t4, $t4, -1                  \n"
-void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                        int width) {
-  asm volatile (
-    ".set push                                 \n"
-    ".set noreorder                            \n"
-    ".p2align  2                               \n"
-  "1:                                          \n"
    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    "addiu         %[width], %[width], -4      \n"
+    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
-    "addiu         %[src_uv], %[src_uv], 8     \n"
+    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    "precr.qb.ph   $t2, $t1, $t0               \n"  // U3 | U2 | U1 | U0
+    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-    "precrq.qb.ph  $t3, $t1, $t0               \n"  // V3 | V2 | V1 | V0
+    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-    "sw            $t2, 0(%[dst_u])            \n"
+    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
-    "sw            $t3, 0(%[dst_v])            \n"
+    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-    "addiu         %[dst_u], %[dst_u], 4       \n"
-    "bgtz          %[width], 1b                \n"
+    "addiu           %[src_uv], %[src_uv], 32      \n"
-    " addiu        %[dst_v], %[dst_v], 4       \n"
+    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
+    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
+    "sw              $t9, 0(%[dst_v])              \n"
+    "sw              $t0, 0(%[dst_u])              \n"
+    "sw              $t1, 4(%[dst_v])              \n"
+    "sw              $t2, 4(%[dst_u])              \n"
+    "sw              $t3, 8(%[dst_v])              \n"
+    "sw              $t5, 8(%[dst_u])              \n"
+    "sw              $t6, 12(%[dst_v])             \n"
+    "sw              $t7, 12(%[dst_u])             \n"
+    "addiu           %[dst_v], %[dst_v], 16        \n"
+    "bgtz            $t4, 12b                      \n"
+    " addiu          %[dst_u], %[dst_u], 16        \n"
+    "beqz            %[width], 3f                  \n"
+    " nop                                          \n"
+    "2:                                            \n"
+    "lbu             $t0, 0(%[src_uv])             \n"
+    "lbu             $t1, 1(%[src_uv])             \n"
+    "addiu           %[src_uv], %[src_uv], 2       \n"
+    "addiu           %[width], %[width], -1        \n"
+    "sb              $t0, 0(%[dst_u])              \n"
+    "sb              $t1, 0(%[dst_v])              \n"
+    "addiu           %[dst_u], %[dst_u], 1         \n"
+    "bgtz            %[width], 2b                  \n"
+    " addiu          %[dst_v], %[dst_v], 1         \n"
+    "3:                                            \n"
    ".set pop                                      \n"
     : [src_uv] "+r" (src_uv),
       [width] "+r" (width),
       [dst_u] "+r" (dst_u),
       [dst_v] "+r" (dst_v)
     :
-    : "t0", "t1","t2", "t3",
+     : "t0", "t1", "t2", "t3",
+       "t4", "t5", "t6", "t7", "t8", "t9"
  );
 }
 #endif  // HAS_SPLITUV_MIPS_DSPR2
 #endif  // __mips__
 #ifdef __cplusplus

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -909,7 +909,8 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
 }
 #ifdef HAS_ARGBTOARGB4444ROW_NEON
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, int pix) {
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int pix) {
  asm volatile (
    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
    ".p2align  2                               \n"