Commit c22cd5b2 authored by fbarchard@google.com

Mips memcpy moved to row_mips.

BUG=191
TEST=none
Review URL: https://webrtc-codereview.appspot.com/1127005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@580 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c0d9c346
@@ -75,7 +75,6 @@
      'source/convert_from_argb.cc',
      'source/cpu_id.cc',
      'source/format_conversion.cc',
-     'source/memcpy_mips.S',  # TODO(fbarchard): Move into row_mips.cc
      'source/mjpeg_decoder.cc',
      'source/planar_functions.cc',
      'source/rotate.cc',
#if defined (__mips__)
#
# Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
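# The empty .note.GNU-stack section above marks this object as not needing an
# executable stack, so the GNU linker will not force one on.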
.text
.globl memcpy_MIPS;
.align 2;
.type memcpy_MIPS,@function;
.ent memcpy_MIPS,0;
memcpy_MIPS:
.frame $sp,0,$ra
.set noreorder
.set noat
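# o32 calling convention: a0 = dst, a1 = src, a2 = count; v0 carries the
# return value (dst), per the memcpy contract.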
slti $at,$a2,8
bne $at,$zero,last8
move $v0,$a0 # memcpy returns the dst pointer
# Test if the src and dst are word-aligned, or can be made word-aligned
xor $t8,$a1,$a0
andi $t8,$t8,0x3 # t8 is a0/a1 word-displacement
bne $t8,$zero,unaligned
negu $a3,$a0
andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is
  subu  $a2,$a2,$a3 # word-aligned; now a2 is the remaining byte count
lwr $t8,0($a1)
addu $a1,$a1,$a3
swr $t8,0($a0)
addu $a0,$a0,$a3
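# src and dst share the same alignment here (checked by the xor above), so a
# single lwr/swr pair moves the a3 head bytes up to the next word boundary,
# leaving both pointers word-aligned.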
# Now the dst/src are mutually word-aligned with word-aligned addresses
chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks
beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
  subu  $a3,$a2,$t8 # subtract from a2 the remainder
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks
addu $t0,$a0,$a2 # t0 is the "past the end" address
# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
# the "t0-32" address
# This means: for x=128 the last "safe" a0 address is "t0-160"
# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address
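# pref 30 is "prepare for store": it allocates the destination cache line
# without fetching it, so it may only touch lines that will be written in
# full. With 32-byte lines, "pref 30,128(a0)" is safe while a0+128 <= t0-32,
# i.e. a0 <= t0-160, which is exactly the t9 bound above.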
pref 0,0($a1) # bring the first line of src, addr 0
pref 0,32($a1) # bring the second line of src, addr 32
pref 0,64($a1) # bring the third line of src, addr 64
pref 30,32($a0) # safe, as we have at least 64 bytes ahead
# In case the a0 > t9 don't use "pref 30" at all
sgtu $v1,$a0,$t9
bgtz $v1,loop16w # skip "pref 30,64(a0)" for too short arrays
nop
# otherwise, start with using pref30
pref 30,64($a0)
loop16w:
pref 0,96($a1)
lw $t0,0($a1)
bgtz $v1,skip_pref30_96 # skip "pref 30,96(a0)"
lw $t1,4($a1)
pref 30,96($a0) # continue setting up the dest, addr 96
skip_pref30_96:
lw $t2,8($a1)
lw $t3,12($a1)
lw $t4,16($a1)
lw $t5,20($a1)
lw $t6,24($a1)
lw $t7,28($a1)
pref 0,128($a1) # bring the next lines of src, addr 128
sw $t0,0($a0)
sw $t1,4($a0)
sw $t2,8($a0)
sw $t3,12($a0)
sw $t4,16($a0)
sw $t5,20($a0)
sw $t6,24($a0)
sw $t7,28($a0)
lw $t0,32($a1)
bgtz $v1,skip_pref30_128 # skip "pref 30,128(a0)"
lw $t1,36($a1)
pref 30,128($a0) # continue setting up the dest, addr 128
skip_pref30_128:
lw $t2,40($a1)
lw $t3,44($a1)
lw $t4,48($a1)
lw $t5,52($a1)
lw $t6,56($a1)
lw $t7,60($a1)
pref 0, 160($a1) # bring the next lines of src, addr 160
sw $t0,32($a0)
sw $t1,36($a0)
sw $t2,40($a0)
sw $t3,44($a0)
sw $t4,48($a0)
sw $t5,52($a0)
sw $t6,56($a0)
sw $t7,60($a0)
addiu $a0,$a0,64 # adding 64 to dest
sgtu $v1,$a0,$t9
bne $a0,$a3,loop16w
addiu $a1,$a1,64 # adding 64 to src
move $a2,$t8
# Here we have src and dest word-aligned but less than 64-bytes to go
chk8w:
pref 0, 0x0($a1)
andi $t8,$a2,0x1f # is there a 32-byte chunk?
# t8 is the remainder count past 32 bytes
beq $a2,$t8,chk1w # when a2=t8, no 32-byte chunk
nop
lw $t0,0($a1)
lw $t1,4($a1)
lw $t2,8($a1)
lw $t3,12($a1)
lw $t4,16($a1)
lw $t5,20($a1)
lw $t6,24($a1)
lw $t7,28($a1)
addiu $a1,$a1,32
sw $t0,0($a0)
sw $t1,4($a0)
sw $t2,8($a0)
sw $t3,12($a0)
sw $t4,16($a0)
sw $t5,20($a0)
sw $t6,24($a0)
sw $t7,28($a0)
addiu $a0,$a0,32
chk1w:
  andi  $a2,$t8,0x3 # now a2 is the remainder past 1w chunks
beq $a2,$t8,last8
subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks
addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks
# copying in words (4-byte chunks)
wordCopy_loop:
  lw  $t3,0($a1)  # the first t3 may equal t0 ... optimize?
addiu $a1,$a1,4
addiu $a0,$a0,4
bne $a0,$a3,wordCopy_loop
sw $t3,-4($a0)
# For the last (<8) bytes
last8:
blez $a2,leave
addu $a3,$a0,$a2 # a3 is the last dst address
last8loop:
lb $v1,0($a1)
addiu $a1,$a1,1
addiu $a0,$a0,1
bne $a0,$a3,last8loop
sb $v1,-1($a0)
leave:
j $ra
nop
#
# UNALIGNED case
#
unaligned:
# got here with a3="negu a0"
andi $a3,$a3,0x3 # test if the a0 is word aligned
beqz $a3,ua_chk16w
subu $a2,$a2,$a3 # bytes left after initial a3 bytes
lwr $v1,0($a1)
lwl $v1,3($a1)
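# An lwr/lwl pair at offsets 0 and 3 assembles one unaligned word: each
# instruction fills the part of the register on its side of the word
# boundary, whichever endianness is in use.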
addu $a1,$a1,$a3 # a3 may be here 1, 2 or 3
swr $v1,0($a0)
addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1)
ua_chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks
beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
  subu  $a3,$a2,$t8 # subtract from a2 the remainder
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks
addu $t0,$a0,$a2 # t0 is the "past the end" address
subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address
pref 0,0($a1) # bring the first line of src, addr 0
pref 0,32($a1) # bring the second line of src, addr 32
pref 0,64($a1) # bring the third line of src, addr 64
pref 30,32($a0) # safe, as we have at least 64 bytes ahead
# In case the a0 > t9 don't use "pref 30" at all
sgtu $v1,$a0,$t9
bgtz $v1,ua_loop16w # skip "pref 30,64(a0)" for too short arrays
nop
# otherwise, start with using pref30
pref 30,64($a0)
ua_loop16w:
pref 0,96($a1)
lwr $t0,0($a1)
lwl $t0,3($a1)
lwr $t1,4($a1)
bgtz $v1,ua_skip_pref30_96
lwl $t1,7($a1)
pref 30,96($a0) # continue setting up the dest, addr 96
ua_skip_pref30_96:
lwr $t2,8($a1)
lwl $t2,11($a1)
lwr $t3,12($a1)
lwl $t3,15($a1)
lwr $t4,16($a1)
lwl $t4,19($a1)
lwr $t5,20($a1)
lwl $t5,23($a1)
lwr $t6,24($a1)
lwl $t6,27($a1)
lwr $t7,28($a1)
lwl $t7,31($a1)
pref 0,128($a1) # bring the next lines of src, addr 128
sw $t0,0($a0)
sw $t1,4($a0)
sw $t2,8($a0)
sw $t3,12($a0)
sw $t4,16($a0)
sw $t5,20($a0)
sw $t6,24($a0)
sw $t7,28($a0)
lwr $t0,32($a1)
lwl $t0,35($a1)
lwr $t1,36($a1)
bgtz $v1,ua_skip_pref30_128
lwl $t1,39($a1)
pref 30,128($a0) # continue setting up the dest, addr 128
ua_skip_pref30_128:
lwr $t2,40($a1)
lwl $t2,43($a1)
lwr $t3,44($a1)
lwl $t3,47($a1)
lwr $t4,48($a1)
lwl $t4,51($a1)
lwr $t5,52($a1)
lwl $t5,55($a1)
lwr $t6,56($a1)
lwl $t6,59($a1)
lwr $t7,60($a1)
lwl $t7,63($a1)
pref 0, 160($a1) # bring the next lines of src, addr 160
sw $t0,32($a0)
sw $t1,36($a0)
sw $t2,40($a0)
sw $t3,44($a0)
sw $t4,48($a0)
sw $t5,52($a0)
sw $t6,56($a0)
sw $t7,60($a0)
addiu $a0,$a0,64 # adding 64 to dest
sgtu $v1,$a0,$t9
bne $a0,$a3,ua_loop16w
addiu $a1,$a1,64 # adding 64 to src
move $a2,$t8
# Here we have src and dest word-aligned but less than 64-bytes to go
ua_chk8w:
pref 0, 0x0($a1)
andi $t8,$a2,0x1f # is there a 32-byte chunk?
# t8 is the remainder count
beq $a2,$t8,ua_chk1w # when a2=t8, no 32-byte chunk
lwr $t0,0($a1)
lwl $t0,3($a1)
lwr $t1,4($a1)
lwl $t1,7($a1)
lwr $t2,8($a1)
lwl $t2,11($a1)
lwr $t3,12($a1)
lwl $t3,15($a1)
lwr $t4,16($a1)
lwl $t4,19($a1)
lwr $t5,20($a1)
lwl $t5,23($a1)
lwr $t6,24($a1)
lwl $t6,27($a1)
lwr $t7,28($a1)
lwl $t7,31($a1)
addiu $a1,$a1,32
sw $t0,0($a0)
sw $t1,4($a0)
sw $t2,8($a0)
sw $t3,12($a0)
sw $t4,16($a0)
sw $t5,20($a0)
sw $t6,24($a0)
sw $t7,28($a0)
addiu $a0,$a0,32
ua_chk1w:
  andi  $a2,$t8,0x3 # now a2 is the remainder past 1w chunks
beq $a2,$t8,ua_smallCopy
subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks
addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks
# copying in words (4-byte chunks)
ua_wordCopy_loop:
lwr $v1,0($a1)
lwl $v1,3($a1)
addiu $a1,$a1,4
addiu $a0,$a0,4 # note: dst=a0 is word aligned here, see NOTE1
bne $a0,$a3,ua_wordCopy_loop
sw $v1,-4($a0)
# Now less than 4 bytes (value in a2) left to copy
ua_smallCopy:
beqz $a2,leave
addu $a3,$a0,$a2 # a3 is the last dst address
ua_smallCopy_loop:
lb $v1,0($a1)
addiu $a1,$a1,1
addiu $a0,$a0,1
bne $a0,$a3,ua_smallCopy_loop
sb $v1,-1($a0)
j $ra
nop
.set at
.set reorder
.end memcpy_MIPS;
.size memcpy_MIPS,.-memcpy_MIPS
#endif // if defined (__mips__)
@@ -15,14 +15,361 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#ifdef HAS_COPYROW_MIPS
-extern "C" void memcpy_MIPS(uint8* dst, const uint8* src, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
-  memcpy_MIPS(dst, src, count);
  __asm__ __volatile__ (
".set noreorder \n"
".set noat \n"
"slti $at, %[count], 8 \n"
"bne $at ,$zero, $last8 \n"
"xor $t8, %[src], %[dst] \n"
"andi $t8, $t8, 0x3 \n"
"bne $t8, $zero, unaligned \n"
"negu $a3, %[dst] \n"
// make dst/src aligned
"andi $a3, $a3, 0x3 \n"
"beq $a3, $zero, $chk16w \n"
    // word-aligned; now count is the remaining byte count
"subu %[count], %[count], $a3 \n"
"lwr $t8, 0(%[src]) \n"
"addu %[src], %[src], $a3 \n"
"swr $t8, 0(%[dst]) \n"
"addu %[dst], %[dst], $a3 \n"
// Now the dst/src are mutually word-aligned with word-aligned addresses
"$chk16w: \n"
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
// t8 is the byte count after 64-byte chunks
"beq %[count], $t8, chk8w \n"
// There will be at most 1 32-byte chunk after it
"subu $a3, %[count], $t8 \n" // the reminder
// Here a3 counts bytes in 16w chunks
"addu $a3, %[dst], $a3 \n"
// Now a3 is the final dst after 64-byte chunks
"addu $t0, %[dst], %[count] \n"
// t0 is the "past the end" address
    // When in the loop we exercise "pref 30, x(dst)", dst + x should not be
    // past the "t0-32" address
    // This means: for x=128 the last "safe" dst address is "t0-160"
    // Alternatively, for x=64 the last "safe" dst address is "t0-96"
    // we will use "pref 30,128(dst)", so "t0-160" is the limit
    "subu $t9, $t0, 160 \n"
    // t9 is the "last safe pref 30,128(dst)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line of src
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
    // In case dst > t9 don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $loop16w \n"
"nop \n"
// otherwise, start with using pref30
"pref 30, 64(%[dst]) \n"
"$loop16w: \n"
"pref 0, 96(%[src]) \n"
"lw $t0, 0(%[src]) \n"
"bgtz $v1, $skip_pref30_96 \n" // skip
"lw $t1, 4(%[src]) \n"
"pref 30, 96(%[dst]) \n" // continue
"$skip_pref30_96: \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
"lw $t4, 16(%[src]) \n"
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
"pref 0, 128(%[src]) \n"
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"lw $t0, 32(%[src]) \n"
"bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
"lw $t1, 36(%[src]) \n"
"pref 30, 128(%[dst]) \n" // set dest, addr 128
"$skip_pref30_128: \n"
"lw $t2, 40(%[src]) \n"
"lw $t3, 44(%[src]) \n"
"lw $t4, 48(%[src]) \n"
"lw $t5, 52(%[src]) \n"
"lw $t6, 56(%[src]) \n"
"lw $t7, 60(%[src]) \n"
"pref 0, 160(%[src]) \n"
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
"sw $t2, 40(%[dst]) \n"
"sw $t3, 44(%[dst]) \n"
"sw $t4, 48(%[dst]) \n"
"sw $t5, 52(%[dst]) \n"
"sw $t6, 56(%[dst]) \n"
"sw $t7, 60(%[dst]) \n"
"addiu %[dst], %[dst], 64 \n" // adding 64 to dest
"sgtu $v1, %[dst], $t9 \n"
"bne %[dst], $a3, $loop16w \n"
" addiu %[src], %[src], 64 \n" // adding 64 to src
"move %[count], $t8 \n"
// Here we have src and dest word-aligned but less than 64-bytes to go
"chk8w: \n"
"pref 0, 0x0(%[src]) \n"
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
    // t8 is the remainder count past 32 bytes
    "beq %[count], $t8, chk1w \n"
    // when count == t8, no 32-byte chunk
" nop \n"
"lw $t0, 0(%[src]) \n"
"lw $t1, 4(%[src]) \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
"lw $t4, 16(%[src]) \n"
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
"addiu %[src], %[src], 32 \n"
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"addiu %[dst], %[dst], 32 \n"
"chk1w: \n"
"andi %[count], $t8, 0x3 \n"
    // now count is the remainder past 1w chunks
"beq %[count], $t8, $last8 \n"
" subu $a3, $t8, %[count] \n"
// a3 is count of bytes in 1w chunks
"addu $a3, %[dst], $a3 \n"
// now a3 is the dst address past the 1w chunks
// copying in words (4-byte chunks)
"$wordCopy_loop: \n"
"lw $t3, 0(%[src]) \n"
    // the first t3 may equal t0 ... optimize?
    "addiu %[src], %[src], 4 \n"
    "addiu %[dst], %[dst], 4 \n"
    "bne %[dst], $a3, $wordCopy_loop \n"
" sw $t3, -4(%[dst]) \n"
// For the last (<8) bytes
"$last8: \n"
"blez %[count], leave \n"
" addu $a3, %[dst], %[count] \n" // a3 -last dst address
"$last8loop: \n"
"lb $v1, 0(%[src]) \n"
"addiu %[src], %[src], 1 \n"
"addiu %[dst], %[dst], 1 \n"
"bne %[dst], $a3, $last8loop \n"
" sb $v1, -1(%[dst]) \n"
"leave: \n"
" j $ra \n"
" nop \n"
//
// UNALIGNED case
//
"unaligned: \n"
    // got here with a3 = "negu dst"
    "andi $a3, $a3, 0x3 \n" // is dst word aligned?
"beqz $a3, $ua_chk16w \n"
" subu %[count], %[count], $a3 \n"
// bytes left after initial a3 bytes
"lwr $v1, 0(%[src]) \n"
"lwl $v1, 3(%[src]) \n"
"addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
"swr $v1, 0(%[dst]) \n"
"addu %[dst], %[dst], $a3 \n"
// below the dst will be word aligned (NOTE1)
"$ua_chk16w: \n"
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
// t8 is the byte count after 64-byte chunks
"beq %[count], $t8, ua_chk8w \n"
// if a2==t8, no 64-byte chunks
// There will be at most 1 32-byte chunk after it
"subu $a3, %[count], $t8 \n" // the reminder
// Here a3 counts bytes in 16w chunks
"addu $a3, %[dst], $a3 \n"
// Now a3 is the final dst after 64-byte chunks
"addu $t0, %[dst], %[count] \n" // t0 "past the end"
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line addr 32
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
// safe, as we have at least 64 bytes ahead
    // In case dst > t9 don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $ua_loop16w \n"
// skip "pref 30,64(a1)" for too short arrays
" nop \n"
// otherwise, start with using pref30
"pref 30, 64(%[dst]) \n"
"$ua_loop16w: \n"
"pref 0, 96(%[src]) \n"
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"bgtz $v1, $ua_skip_pref30_96 \n"
" lwl $t1, 7(%[src]) \n"
"pref 30, 96(%[dst]) \n"
// continue setting up the dest, addr 96
"$ua_skip_pref30_96: \n"
"lwr $t2, 8(%[src]) \n"
"lwl $t2, 11(%[src]) \n"
"lwr $t3, 12(%[src]) \n"
"lwl $t3, 15(%[src]) \n"
"lwr $t4, 16(%[src]) \n"
"lwl $t4, 19(%[src]) \n"
"lwr $t5, 20(%[src]) \n"
"lwl $t5, 23(%[src]) \n"
"lwr $t6, 24(%[src]) \n"
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
"pref 0, 128(%[src]) \n"
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"lwr $t0, 32(%[src]) \n"
"lwl $t0, 35(%[src]) \n"
"lwr $t1, 36(%[src]) \n"
"bgtz $v1, ua_skip_pref30_128 \n"
" lwl $t1, 39(%[src]) \n"
"pref 30, 128(%[dst]) \n"
// continue setting up the dest, addr 128
"ua_skip_pref30_128: \n"
"lwr $t2, 40(%[src]) \n"
"lwl $t2, 43(%[src]) \n"
"lwr $t3, 44(%[src]) \n"
"lwl $t3, 47(%[src]) \n"
"lwr $t4, 48(%[src]) \n"
"lwl $t4, 51(%[src]) \n"
"lwr $t5, 52(%[src]) \n"
"lwl $t5, 55(%[src]) \n"
"lwr $t6, 56(%[src]) \n"
"lwl $t6, 59(%[src]) \n"
"lwr $t7, 60(%[src]) \n"
"lwl $t7, 63(%[src]) \n"
"pref 0, 160(%[src]) \n"
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
"sw $t2, 40(%[dst]) \n"
"sw $t3, 44(%[dst]) \n"
"sw $t4, 48(%[dst]) \n"
"sw $t5, 52(%[dst]) \n"
"sw $t6, 56(%[dst]) \n"
"sw $t7, 60(%[dst]) \n"
"addiu %[dst],%[dst],64 \n" // adding 64 to dest
"sgtu $v1,%[dst],$t9 \n"
"bne %[dst],$a3,$ua_loop16w \n"
" addiu %[src],%[src],64 \n" // adding 64 to src
"move %[count],$t8 \n"
// Here we have src and dest word-aligned but less than 64-bytes to go
"ua_chk8w: \n"
"pref 0, 0x0(%[src]) \n"
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
    // t8 is the remainder count
"beq %[count], $t8, $ua_chk1w \n"
// when count==t8, no 32-byte chunk
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"lwl $t1, 7(%[src]) \n"
"lwr $t2, 8(%[src]) \n"
"lwl $t2, 11(%[src]) \n"
"lwr $t3, 12(%[src]) \n"
"lwl $t3, 15(%[src]) \n"
"lwr $t4, 16(%[src]) \n"
"lwl $t4, 19(%[src]) \n"
"lwr $t5, 20(%[src]) \n"
"lwl $t5, 23(%[src]) \n"
"lwr $t6, 24(%[src]) \n"
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
"addiu %[src], %[src], 32 \n"
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"addiu %[dst], %[dst], 32 \n"
"$ua_chk1w: \n"
"andi %[count], $t8, 0x3 \n"
    // now count is the remainder past 1w chunks
"beq %[count], $t8, ua_smallCopy \n"
"subu $a3, $t8, %[count] \n"
// a3 is count of bytes in 1w chunks
"addu $a3, %[dst], $a3 \n"
// now a3 is the dst address past the 1w chunks
// copying in words (4-byte chunks)
"$ua_wordCopy_loop: \n"
"lwr $v1, 0(%[src]) \n"
"lwl $v1, 3(%[src]) \n"
"addiu %[src], %[src], 4 \n"
"addiu %[dst], %[dst], 4 \n"
    // note: dst is word aligned here, see NOTE1
"bne %[dst], $a3, $ua_wordCopy_loop \n"
" sw $v1,-4(%[dst]) \n"
// Now less than 4 bytes (value in count) left to copy
"ua_smallCopy: \n"
"beqz %[count], leave \n"
" addu $a3, %[dst], %[count] \n" // a3 = last dst address
"$ua_smallCopy_loop: \n"
"lb $v1, 0(%[src]) \n"
"addiu %[src], %[src], 1 \n"
"addiu %[dst], %[dst], 1 \n"
"bne %[dst],$a3,$ua_smallCopy_loop \n"
" sb $v1, -1(%[dst]) \n"
"j $ra \n"
" nop \n"
".set at \n"
".set reorder \n"
: [dst] "+r" (dst), [src] "+r" (src)
: [count] "r" (count)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
"t8", "t9", "a3", "v1", "at"
);
}
#endif // HAS_COPYROW_MIPS
#endif // __mips__
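For context, a minimal sketch of how a row copier like this is typically
selected at run time. This dispatch is an illustration only: CopyRow_C and
the loop shape follow libyuv's usual per-row pattern but are not part of
this commit.

  // Hypothetical caller: copy a plane row by row, preferring the MIPS path.
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_MIPS)
  CopyRow = CopyRow_MIPS;
#endif
  for (int y = 0; y < height; ++y) {
    CopyRow(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }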
// MIPS DSPR2 functions
#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@@ -88,7 +435,7 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
      [dst_v] "+r" (dst_v)
    :
    : "t0", "t1", "t2", "t3",
      "t4", "t5", "t6", "t7", "t8", "t9"
  );
}
@@ -170,54 +517,54 @@ void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
      [dst_v] "+r" (dst_v)
    :
    : "t0", "t1", "t2", "t3",
      "t4", "t5", "t6", "t7", "t8", "t9"
  );
}
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "srl $t4, %[width], 4 \n" // multiples of 16
    "andi $t5, %[width], 0xf \n"
    "blez $t4, 2f \n"
    " addu %[src], %[src], %[width] \n" // src += width
    "1: \n"
    "lw $t0, -16(%[src]) \n" // |3|2|1|0|
    "lw $t1, -12(%[src]) \n" // |7|6|5|4|
    "lw $t2, -8(%[src]) \n" // |11|10|9|8|
    "lw $t3, -4(%[src]) \n" // |15|14|13|12|
    "wsbh $t0, $t0 \n" // |2|3|0|1|
    "wsbh $t1, $t1 \n" // |6|7|4|5|
    "wsbh $t2, $t2 \n" // |10|11|8|9|
    "wsbh $t3, $t3 \n" // |14|15|12|13|
    "rotr $t0, $t0, 16 \n" // |0|1|2|3|
    "rotr $t1, $t1, 16 \n" // |4|5|6|7|
    "rotr $t2, $t2, 16 \n" // |8|9|10|11|
    "rotr $t3, $t3, 16 \n" // |12|13|14|15|
"addiu %[src], %[src], -16 \n" "addiu %[src], %[src], -16 \n"
"addiu $t4, $t4, -1 \n" "addiu $t4, $t4, -1 \n"
"sw $t3, 0(%[dst]) \n" // |15|14|13|12| "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
"sw $t2, 4(%[dst]) \n" // |11|10|9|8| "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
"sw $t1, 8(%[dst]) \n" // |7|6|5|4| "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
"sw $t0, 12(%[dst]) \n" // |3|2|1|0| "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
"bgtz $t4, 1b \n" "bgtz $t4, 1b \n"
" addiu %[dst], %[dst], 16 \n" " addiu %[dst], %[dst], 16 \n"
"beqz $t5, 3f \n" "beqz $t5, 3f \n"
" nop \n" " nop \n"
"2: \n" "2: \n"
"lbu $t0, -1(%[src]) \n" "lbu $t0, -1(%[src]) \n"
"addiu $t5, $t5, -1 \n" "addiu $t5, $t5, -1 \n"
"addiu %[src], %[src], -1 \n" "addiu %[src], %[src], -1 \n"
"sb $t0, 0(%[dst]) \n" "sb $t0, 0(%[dst]) \n"
"bgez $t5, 2b \n" "bgez $t5, 2b \n"
" addiu %[dst], %[dst], 1 \n" " addiu %[dst], %[dst], 1 \n"
"3: \n" "3: \n"
".set pop \n" ".set pop \n"
: [src] "+r" (src), [dst] "+r" (dst) : [src] "+r" (src), [dst] "+r" (dst)
: [width] "r" (width) : [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4", "t5" : "t0", "t1", "t2", "t3", "t4", "t5"
@@ -229,80 +576,80 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  int x = 0;
  int y = 0;
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "addu $t4, %[width], %[width] \n"
    "srl %[x], %[width], 4 \n"
    "andi %[y], %[width], 0xf \n"
    "blez %[x], 2f \n"
    " addu %[src_uv], %[src_uv], $t4 \n"
    "1: \n"
    "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
    "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
    "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
    "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
    "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
    "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
    "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
    "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
    "rotr $t0, $t0, 16 \n" // |1|0|3|2|
    "rotr $t1, $t1, 16 \n" // |5|4|7|6|
    "rotr $t2, $t2, 16 \n" // |9|8|11|10|
    "rotr $t3, $t3, 16 \n" // |13|12|15|14|
    "rotr $t4, $t4, 16 \n" // |17|16|19|18|
    "rotr $t6, $t6, 16 \n" // |21|20|23|22|
    "rotr $t7, $t7, 16 \n" // |25|24|27|26|
    "rotr $t8, $t8, 16 \n" // |29|28|31|30|
    "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
    "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
    "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
    "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
    "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
    "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
    "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
    "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
    "addiu %[src_uv], %[src_uv], -32 \n"
    "addiu %[x], %[x], -1 \n"
    "swr $t4, 0(%[dst_u]) \n"
    "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
    "swr $t6, 0(%[dst_v]) \n"
    "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
    "swr $t2, 4(%[dst_u]) \n"
    "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
    "swr $t3, 4(%[dst_v]) \n"
    "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
    "swr $t0, 8(%[dst_u]) \n"
    "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
    "swr $t1, 8(%[dst_v]) \n"
    "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
    "swr $t9, 12(%[dst_u]) \n"
    "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
    "swr $t5, 12(%[dst_v]) \n"
    "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
    "addiu %[dst_v], %[dst_v], 16 \n"
    "bgtz %[x], 1b \n"
    " addiu %[dst_u], %[dst_u], 16 \n"
    "beqz %[y], 3f \n"
    " nop \n"
    "b 2f \n"
    " nop \n"
    "2: \n"
    "lbu $t0, -2(%[src_uv]) \n"
    "lbu $t1, -1(%[src_uv]) \n"
    "addiu %[src_uv], %[src_uv], -2 \n"
    "addiu %[y], %[y], -1 \n"
    "sb $t0, 0(%[dst_u]) \n"
    "sb $t1, 0(%[dst_v]) \n"
    "addiu %[dst_u], %[dst_u], 1 \n"
    "bgtz %[y], 2b \n"
    " addiu %[dst_v], %[dst_v], 1 \n"
    "3: \n"
    ".set pop \n"
    : [src_uv] "+r" (src_uv),
      [dst_u] "+r" (dst_u),
      [dst_v] "+r" (dst_v),
@@ -310,7 +657,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
      [y] "+r" (y)
    : [width] "r" (width)
    : "t0", "t1", "t2", "t3", "t4",
      "t5", "t6", "t7", "t8", "t9" // t6 is written above, so it is clobbered
  );
}
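// A note on the packing above: precr.qb.ph keeps the low byte of each
// halfword of its two sources and precrq.qb.ph the high byte, so applied to
// the rotated words they split the interleaved U/V bytes into separate
// planes while keeping the mirrored order.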
@@ -322,63 +669,63 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define I422ToTransientMipsRGB \
  "lw $t0, 0(%[y_buf]) \n" \
  "lhu $t1, 0(%[u_buf]) \n" \
  "lhu $t2, 0(%[v_buf]) \n" \
  "preceu.ph.qbr $t1, $t1 \n" \
  "preceu.ph.qbr $t2, $t2 \n" \
  "preceu.ph.qbra $t3, $t0 \n" \
  "preceu.ph.qbla $t0, $t0 \n" \
  "subu.ph $t1, $t1, $s5 \n" \
  "subu.ph $t2, $t2, $s5 \n" \
  "subu.ph $t3, $t3, $s4 \n" \
  "subu.ph $t0, $t0, $s4 \n" \
  "mul.ph $t3, $t3, $s0 \n" \
  "mul.ph $t0, $t0, $s0 \n" \
  "shll.ph $t4, $t1, 0x7 \n" \
  "subu.ph $t4, $t4, $t1 \n" \
  "mul.ph $t6, $t1, $s1 \n" \
  "mul.ph $t1, $t2, $s2 \n" \
  "addq_s.ph $t5, $t4, $t3 \n" \
  "addq_s.ph $t4, $t4, $t0 \n" \
  "shra.ph $t5, $t5, 6 \n" \
  "shra.ph $t4, $t4, 6 \n" \
  "addiu %[u_buf], 2 \n" \
  "addiu %[v_buf], 2 \n" \
  "addu.ph $t6, $t6, $t1 \n" \
  "mul.ph $t1, $t2, $s3 \n" \
  "addu.ph $t9, $t6, $t3 \n" \
  "addu.ph $t8, $t6, $t0 \n" \
  "shra.ph $t9, $t9, 6 \n" \
  "shra.ph $t8, $t8, 6 \n" \
  "addu.ph $t2, $t1, $t3 \n" \
  "addu.ph $t1, $t1, $t0 \n" \
  "shra.ph $t2, $t2, 6 \n" \
  "shra.ph $t1, $t1, 6 \n" \
  "subu.ph $t5, $t5, $s5 \n" \
  "subu.ph $t4, $t4, $s5 \n" \
  "subu.ph $t9, $t9, $s5 \n" \
  "subu.ph $t8, $t8, $s5 \n" \
  "subu.ph $t2, $t2, $s5 \n" \
  "subu.ph $t1, $t1, $s5 \n" \
  "shll_s.ph $t5, $t5, 8 \n" \
  "shll_s.ph $t4, $t4, 8 \n" \
  "shll_s.ph $t9, $t9, 8 \n" \
  "shll_s.ph $t8, $t8, 8 \n" \
  "shll_s.ph $t2, $t2, 8 \n" \
  "shll_s.ph $t1, $t1, 8 \n" \
  "shra.ph $t5, $t5, 8 \n" \
  "shra.ph $t4, $t4, 8 \n" \
  "shra.ph $t9, $t9, 8 \n" \
  "shra.ph $t8, $t8, 8 \n" \
  "shra.ph $t2, $t2, 8 \n" \
  "shra.ph $t1, $t1, 8 \n" \
  "addu.ph $t5, $t5, $s5 \n" \
  "addu.ph $t4, $t4, $s5 \n" \
  "addu.ph $t9, $t9, $s5 \n" \
  "addu.ph $t8, $t8, $s5 \n" \
  "addu.ph $t2, $t2, $s5 \n" \
  "addu.ph $t1, $t1, $s5 \n"
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
                              const uint8* u_buf,
@@ -386,47 +733,47 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "beqz %[width], 2f \n"
    " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
    "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
    "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
    "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
    "repl.ph $s4, 16 \n" // |0|16|0|16|
    "repl.ph $s5, 128 \n" // |128|128| // clipping
    "lui $s6, 0xff00 \n"
    "ori $s6, 0xff00 \n" // |ff|00|ff|00|
    "1: \n"
    I422ToTransientMipsRGB
    // Arranging into argb format
    "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
    "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
    "addiu %[width], -4 \n"
    "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
    "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
    "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
    "addiu %[y_buf], 4 \n"
    "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
    "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
    "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
    "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
    "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
    "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
    "sll $t9, $t9, 16 \n"
    "sll $t8, $t8, 16 \n"
    "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
    "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
    // Store results.
    "sw $t2, 0(%[rgb_buf]) \n"
    "sw $t0, 4(%[rgb_buf]) \n"
    "sw $t1, 8(%[rgb_buf]) \n"
    "sw $t3, 12(%[rgb_buf]) \n"
    "bnez %[width], 1b \n"
    " addiu %[rgb_buf], 16 \n"
    "2: \n"
    ".set pop \n"
    : [y_buf] "+r" (y_buf),
      [u_buf] "+r" (u_buf),
      [v_buf] "+r" (v_buf),
@@ -434,9 +781,9 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
      [rgb_buf] "+r" (rgb_buf)
    :
    : "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9",
      "s0", "s1", "s2", "s3",
      "s4", "s5", "s6"
  );
}
@@ -446,47 +793,47 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm__ __volatile__ (
    ".set push \n\t"
    ".set noreorder \n\t"
    "beqz %[width], 2f \n\t"
    " repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74|
    "repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25|
    "repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52|
    "repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102|
    "repl.ph $s4, 16 \n\t" // |0|16|0|16|
    "repl.ph $s5, 128 \n\t" // |128|128|
    "lui $s6, 0xff00 \n\t"
    "ori $s6, 0xff00 \n\t" // |ff|00|ff|00|
    "1: \n"
    I422ToTransientMipsRGB
    // Arranging into abgr format
    "precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1|
    "precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0|
    "precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0|
    "precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0|
    "precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0|
    "addiu %[width], -4 \n\t"
    "addiu %[y_buf], 4 \n\t"
    "preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0|
    "preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0|
    "or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0|
    "or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0|
    "precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1|
    "precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1|
    "sll $t9, $t9, 16 \n\t"
    "sll $t8, $t8, 16 \n\t"
    "packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0|
    "packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0|
    // Store results.
    "sw $t2, 0(%[rgb_buf]) \n\t"
    "sw $t0, 4(%[rgb_buf]) \n\t"
    "sw $t1, 8(%[rgb_buf]) \n\t"
    "sw $t3, 12(%[rgb_buf]) \n\t"
    "bnez %[width], 1b \n\t"
    " addiu %[rgb_buf], 16 \n\t"
    "2: \n\t"
    ".set pop \n\t"
    : [y_buf] "+r" (y_buf),
      [u_buf] "+r" (u_buf),
      [v_buf] "+r" (v_buf),
@@ -494,9 +841,9 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
      [rgb_buf] "+r" (rgb_buf)
    :
    : "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9",
      "s0", "s1", "s2", "s3",
      "s4", "s5", "s6"
  );
}
@@ -506,49 +853,49 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "beqz %[width], 2f \n"
    " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
    "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
    "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
    "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
    "repl.ph $s4, 16 \n" // |0|16|0|16|
    "repl.ph $s5, 128 \n" // |128|128|
    "lui $s6, 0xff \n"
    "ori $s6, 0xff \n" // |00|ff|00|ff|
    "1: \n"
    I422ToTransientMipsRGB
    // Arranging into bgra format
    "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
    "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
    "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
    "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
    "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
    "addiu %[width], -4 \n"
    "addiu %[y_buf], 4 \n"
    "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
    "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
    "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
    "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
    "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
    "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
    "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
    "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
    "sll $t1, $t1, 16 \n"
    "sll $t2, $t2, 16 \n"
    "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
    "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
    // Store results.
    "sw $t2, 0(%[rgb_buf]) \n"
    "sw $t0, 4(%[rgb_buf]) \n"
    "sw $t1, 8(%[rgb_buf]) \n"
    "sw $t3, 12(%[rgb_buf]) \n"
    "bnez %[width], 1b \n"
    " addiu %[rgb_buf], 16 \n"
    "2: \n"
    ".set pop \n"
    : [y_buf] "+r" (y_buf),
      [u_buf] "+r" (u_buf),
      [v_buf] "+r" (v_buf),
@@ -556,9 +903,9 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
      [rgb_buf] "+r" (rgb_buf)
    :
    : "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9",
      "s0", "s1", "s2", "s3",
      "s4", "s5", "s6"
  );
}
#endif // __mips_dsp_rev >= 2