Mips memcpy moved to row_mips.

BUG=191 TEST=none Review URL: https://webrtc-codereview.appspot.com/1127005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@580 16f28f9a-4ce2-e073-06de-1de4eb20be90

Mips memcpy moved to row_mips.
BUG=191 TEST=none Review URL: https://webrtc-codereview.appspot.com/1127005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@580 16f28f9a-4ce2-e073-06de-1de4eb20be90
c22cd5b2 · fbarchard@google.com · c0d9c346 · c22cd5b2 · c22cd5b2 · c22cd5b2
Commit c22cd5b2 authored Feb 23, 2013 by fbarchard@google.com
Show whitespace changes
Inline Side-by-side

Showing with 351 additions and 362 deletions

libyuv.gyp libyuv.gyp +0 -1

memcpy_mips.S source/memcpy_mips.S +0 -357

row_mips.cc source/row_mips.cc +351 -4

No files found.
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -75,7 +75,6 @@
        'source/convert_from_argb.cc',
        'source/cpu_id.cc',
        'source/format_conversion.cc',
-        'source/memcpy_mips.S',  # TODO(fbarchard): Move into row_mips.cc
        'source/mjpeg_decoder.cc',
        'source/planar_functions.cc',
        'source/rotate.cc',

--- a/source/memcpy_mips.S
+++ b/source/memcpy_mips.S
-#if defined (__mips__)
-#
-#  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
-#
-#  Use of this source code is governed by a BSD-style license
-#  that can be found in the LICENSE file in the root of the source
-#  tree. An additional intellectual property rights grant can be found
-#  in the file PATENTS. All contributing project authors may
-#  be found in the AUTHORS file in the root of the source tree.
-#
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-  .text
-
-  .globl  memcpy_MIPS;
-  .align  2;
-  .type   memcpy_MIPS,@function;
-  .ent    memcpy_MIPS,0;
-memcpy_MIPS:  # in: a0 = dst, a1 = src, a2 = byte count; out: v0 = dst
-  .frame  $sp,0,$ra
-  .set    noreorder
-  .set    noat
-
-  slti    $at,$a2,8
-  bne     $at,$zero,last8
-   move   $v0,$a0 # memcpy returns the dst pointer
-
-# Test if the src and dst are word-aligned, or can be made word-aligned
-  xor     $t8,$a1,$a0
-  andi    $t8,$t8,0x3   # t8 is a0/a1 word-displacement
-
-  bne     $t8,$zero,unaligned
-  negu    $a3,$a0 # delay slot: a3 = -dst, consumed on both paths
-
-  andi    $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
-  beq     $a3,$zero,chk16w  # when a3=0 then the dst (a0) is
-   subu   $a2,$a2,$a3 # word-aligned; a2 is now the remaining byte count
-
-  lwr     $t8,0($a1) # NOTE(review): lwr/swr head copy looks little-endian only; confirm
-  addu    $a1,$a1,$a3
-  swr     $t8,0($a0)
-  addu    $a0,$a0,$a3
-
-# Now the dst/src are mutually word-aligned with word-aligned addresses
-chk16w:
-  andi    $t8,$a2,0x3f  # any whole 64-byte chunks?
-                        # t8 is the byte count after 64-byte chunks
-  beq     $a2,$t8,chk8w # if a2==t8, no 64-byte chunks
-                        # There will be at most 1 32-byte chunk after it
-   subu   $a3,$a2,$t8 # subtract from a2 the remainder
-                      # Here a3 counts bytes in 16w chunks
-  addu    $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks
-  addu    $t0,$a0,$a2 # t0 is the "past the end" address
-
-# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
-# the "t0-32" address
-# This means: for x=128 the last "safe" a0 address is "t0-160"
-# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
-# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
-  subu    $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address
-
-  pref    0,0($a1)    # bring the first line of src, addr 0
-  pref    0,32($a1) # bring the second line of src, addr 32
-  pref    0,64($a1) # bring the third line of src, addr 64
-  pref    30,32($a0)  # safe, as we have at least 64 bytes ahead
-# In case the a0 > t9 don't use "pref 30" at all
-  sgtu    $v1,$a0,$t9
-  bgtz    $v1,loop16w # skip "pref 30,64(a0)" for too short arrays
-   nop
-# otherwise, start with using pref30
-  pref    30,64($a0)
-loop16w:
-  pref    0,96($a1)
-  lw      $t0,0($a1)
-  bgtz    $v1,skip_pref30_96  # skip "pref 30,96(a0)"
-   lw     $t1,4($a1)
-  pref    30,96($a0)   # continue setting up the dest, addr 96
-skip_pref30_96:
-  lw      $t2,8($a1)
-  lw      $t3,12($a1)
-  lw      $t4,16($a1)
-  lw      $t5,20($a1)
-  lw      $t6,24($a1)
-  lw      $t7,28($a1)
-  pref    0,128($a1)    # bring the next lines of src, addr 128
-
-  sw      $t0,0($a0)
-  sw      $t1,4($a0)
-  sw      $t2,8($a0)
-  sw      $t3,12($a0)
-  sw      $t4,16($a0)
-  sw      $t5,20($a0)
-  sw      $t6,24($a0)
-  sw      $t7,28($a0)
-
-  lw      $t0,32($a1)
-  bgtz    $v1,skip_pref30_128 # skip "pref 30,128(a0)"
-  lw      $t1,36($a1) # (branch delay slot)
-  pref    30,128($a0)   # continue setting up the dest, addr 128
-skip_pref30_128:
-  lw      $t2,40($a1)
-  lw      $t3,44($a1)
-  lw      $t4,48($a1)
-  lw      $t5,52($a1)
-  lw      $t6,56($a1)
-  lw      $t7,60($a1)
-  pref    0, 160($a1)    # bring the next lines of src, addr 160
-
-  sw      $t0,32($a0)
-  sw      $t1,36($a0)
-  sw      $t2,40($a0)
-  sw      $t3,44($a0)
-  sw      $t4,48($a0)
-  sw      $t5,52($a0)
-  sw      $t6,56($a0)
-  sw      $t7,60($a0)
-
-  addiu   $a0,$a0,64  # adding 64 to dest
-  sgtu    $v1,$a0,$t9
-  bne     $a0,$a3,loop16w
-   addiu  $a1,$a1,64  # adding 64 to src
-  move    $a2,$t8
-
-# Here we have src and dest word-aligned but less than 64-bytes to go
-
-chk8w:
-  pref 0, 0x0($a1)
-  andi    $t8,$a2,0x1f  # is there a 32-byte chunk?
-                        # t8 is the remainder count past 32 bytes
-  beq     $a2,$t8,chk1w # when a2=t8, no 32-byte chunk
-   nop
-
-  lw      $t0,0($a1)
-  lw      $t1,4($a1)
-  lw      $t2,8($a1)
-  lw      $t3,12($a1)
-  lw      $t4,16($a1)
-  lw      $t5,20($a1)
-  lw      $t6,24($a1)
-  lw      $t7,28($a1)
-  addiu   $a1,$a1,32
-
-  sw      $t0,0($a0)
-  sw      $t1,4($a0)
-  sw      $t2,8($a0)
-  sw      $t3,12($a0)
-  sw      $t4,16($a0)
-  sw      $t5,20($a0)
-  sw      $t6,24($a0)
-  sw      $t7,28($a0)
-  addiu   $a0,$a0,32
-
-chk1w:
-  andi    $a2,$t8,0x3 # now a2 is the remainder past 1w chunks
-  beq     $a2,$t8,last8
-   subu   $a3,$t8,$a2 # a3 is count of bytes in 1w chunks
-  addu    $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks
-
-# copying in words (4-byte chunks)
-wordCopy_loop:
-  lw      $t3,0($a1)  # the first t3 may be equal t0 ... optimize?
-  addiu   $a1,$a1,4
-  addiu   $a0,$a0,4
-  bne     $a0,$a3,wordCopy_loop
-   sw     $t3,-4($a0)
-
-# For the last (<8) bytes
-last8:
-  blez    $a2,leave
-   addu   $a3,$a0,$a2 # a3 is the last dst address
-last8loop:
-  lb      $v1,0($a1)
-  addiu   $a1,$a1,1
-  addiu   $a0,$a0,1
-  bne     $a0,$a3,last8loop
-   sb     $v1,-1($a0)
-
-leave:
-  j       $ra
-   nop
-
-#
-# UNALIGNED case
-#
-
-unaligned:
-  # got here with a3="negu a0"
-  andi    $a3,$a3,0x3 # test if the a0 is word aligned
-  beqz    $a3,ua_chk16w
-   subu   $a2,$a2,$a3 # bytes left after initial a3 bytes
-
-  lwr     $v1,0($a1)
-  lwl     $v1,3($a1)
-  addu    $a1,$a1,$a3 # a3 may be here 1, 2 or 3
-  swr     $v1,0($a0)
-  addu    $a0,$a0,$a3 # below the dst will be word aligned (NOTE1)
-
-ua_chk16w:
-  andi    $t8,$a2,0x3f  # any whole 64-byte chunks?
-                        # t8 is the byte count after 64-byte chunks
-  beq     $a2,$t8,ua_chk8w  # if a2==t8, no 64-byte chunks
-  # There will be at most 1 32-byte chunk after it
-  subu    $a3,$a2,$t8 # delay slot: subtract from a2 the remainder
-                      # Here a3 counts bytes in 16w chunks
-  addu    $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks
-  addu    $t0,$a0,$a2 # t0 is the "past the end" address
-  subu    $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address
-  pref    0,0($a1)    # bring the first line of src, addr 0
-  pref    0,32($a1) # bring the second line of src, addr 32
-  pref    0,64($a1) # bring the third line of src, addr 64
-  pref    30,32($a0)  # safe, as we have at least 64 bytes ahead
-# In case the a0 > t9 don't use "pref 30" at all
-  sgtu    $v1,$a0,$t9
-  bgtz    $v1,ua_loop16w  # skip "pref 30,64(a0)" for too short arrays
-   nop
-# otherwise, start with using pref30
-  pref    30,64($a0)
-ua_loop16w:
-  pref    0,96($a1)
-  lwr     $t0,0($a1)
-  lwl     $t0,3($a1)
-  lwr     $t1,4($a1)
-  bgtz    $v1,ua_skip_pref30_96
-   lwl    $t1,7($a1)
-  pref    30,96($a0)   # continue setting up the dest, addr 96
-ua_skip_pref30_96:
-  lwr     $t2,8($a1)
-  lwl     $t2,11($a1)
-  lwr     $t3,12($a1)
-  lwl     $t3,15($a1)
-  lwr     $t4,16($a1)
-  lwl     $t4,19($a1)
-  lwr     $t5,20($a1)
-  lwl     $t5,23($a1)
-  lwr     $t6,24($a1)
-  lwl     $t6,27($a1)
-  lwr     $t7,28($a1)
-  lwl     $t7,31($a1)
-  pref    0,128($a1)    # bring the next lines of src, addr 128
-
-  sw      $t0,0($a0)
-  sw      $t1,4($a0)
-  sw      $t2,8($a0)
-  sw      $t3,12($a0)
-  sw      $t4,16($a0)
-  sw      $t5,20($a0)
-  sw      $t6,24($a0)
-  sw      $t7,28($a0)
-
-  lwr     $t0,32($a1)
-  lwl     $t0,35($a1)
-  lwr     $t1,36($a1)
-  bgtz    $v1,ua_skip_pref30_128
-   lwl    $t1,39($a1)
-  pref    30,128($a0)   # continue setting up the dest, addr 128
-ua_skip_pref30_128:
-  lwr     $t2,40($a1)
-  lwl     $t2,43($a1)
-  lwr     $t3,44($a1)
-  lwl     $t3,47($a1)
-  lwr     $t4,48($a1)
-  lwl     $t4,51($a1)
-  lwr     $t5,52($a1)
-  lwl     $t5,55($a1)
-  lwr     $t6,56($a1)
-  lwl     $t6,59($a1)
-  lwr     $t7,60($a1)
-  lwl     $t7,63($a1)
-  pref    0, 160($a1)    # bring the next lines of src, addr 160
-
-  sw      $t0,32($a0)
-  sw      $t1,36($a0)
-  sw      $t2,40($a0)
-  sw      $t3,44($a0)
-  sw      $t4,48($a0)
-  sw      $t5,52($a0)
-  sw      $t6,56($a0)
-  sw      $t7,60($a0)
-
-  addiu   $a0,$a0,64  # adding 64 to dest
-  sgtu    $v1,$a0,$t9
-  bne     $a0,$a3,ua_loop16w
-   addiu  $a1,$a1,64  # adding 64 to src
-  move    $a2,$t8
-
-# Here dst is word-aligned (src is not) and fewer than 64 bytes remain
-
-ua_chk8w:
-  pref    0, 0x0($a1)
-  andi    $t8,$a2,0x1f  # is there a 32-byte chunk?
-                        # t8 is the remainder count
-  beq     $a2,$t8,ua_chk1w  # when a2=t8, no 32-byte chunk
-
-   lwr    $t0,0($a1) # delay slot; t0 is not read if the branch is taken
-  lwl     $t0,3($a1)
-  lwr     $t1,4($a1)
-  lwl     $t1,7($a1)
-  lwr     $t2,8($a1)
-  lwl     $t2,11($a1)
-  lwr     $t3,12($a1)
-  lwl     $t3,15($a1)
-  lwr     $t4,16($a1)
-  lwl     $t4,19($a1)
-  lwr     $t5,20($a1)
-  lwl     $t5,23($a1)
-  lwr     $t6,24($a1)
-  lwl     $t6,27($a1)
-  lwr     $t7,28($a1)
-  lwl     $t7,31($a1)
-  addiu   $a1,$a1,32
-
-  sw      $t0,0($a0)
-  sw      $t1,4($a0)
-  sw      $t2,8($a0)
-  sw      $t3,12($a0)
-  sw      $t4,16($a0)
-  sw      $t5,20($a0)
-  sw      $t6,24($a0)
-  sw      $t7,28($a0)
-  addiu   $a0,$a0,32
-
-ua_chk1w:
-  andi    $a2,$t8,0x3 # now a2 is the remainder past 1w chunks
-  beq     $a2,$t8,ua_smallCopy
-  subu    $a3,$t8,$a2 # delay slot: a3 is count of bytes in 1w chunks
-  addu    $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks
-
-# copying in words (4-byte chunks)
-ua_wordCopy_loop:
-  lwr     $v1,0($a1)
-  lwl     $v1,3($a1)
-  addiu   $a1,$a1,4
-  addiu   $a0,$a0,4   # note: dst=a0 is word aligned here, see NOTE1
-  bne     $a0,$a3,ua_wordCopy_loop
-   sw     $v1,-4($a0)
-
-# Now less than 4 bytes (value in a2) left to copy
-ua_smallCopy:
-  beqz    $a2,leave
-   addu   $a3,$a0,$a2 # a3 is the last dst address
-ua_smallCopy_loop:
-  lb      $v1,0($a1)
-  addiu   $a1,$a1,1
-  addiu   $a0,$a0,1
-  bne     $a0,$a3,ua_smallCopy_loop
-   sb     $v1,-1($a0)
-
-  j       $ra
-   nop
-
-  .set    at
-  .set    reorder
-  .end    memcpy_MIPS;
-  .size   memcpy_MIPS,.-memcpy_MIPS
-
-#endif // if defined (__mips__)
--- a/source/row_mips.cc
+++ b/source/row_mips.cc
@@ -15,14 +15,361 @@ namespace libyuv {
 extern "C" {
 #endif

-#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
 #ifdef HAS_COPYROW_MIPS
-extern "C" void  memcpy_MIPS(uint8* dst, const uint8* src, int count);
 void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
-  memcpy_MIPS(dst, src, count);
+  __asm__ __volatile__ (  // memcpy-style: copy count bytes from src to dst
+    ".set      noreorder                         \n"  // manual delay slots
+    ".set      noat                              \n"
+    "slti      $at, %[count], 8                  \n"  // fewer than 8 bytes?
+    "bne       $at, $zero, $last8                \n"
+    "xor       $t8, %[src], %[dst]               \n"  // (delay slot)
+    "andi      $t8, $t8, 0x3                     \n"
+    // t8 != 0: src and dst differ in their low two address bits
+    "bne       $t8, $zero, $unaligned            \n"
+    "negu      $a3, %[dst]                       \n"  // (delay slot)
+    // make dst/src aligned
+    "andi      $a3, $a3, 0x3                     \n"
+    "beq       $a3, $zero, $chk16w               \n"
+    // delay slot: count becomes the bytes left after the alignment head
+    "subu      %[count], %[count], $a3           \n"
+    // NOTE(review): lwr/swr head copy looks little-endian only -- confirm
+    "lwr       $t8, 0(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"
+    "swr       $t8, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"

+    // Now the dst/src are mutually word-aligned with word-aligned addresses
+    "$chk16w:                                    \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, $chk8w             \n"
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // (delay slot)
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"
+    // t0 is the "past the end" address

+    // When in the loop we exercise "pref 30,x(dst)", dst+x should not be
+    // past the "t0-32" address
+    // This means: for x=128 the last "safe" dst address is "t0-160"
+    // Alternatively, for x=64 the last "safe" dst address is "t0-96"
+    // we will use "pref 30,128(dst)", so "t0-160" is the limit
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(dst)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line of src
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // In case dst > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $loop16w                     \n"
+    "nop                                         \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$loop16w:                                   \n"
+    "pref      0, 96(%[src])                     \n"
+    "lw        $t0, 0(%[src])                    \n"
+    "bgtz      $v1, $skip_pref30_96              \n"  // skip
+    "lw        $t1, 4(%[src])                    \n"  // (delay slot)
+    "pref      30, 96(%[dst])                    \n"  // continue
+    "$skip_pref30_96:                            \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    //  bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lw        $t0, 32(%[src])                   \n"
+    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128
+    "lw        $t1, 36(%[src])                   \n"  // (delay slot)
+    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
+    "$skip_pref30_128:                           \n"
+    "lw        $t2, 40(%[src])                   \n"
+    "lw        $t3, 44(%[src])                   \n"
+    "lw        $t4, 48(%[src])                   \n"
+    "lw        $t5, 52(%[src])                   \n"
+    "lw        $t6, 56(%[src])                   \n"
+    "lw        $t7, 60(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"

+    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bne       %[dst], $a3, $loop16w             \n"
+    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
+    "move      %[count], $t8                     \n"

+    // Here we have src and dest word-aligned but less than 64-bytes to go

+    "$chk8w:                                     \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // t8 is the remainder count past 32 bytes
+    "beq       %[count], $t8, $chk1w             \n"
+    // count=t8,no 32-byte chunk
+    " nop                                        \n"

+    "lw        $t0, 0(%[src])                    \n"
+    "lw        $t1, 4(%[src])                    \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"

+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"

+    "$chk1w:                                     \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the remainder past 1w chunks
+    "beq       %[count], $t8, $last8             \n"
+    " subu     $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+    // copying in words (4-byte chunks)
+    "$wordCopy_loop:                             \n"
+    "lw        $t3, 0(%[src])                    \n"
+    // the first t3 may be equal t0 ... optimize?
+    "addiu     %[src], %[src],4                  \n"
+    "addiu     %[dst], %[dst],4                  \n"
+    "bne       %[dst], $a3,$wordCopy_loop        \n"
+    " sw       $t3, -4(%[dst])                   \n"

+    // For the last (<8) bytes
+    "$last8:                                     \n"
+    "blez      %[count], $leave                  \n"
+    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
+    "$last8loop:                                 \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst], $a3, $last8loop           \n"
+    " sb       $v1, -1(%[dst])                   \n"

+    "$leave:                                     \n"
+    "  b       $done                             \n"  // exit via end of asm
+    "  nop                                       \n"

+    //
+    // UNALIGNED case
+    //

+    "$unaligned:                                 \n"
+    // got here with a3="negu dst"
+    "andi      $a3, $a3, 0x3                     \n"  // dst word aligned?
+    "beqz      $a3, $ua_chk16w                   \n"
+    " subu     %[count], %[count], $a3           \n"
+    // bytes left after initial a3 bytes
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
+    "swr       $v1, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+    // below the dst will be word aligned (NOTE1)
+    "$ua_chk16w:                                 \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, $ua_chk8w          \n"
+    // if count==t8, no 64-byte chunks
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // (delay slot)
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(dst)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line  addr 32
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // safe, as we have at least 64 bytes ahead
+    // In case dst > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $ua_loop16w                  \n"
+    // skip "pref 30,64(dst)" for too short arrays
+    " nop                                        \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$ua_loop16w:                                \n"
+    "pref      0, 96(%[src])                     \n"
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "bgtz      $v1, $ua_skip_pref30_96           \n"
+    " lwl      $t1, 7(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"
+    // continue setting up the dest, addr 96
+    "$ua_skip_pref30_96:                         \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    // bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lwr       $t0, 32(%[src])                   \n"
+    "lwl       $t0, 35(%[src])                   \n"
+    "lwr       $t1, 36(%[src])                   \n"
+    "bgtz      $v1, $ua_skip_pref30_128          \n"
+    " lwl      $t1, 39(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"
+    // continue setting up the dest, addr 128
+    "$ua_skip_pref30_128:                        \n"

+    "lwr       $t2, 40(%[src])                   \n"
+    "lwl       $t2, 43(%[src])                   \n"
+    "lwr       $t3, 44(%[src])                   \n"
+    "lwl       $t3, 47(%[src])                   \n"
+    "lwr       $t4, 48(%[src])                   \n"
+    "lwl       $t4, 51(%[src])                   \n"
+    "lwr       $t5, 52(%[src])                   \n"
+    "lwl       $t5, 55(%[src])                   \n"
+    "lwr       $t6, 56(%[src])                   \n"
+    "lwl       $t6, 59(%[src])                   \n"
+    "lwr       $t7, 60(%[src])                   \n"
+    "lwl       $t7, 63(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"

+    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
+    "sgtu      $v1,%[dst],$t9                    \n"
+    "bne       %[dst],$a3,$ua_loop16w            \n"
+    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
+    "move      %[count],$t8                      \n"

+    // Here dst is word-aligned (src is not) and fewer than 64 bytes remain

+    "$ua_chk8w:                                  \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // t8 is the remainder count
+    "beq       %[count], $t8, $ua_chk1w          \n"
+    // when count==t8, no 32-byte chunk
+    // the first lwr below fills the delay slot; t0 is not read if taken
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "lwl       $t1, 7(%[src])                    \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"

+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"

+    "$ua_chk1w:                                  \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the remainder past 1w chunks
+    "beq       %[count], $t8, $ua_smallCopy      \n"
+    "subu      $a3, $t8, %[count]                \n"  // (delay slot)
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks

+    // copying in words (4-byte chunks)
+    "$ua_wordCopy_loop:                          \n"
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addiu     %[src], %[src], 4                 \n"
+    "addiu     %[dst], %[dst], 4                 \n"
+    // note: dst is word aligned here, see NOTE1
+    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
+    " sw       $v1,-4(%[dst])                    \n"

+    // Now less than 4 bytes (value in count) left to copy
+    "$ua_smallCopy:                              \n"
+    "beqz      %[count], $leave                  \n"
+    " addu     $a3, %[dst], %[count]             \n"  // a3 = last dst address
+    "$ua_smallCopy_loop:                         \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
+    " sb       $v1, -1(%[dst])                   \n"

+    "$done:                                      \n"
+    // No "j $ra" in the asm: fall out so the compiler-emitted return runs.
+    ".set      at                                \n"
+    ".set      reorder                           \n"
+       : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+       :  // no input-only operands: the asm overwrites count as it goes
+       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+       "t8", "t9", "a3", "v1", "at", "memory"
+  );
 }
 #endif  // HAS_COPYROW_MIPS
-#endif  // __mips__

 // MIPS DSPR2 functions
 #if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)