#if defined (__mips__)
#
# Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#

.globl memcpy_MIPS;
.align 2;
.type memcpy_MIPS,@function;
.ent memcpy_MIPS,0;
memcpy_MIPS:
  .frame  $sp,0,$ra
  .set    noreorder           # branch delay slots are scheduled by hand below
  .set    noat

  slti    $at,$a2,8
  bne     $at,$zero,last8
  move    $v0,$a0             # memcpy returns the dst pointer

# Test if the src and dst are word-aligned, or can be made word-aligned
  xor     $t8,$a1,$a0
  andi    $t8,$t8,0x3         # t8 is the a0/a1 word-displacement

  bne     $t8,$zero,unaligned
  negu    $a3,$a0

  andi    $a3,$a3,0x3         # we need to copy a3 bytes to make a0/a1 aligned
  beq     $a3,$zero,chk16w    # when a3==0 the dst (a0) is already
  subu    $a2,$a2,$a3         # word-aligned; now a2 is the remaining byte count

  lwr     $t8,0($a1)
  addu    $a1,$a1,$a3
  swr     $t8,0($a0)
  addu    $a0,$a0,$a3

# Now the dst/src are mutually word-aligned with word-aligned addresses
chk16w:
  andi    $t8,$a2,0x3f        # any whole 64-byte chunks?
                              # t8 is the byte count after the 64-byte chunks

  beq     $a2,$t8,chk8w       # if a2==t8, there are no 64-byte chunks
                              # there will be at most one 32-byte chunk after them
  subu    $a3,$a2,$t8         # subtract the remainder from a2
                              # here a3 counts the bytes in 16w chunks
  addu    $a3,$a0,$a3         # now a3 is the final dst after the 64-byte chunks

  addu    $t0,$a0,$a2         # t0 is the "past the end" address

# When the loop issues "pref 30,x(a0)", the address a0+x must not be past
# the "t0-32" address.
# This means: for x=128 the last "safe" a0 address is "t0-160".
# Alternatively, for x=64 the last "safe" a0 address is "t0-96".
# This version uses "pref 30,128(a0)", so "t0-160" is the limit.
  subu    $t9,$t0,160         # t9 is the "last safe pref 30,128(a0)" address

  pref    0,0($a1)            # bring the first line of src, addr 0
  pref    0,32($a1)           # bring the second line of src, addr 32
  pref    0,64($a1)           # bring the third line of src, addr 64
  pref    30,32($a0)          # safe, as we have at least 64 bytes ahead
# If a0 > t9, don't use "pref 30" at all
  sgtu    $v1,$a0,$t9
  bgtz    $v1,loop16w         # skip "pref 30,64(a0)" for too-short arrays
  nop
# otherwise, start using pref 30
  pref    30,64($a0)

loop16w:
  pref    0,96($a1)
  lw      $t0,0($a1)
  bgtz    $v1,skip_pref30_96  # skip "pref 30,96(a0)"
  lw      $t1,4($a1)
  pref    30,96($a0)          # continue setting up the dest, addr 96
skip_pref30_96:
  lw      $t2,8($a1)
  lw      $t3,12($a1)
  lw      $t4,16($a1)
  lw      $t5,20($a1)
  lw      $t6,24($a1)
  lw      $t7,28($a1)
  pref    0,128($a1)          # bring the next lines of src, addr 128

  sw      $t0,0($a0)
  sw      $t1,4($a0)
  sw      $t2,8($a0)
  sw      $t3,12($a0)
  sw      $t4,16($a0)
  sw      $t5,20($a0)
  sw      $t6,24($a0)
  sw      $t7,28($a0)

  lw      $t0,32($a1)
  bgtz    $v1,skip_pref30_128 # skip "pref 30,128(a0)"
  lw      $t1,36($a1)
  pref    30,128($a0)         # continue setting up the dest, addr 128
skip_pref30_128:
  lw      $t2,40($a1)
  lw      $t3,44($a1)
  lw      $t4,48($a1)
  lw      $t5,52($a1)
  lw      $t6,56($a1)
  lw      $t7,60($a1)
  pref    0,160($a1)          # bring the next lines of src, addr 160

  sw      $t0,32($a0)
  sw      $t1,36($a0)
  sw      $t2,40($a0)
  sw      $t3,44($a0)
  sw      $t4,48($a0)
  sw      $t5,52($a0)
  sw      $t6,56($a0)
  sw      $t7,60($a0)

  addiu   $a0,$a0,64          # add 64 to dst
  sgtu    $v1,$a0,$t9
  bne     $a0,$a3,loop16w
  addiu   $a1,$a1,64          # add 64 to src (in the delay slot)
  move    $a2,$t8

# Here we have src and dst word-aligned but less than 64 bytes to go
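# The remaining (<64) bytes are drained in three stages: one optional
# 32-byte block (chk8w), then whole words (chk1w/wordCopy_loop), then
# single bytes (last8). Note that the instruction after every branch
# below executes in that branch's delay slot (.set noreorder).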
chk8w:
  pref    0,0x0($a1)
  andi    $t8,$a2,0x1f        # is there a 32-byte chunk?
                              # t8 is the remainder count past 32 bytes
  beq     $a2,$t8,chk1w       # when a2==t8, there is no 32-byte chunk
  nop

  lw      $t0,0($a1)
  lw      $t1,4($a1)
  lw      $t2,8($a1)
  lw      $t3,12($a1)
  lw      $t4,16($a1)
  lw      $t5,20($a1)
  lw      $t6,24($a1)
  lw      $t7,28($a1)
  addiu   $a1,$a1,32

  sw      $t0,0($a0)
  sw      $t1,4($a0)
  sw      $t2,8($a0)
  sw      $t3,12($a0)
  sw      $t4,16($a0)
  sw      $t5,20($a0)
  sw      $t6,24($a0)
  sw      $t7,28($a0)
  addiu   $a0,$a0,32

chk1w:
  andi    $a2,$t8,0x3         # now a2 is the remainder past the 1w chunks
  beq     $a2,$t8,last8
  subu    $a3,$t8,$a2         # a3 is the count of bytes in 1w chunks
  addu    $a3,$a0,$a3         # now a3 is the dst address past the 1w chunks

# copying in words (4-byte chunks)
wordCopy_loop:
  lw      $t3,0($a1)          # the first t3 may be equal to t0 ... optimize?
  addiu   $a1,$a1,4
  addiu   $a0,$a0,4
  bne     $a0,$a3,wordCopy_loop
  sw      $t3,-4($a0)

# For the last (<8) bytes
last8:
  blez    $a2,leave
  addu    $a3,$a0,$a2         # a3 is the last dst address
last8loop:
  lb      $v1,0($a1)
  addiu   $a1,$a1,1
  addiu   $a0,$a0,1
  bne     $a0,$a3,last8loop
  sb      $v1,-1($a0)

leave:
  j       $ra
  nop

#
# UNALIGNED case
#

unaligned:
  # got here with a3="negu a0"
  andi    $a3,$a3,0x3         # test if a0 is word-aligned
  beqz    $a3,ua_chk16w
  subu    $a2,$a2,$a3         # bytes left after the initial a3 bytes

  lwr     $v1,0($a1)          # lwr/lwl pair reads one unaligned word
  lwl     $v1,3($a1)
  addu    $a1,$a1,$a3         # a3 may be 1, 2 or 3 here
  swr     $v1,0($a0)
  addu    $a0,$a0,$a3         # below this point the dst is word-aligned (NOTE1)

ua_chk16w:
  andi    $t8,$a2,0x3f        # any whole 64-byte chunks?
                              # t8 is the byte count after the 64-byte chunks
  beq     $a2,$t8,ua_chk8w    # if a2==t8, there are no 64-byte chunks
                              # there will be at most one 32-byte chunk after them
  subu    $a3,$a2,$t8         # subtract the remainder from a2
                              # here a3 counts the bytes in 16w chunks
  addu    $a3,$a0,$a3         # now a3 is the final dst after the 64-byte chunks

  addu    $t0,$a0,$a2         # t0 is the "past the end" address

  subu    $t9,$t0,160         # t9 is the "last safe pref 30,128(a0)" address

  pref    0,0($a1)            # bring the first line of src, addr 0
  pref    0,32($a1)           # bring the second line of src, addr 32
  pref    0,64($a1)           # bring the third line of src, addr 64
  pref    30,32($a0)          # safe, as we have at least 64 bytes ahead
# If a0 > t9, don't use "pref 30" at all
  sgtu    $v1,$a0,$t9
  bgtz    $v1,ua_loop16w      # skip "pref 30,64(a0)" for too-short arrays
  nop
# otherwise, start using pref 30
  pref    30,64($a0)

ua_loop16w:
  pref    0,96($a1)
  lwr     $t0,0($a1)
  lwl     $t0,3($a1)
  lwr     $t1,4($a1)
  bgtz    $v1,ua_skip_pref30_96
  lwl     $t1,7($a1)
  pref    30,96($a0)          # continue setting up the dest, addr 96
ua_skip_pref30_96:
  lwr     $t2,8($a1)
  lwl     $t2,11($a1)
  lwr     $t3,12($a1)
  lwl     $t3,15($a1)
  lwr     $t4,16($a1)
  lwl     $t4,19($a1)
  lwr     $t5,20($a1)
  lwl     $t5,23($a1)
  lwr     $t6,24($a1)
  lwl     $t6,27($a1)
  lwr     $t7,28($a1)
  lwl     $t7,31($a1)
  pref    0,128($a1)          # bring the next lines of src, addr 128

  sw      $t0,0($a0)
  sw      $t1,4($a0)
  sw      $t2,8($a0)
  sw      $t3,12($a0)
  sw      $t4,16($a0)
  sw      $t5,20($a0)
  sw      $t6,24($a0)
  sw      $t7,28($a0)

  lwr     $t0,32($a1)
  lwl     $t0,35($a1)
  lwr     $t1,36($a1)
  bgtz    $v1,ua_skip_pref30_128
  lwl     $t1,39($a1)
  pref    30,128($a0)         # continue setting up the dest, addr 128
ua_skip_pref30_128:
  lwr     $t2,40($a1)
  lwl     $t2,43($a1)
  lwr     $t3,44($a1)
  lwl     $t3,47($a1)
  lwr     $t4,48($a1)
  lwl     $t4,51($a1)
  lwr     $t5,52($a1)
  lwl     $t5,55($a1)
  lwr     $t6,56($a1)
  lwl     $t6,59($a1)
  lwr     $t7,60($a1)
  lwl     $t7,63($a1)
  pref    0,160($a1)          # bring the next lines of src, addr 160

  sw      $t0,32($a0)
  sw      $t1,36($a0)
  sw      $t2,40($a0)
  sw      $t3,44($a0)
  sw      $t4,48($a0)
  sw      $t5,52($a0)
  sw      $t6,56($a0)
  sw      $t7,60($a0)

  addiu   $a0,$a0,64          # add 64 to dst
  sgtu    $v1,$a0,$t9
  bne     $a0,$a3,ua_loop16w
  addiu   $a1,$a1,64          # add 64 to src (in the delay slot)
  move    $a2,$t8

# Here the dst is word-aligned (the src need not be) and less than 64 bytes remain
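# The unaligned tail mirrors the aligned one: ua_chk8w copies an optional
# 32-byte block, ua_chk1w/ua_wordCopy_loop copy whole words, and
# ua_smallCopy finishes byte by byte. Reads still use lwr/lwl pairs because
# the src is misaligned; the dst is word-aligned (see NOTE1), so plain sw
# stores are used throughout.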
ua_chk8w:
  pref    0,0x0($a1)
  andi    $t8,$a2,0x1f        # is there a 32-byte chunk?
                              # t8 is the remainder count
  beq     $a2,$t8,ua_chk1w    # when a2==t8, there is no 32-byte chunk

  lwr     $t0,0($a1)
  lwl     $t0,3($a1)
  lwr     $t1,4($a1)
  lwl     $t1,7($a1)
  lwr     $t2,8($a1)
  lwl     $t2,11($a1)
  lwr     $t3,12($a1)
  lwl     $t3,15($a1)
  lwr     $t4,16($a1)
  lwl     $t4,19($a1)
  lwr     $t5,20($a1)
  lwl     $t5,23($a1)
  lwr     $t6,24($a1)
  lwl     $t6,27($a1)
  lwr     $t7,28($a1)
  lwl     $t7,31($a1)
  addiu   $a1,$a1,32

  sw      $t0,0($a0)
  sw      $t1,4($a0)
  sw      $t2,8($a0)
  sw      $t3,12($a0)
  sw      $t4,16($a0)
  sw      $t5,20($a0)
  sw      $t6,24($a0)
  sw      $t7,28($a0)
  addiu   $a0,$a0,32

ua_chk1w:
  andi    $a2,$t8,0x3         # now a2 is the remainder past the 1w chunks
  beq     $a2,$t8,ua_smallCopy
  subu    $a3,$t8,$a2         # a3 is the count of bytes in 1w chunks
  addu    $a3,$a0,$a3         # now a3 is the dst address past the 1w chunks

# copying in words (4-byte chunks)
ua_wordCopy_loop:
  lwr     $v1,0($a1)
  lwl     $v1,3($a1)
  addiu   $a1,$a1,4
  addiu   $a0,$a0,4           # note: dst=a0 is word-aligned here, see NOTE1
  bne     $a0,$a3,ua_wordCopy_loop
  sw      $v1,-4($a0)

# Now less than 4 bytes (value in a2) are left to copy
ua_smallCopy:
  beqz    $a2,leave
  addu    $a3,$a0,$a2         # a3 is the last dst address
ua_smallCopy_loop:
  lb      $v1,0($a1)
  addiu   $a1,$a1,1
  addiu   $a0,$a0,1
  bne     $a0,$a3,ua_smallCopy_loop
  sb      $v1,-1($a0)

  j       $ra
  nop

  .set    at
  .set    reorder

.end memcpy_MIPS;
.size memcpy_MIPS,.-memcpy_MIPS

#endif // if defined (__mips__)
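#
# Usage note: memcpy_MIPS follows the standard memcpy contract; a0=dst,
# a1=src, a2=count on entry and the dst pointer is returned in v0.
# A minimal C-side sketch, assuming a declaration along these lines
# (the exact prototype the project uses may differ):
#
#   void* memcpy_MIPS(void* dst, const void* src, size_t count);
#
#   /* hypothetical buffers, for illustration only */
#   uint8_t dst_buf[256], src_buf[256];
#   memcpy_MIPS(dst_buf, src_buf, sizeof(dst_buf));
#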