Commit c22cd5b2, authored by fbarchard@google.com

Mips memcpy moved to row_mips.

BUG=191
TEST=none
Review URL: https://webrtc-codereview.appspot.com/1127005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@580 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c0d9c346
......@@ -75,7 +75,6 @@
'source/convert_from_argb.cc',
'source/cpu_id.cc',
'source/format_conversion.cc',
'source/memcpy_mips.S', # TODO(fbarchard): Move into row_mips.cc
'source/mjpeg_decoder.cc',
'source/planar_functions.cc',
'source/rotate.cc',
......
#if defined (__mips__)
#
# Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
.text
#
# memcpy_MIPS: copy a2 bytes from a1 (src) to a0 (dst).
#
# Inputs:   a0 = dst, a1 = src, a2 = byte count
# Output:   v0 = the dst pointer as passed in
# Clobbers: at, v1, a0-a3, t0-t9
#
# Strategy:
#   * count < 8 (signed compare): plain byte loop; a count <= 0 copies nothing.
#   * src and dst share the same low two address bits: align both, then copy
#     with lw/sw in 64-byte, 32-byte and 4-byte steps, then trailing bytes.
#   * otherwise ("unaligned"): align dst only, read src with lwr/lwl pairs and
#     write dst with aligned sw, using the same chunk sizes.
#
# The whole body is assembled under ".set noreorder": the instruction written
# directly after each branch or jump is its delay slot and executes whether or
# not the branch is taken.  Those lines are marked "delay slot" below.
#
# NOTE(review): the lwr-at-offset-0 / lwl-at-offset-3 pairing (and lwr/swr for
# the leading bytes) looks like the little-endian form - confirm that this
# file is only built for little-endian targets.
# NOTE(review): the "pref 30" safety margins below appear to assume 32-byte
# cache lines - confirm for the target core.
#
.globl memcpy_MIPS;
.align 2;
.type memcpy_MIPS,@function;
.ent memcpy_MIPS,0;
memcpy_MIPS:
.frame $sp,0,$ra
.set noreorder
.set noat
slti $at,$a2,8 # at = 1 when count < 8 (signed)
bne $at,$zero,last8 # short copies go straight to the byte loop
move $v0,$a0 # delay slot: return value v0 = original dst pointer
# Test if the src and dst are word-aligned, or can be made word-aligned
xor $t8,$a1,$a0
andi $t8,$t8,0x3 # t8 is a0/a1 word-displacement
bne $t8,$zero,unaligned # low address bits differ: take the unaligned path
negu $a3,$a0 # delay slot: a3 = -dst, used by both paths
andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
beq $a3,$zero,chk16w # when a3=0 the dst (a0) is already word-aligned
subu $a2,$a2,$a3 # delay slot: a2 is now the remaining byte count
lwr $t8,0($a1) # fetch the leading bytes up to the word boundary
addu $a1,$a1,$a3
swr $t8,0($a0) # store only those a3 leading bytes
addu $a0,$a0,$a3
# Now the dst/src are mutually word-aligned with word-aligned addresses
chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks
beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
subu $a3,$a2,$t8 # delay slot: subtract the remainder from a2
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks
addu $t0,$a0,$a2 # t0 is the "past the end" address
# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
# the "t0-32" address
# This means: for x=128 the last "safe" a0 address is "t0-160"
# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address
pref 0,0($a1) # bring the first line of src, addr 0
pref 0,32($a1) # bring the second line of src, addr 32
pref 0,64($a1) # bring the third line of src, addr 64
pref 30,32($a0) # safe, as we have at least 64 bytes ahead
# When a0 > t9, do not use "pref 30" at all
sgtu $v1,$a0,$t9 # v1 = 1 when dst is already past the safe limit
bgtz $v1,loop16w # skip "pref 30,64(a0)" for too short arrays
nop # delay slot
# otherwise, start with using pref30
pref 30,64($a0)
# Main aligned loop: 64 bytes per iteration as two 8-word load/store groups.
# v1 is recomputed at the bottom of every iteration.
loop16w:
pref 0,96($a1)
lw $t0,0($a1)
bgtz $v1,skip_pref30_96 # skip "pref 30,96(a0)"
lw $t1,4($a1) # delay slot
pref 30,96($a0) # continue setting up the dest, addr 96
skip_pref30_96:
lw $t2,8($a1)
lw $t3,12($a1)
lw $t4,16($a1)
lw $t5,20($a1)
lw $t6,24($a1)
lw $t7,28($a1)
pref 0,128($a1) # bring the next lines of src, addr 128
sw $t0,0($a0)
sw $t1,4($a0)
sw $t2,8($a0)
sw $t3,12($a0)
sw $t4,16($a0)
sw $t5,20($a0)
sw $t6,24($a0)
sw $t7,28($a0)
lw $t0,32($a1)
bgtz $v1,skip_pref30_128 # skip "pref 30,128(a0)"
lw $t1,36($a1) # delay slot
pref 30,128($a0) # continue setting up the dest, addr 128
skip_pref30_128:
lw $t2,40($a1)
lw $t3,44($a1)
lw $t4,48($a1)
lw $t5,52($a1)
lw $t6,56($a1)
lw $t7,60($a1)
pref 0, 160($a1) # bring the next lines of src, addr 160
sw $t0,32($a0)
sw $t1,36($a0)
sw $t2,40($a0)
sw $t3,44($a0)
sw $t4,48($a0)
sw $t5,52($a0)
sw $t6,56($a0)
sw $t7,60($a0)
addiu $a0,$a0,64 # adding 64 to dest
sgtu $v1,$a0,$t9 # refresh the "past safe pref 30 limit" flag
bne $a0,$a3,loop16w # loop until dst reaches the end of the 64-byte chunks
addiu $a1,$a1,64 # delay slot: adding 64 to src
move $a2,$t8 # a2 = bytes left after the 64-byte chunks
# Here we have src and dest word-aligned but less than 64-bytes to go
chk8w:
pref 0, 0x0($a1)
andi $t8,$a2,0x1f # is there a 32-byte chunk?
# the t8 is the remainder count past 32-bytes
beq $a2,$t8,chk1w # when a2=t8, no 32-byte chunk
nop # delay slot
lw $t0,0($a1)
lw $t1,4($a1)
lw $t2,8($a1)
lw $t3,12($a1)
lw $t4,16($a1)
lw $t5,20($a1)
lw $t6,24($a1)
lw $t7,28($a1)
addiu $a1,$a1,32
sw $t0,0($a0)
sw $t1,4($a0)
sw $t2,8($a0)
sw $t3,12($a0)
sw $t4,16($a0)
sw $t5,20($a0)
sw $t6,24($a0)
sw $t7,28($a0)
addiu $a0,$a0,32
chk1w:
andi $a2,$t8,0x3 # now a2 is the remainder past 1w chunks
beq $a2,$t8,last8 # fewer than 4 bytes left: no whole words to copy
subu $a3,$t8,$a2 # delay slot: a3 is count of bytes in 1w chunks
addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks
# copying in words (4-byte chunks)
wordCopy_loop:
lw $t3,0($a1) # the first t3 may be equal t0 ... optimize?
addiu $a1,$a1,4
addiu $a0,$a0,4
bne $a0,$a3,wordCopy_loop
sw $t3,-4($a0) # delay slot: store to the word dst just stepped past
# For the last (<8) bytes
last8:
blez $a2,leave # nothing (or a non-positive count) left to copy
addu $a3,$a0,$a2 # delay slot: a3 is the dst address just past the last byte
last8loop:
lb $v1,0($a1)
addiu $a1,$a1,1
addiu $a0,$a0,1
bne $a0,$a3,last8loop
sb $v1,-1($a0) # delay slot: store to the byte dst just stepped past
leave:
j $ra # return; v0 was set at entry
nop # delay slot
#
# UNALIGNED case
#
unaligned:
# got here with a3="negu a0"
andi $a3,$a3,0x3 # test if the a0 is word aligned
beqz $a3,ua_chk16w # dst already word-aligned: no leading bytes to copy
subu $a2,$a2,$a3 # delay slot: bytes left after initial a3 bytes
lwr $v1,0($a1) # lwr/lwl pair: load one unaligned word from src
lwl $v1,3($a1)
addu $a1,$a1,$a3 # a3 may be here 1, 2 or 3
swr $v1,0($a0) # store only the a3 bytes up to the word boundary
addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1)
ua_chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks
beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
subu $a3,$a2,$t8 # delay slot: subtract the remainder from a2
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks
addu $t0,$a0,$a2 # t0 is the "past the end" address
subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address
pref 0,0($a1) # bring the first line of src, addr 0
pref 0,32($a1) # bring the second line of src, addr 32
pref 0,64($a1) # bring the third line of src, addr 64
pref 30,32($a0) # safe, as we have at least 64 bytes ahead
# When a0 > t9, do not use "pref 30" at all
sgtu $v1,$a0,$t9 # v1 = 1 when dst is already past the safe limit
bgtz $v1,ua_loop16w # skip "pref 30,64(a0)" for too short arrays
nop # delay slot
# otherwise, start with using pref30
pref 30,64($a0)
# Main unaligned loop: 64 bytes per iteration.  Each source word is assembled
# with an lwr/lwl pair; stores are aligned sw because dst is word-aligned.
ua_loop16w:
pref 0,96($a1)
lwr $t0,0($a1)
lwl $t0,3($a1)
lwr $t1,4($a1)
bgtz $v1,ua_skip_pref30_96 # skip "pref 30,96(a0)"
lwl $t1,7($a1) # delay slot
pref 30,96($a0) # continue setting up the dest, addr 96
ua_skip_pref30_96:
lwr $t2,8($a1)
lwl $t2,11($a1)
lwr $t3,12($a1)
lwl $t3,15($a1)
lwr $t4,16($a1)
lwl $t4,19($a1)
lwr $t5,20($a1)
lwl $t5,23($a1)
lwr $t6,24($a1)
lwl $t6,27($a1)
lwr $t7,28($a1)
lwl $t7,31($a1)
pref 0,128($a1) # bring the next lines of src, addr 128
sw $t0,0($a0)
sw $t1,4($a0)
sw $t2,8($a0)
sw $t3,12($a0)
sw $t4,16($a0)
sw $t5,20($a0)
sw $t6,24($a0)
sw $t7,28($a0)
lwr $t0,32($a1)
lwl $t0,35($a1)
lwr $t1,36($a1)
bgtz $v1,ua_skip_pref30_128 # skip "pref 30,128(a0)"
lwl $t1,39($a1) # delay slot
pref 30,128($a0) # continue setting up the dest, addr 128
ua_skip_pref30_128:
lwr $t2,40($a1)
lwl $t2,43($a1)
lwr $t3,44($a1)
lwl $t3,47($a1)
lwr $t4,48($a1)
lwl $t4,51($a1)
lwr $t5,52($a1)
lwl $t5,55($a1)
lwr $t6,56($a1)
lwl $t6,59($a1)
lwr $t7,60($a1)
lwl $t7,63($a1)
pref 0, 160($a1) # bring the next lines of src, addr 160
sw $t0,32($a0)
sw $t1,36($a0)
sw $t2,40($a0)
sw $t3,44($a0)
sw $t4,48($a0)
sw $t5,52($a0)
sw $t6,56($a0)
sw $t7,60($a0)
addiu $a0,$a0,64 # adding 64 to dest
sgtu $v1,$a0,$t9 # refresh the "past safe pref 30 limit" flag
bne $a0,$a3,ua_loop16w # loop until dst reaches the end of the 64-byte chunks
addiu $a1,$a1,64 # delay slot: adding 64 to src
move $a2,$t8 # a2 = bytes left after the 64-byte chunks
# Here dst is word-aligned (src is not) and fewer than 64 bytes remain
ua_chk8w:
pref 0, 0x0($a1)
andi $t8,$a2,0x1f # is there a 32-byte chunk?
# the t8 is the remainder count
beq $a2,$t8,ua_chk1w # when a2=t8, no 32-byte chunk
lwr $t0,0($a1) # delay slot: first load; t0 is not used when the branch is taken
lwl $t0,3($a1)
lwr $t1,4($a1)
lwl $t1,7($a1)
lwr $t2,8($a1)
lwl $t2,11($a1)
lwr $t3,12($a1)
lwl $t3,15($a1)
lwr $t4,16($a1)
lwl $t4,19($a1)
lwr $t5,20($a1)
lwl $t5,23($a1)
lwr $t6,24($a1)
lwl $t6,27($a1)
lwr $t7,28($a1)
lwl $t7,31($a1)
addiu $a1,$a1,32
sw $t0,0($a0)
sw $t1,4($a0)
sw $t2,8($a0)
sw $t3,12($a0)
sw $t4,16($a0)
sw $t5,20($a0)
sw $t6,24($a0)
sw $t7,28($a0)
addiu $a0,$a0,32
ua_chk1w:
andi $a2,$t8,0x3 # now a2 is the remainder past 1w chunks
beq $a2,$t8,ua_smallCopy # fewer than 4 bytes left: no whole words to copy
subu $a3,$t8,$a2 # delay slot: a3 is count of bytes in 1w chunks
addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks
# copying in words (4-byte chunks)
ua_wordCopy_loop:
lwr $v1,0($a1)
lwl $v1,3($a1)
addiu $a1,$a1,4
addiu $a0,$a0,4 # note: dst=a0 is word aligned here, see NOTE1
bne $a0,$a3,ua_wordCopy_loop
sw $v1,-4($a0) # delay slot: store to the word dst just stepped past
# Now less than 4 bytes (value in a2) left to copy
ua_smallCopy:
beqz $a2,leave # nothing left: share the return sequence at "leave"
addu $a3,$a0,$a2 # delay slot: a3 is the dst address just past the last byte
ua_smallCopy_loop:
lb $v1,0($a1)
addiu $a1,$a1,1
addiu $a0,$a0,1
bne $a0,$a3,ua_smallCopy_loop
sb $v1,-1($a0) # delay slot: store to the byte dst just stepped past
j $ra # return; v0 was set at entry
nop # delay slot
.set at
.set reorder
.end memcpy_MIPS;
.size memcpy_MIPS,.-memcpy_MIPS
#endif // if defined (__mips__)
......@@ -15,14 +15,361 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#ifdef HAS_COPYROW_MIPS
extern "C" void memcpy_MIPS(uint8* dst, const uint8* src, int count);
// Copies |count| bytes from |src| to |dst| using hand-scheduled MIPS32 code.
//
// Strategy:
//   * count < 8 (signed compare): plain byte loop; count <= 0 copies nothing.
//   * src and dst share the same low two address bits: align both, then copy
//     with lw/sw in 64-byte, 32-byte and 4-byte steps, then trailing bytes.
//   * otherwise: align dst only, read src with lwr/lwl pairs and write dst
//     with aligned sw, using the same chunk sizes.
//
// Notes on the inline asm:
//   * It runs under ".set noreorder": the instruction written after every
//     branch is its delay slot and always executes.  Delay-slot lines are
//     indented by one extra space inside the string.
//   * Only numeric local labels are used, so the block can be emitted more
//     than once (inlining, cloning) without duplicate-symbol errors.
//   * Control leaves through label 20 at the end of the block instead of
//     jumping to $ra, so the compiler-generated epilogue always runs.
//   * dst, src and count are all modified by the asm and are therefore
//     read-write operands; "memory" is clobbered because the asm stores
//     through dst.
//   * NOTE(review): the lwr@0 / lwl@3 pairing is the little-endian form, and
//     the "pref 30" margins appear to assume 32-byte cache lines - confirm
//     both for the targets this is built for.
//
// Label map:
//    1 chk16w      2 loop16w        3 skip_pref30_96   4 skip_pref30_128
//    5 chk8w       6 chk1w          7 wordCopy_loop    8 last8
//    9 last8loop  10 unaligned     11 ua_chk16w       12 ua_loop16w
//   13 ua_skip_pref30_96           14 ua_skip_pref30_128
//   15 ua_chk8w   16 ua_chk1w      17 ua_wordCopy_loop
//   18 ua_smallCopy                19 ua_smallCopy_loop
//   20 done
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
  __asm__ __volatile__ (
    ".set push                                  \n"
    ".set noreorder                             \n"
    ".set noat                                  \n"
    "slti       $at, %[count], 8                \n"
    "bne        $at, $zero, 8f                  \n"  // short copy: bytes only
    " xor       $t8, %[src], %[dst]             \n"
    "andi       $t8, $t8, 0x3                   \n"  // src/dst displacement

    "bne        $t8, $zero, 10f                 \n"  // not mutually aligned
    " negu      $a3, %[dst]                     \n"
    // a3 = number of leading bytes needed to word-align dst (and src).
    "andi       $a3, $a3, 0x3                   \n"
    "beq        $a3, $zero, 1f                  \n"
    // count becomes the remaining byte count after the leading bytes.
    " subu      %[count], %[count], $a3         \n"
    "lwr        $t8, 0(%[src])                  \n"
    "addu       %[src], %[src], $a3             \n"
    "swr        $t8, 0(%[dst])                  \n"
    "addu       %[dst], %[dst], $a3             \n"

    // Now dst and src are both word-aligned.
    "1:                                         \n"  // chk16w
    "andi       $t8, %[count], 0x3f             \n"  // t8 = bytes past 64B chunks
    "beq        %[count], $t8, 5f               \n"  // no whole 64-byte chunk
    " subu      $a3, %[count], $t8              \n"  // bytes in 64-byte chunks
    "addu       $a3, %[dst], $a3                \n"  // a3 = dst after 64B chunks
    "addu       $t0, %[dst], %[count]           \n"  // t0 = past-the-end of dst
    // "pref 30, x(dst)" must not touch a line beyond "t0 - 32".  With x = 128
    // the last safe dst is therefore "t0 - 160"; that limit is kept in t9.
    "subu       $t9, $t0, 160                   \n"
    "pref       0, 0(%[src])                    \n"  // first line of src
    "pref       0, 32(%[src])                   \n"  // second line of src
    "pref       0, 64(%[src])                   \n"  // third line of src
    "pref       30, 32(%[dst])                  \n"  // safe: >= 64 bytes ahead
    // v1 = 1 when dst is past the limit; then "pref 30" is not used at all.
    "sgtu       $v1, %[dst], $t9                \n"
    "bgtz       $v1, 2f                         \n"
    " nop                                       \n"
    "pref       30, 64(%[dst])                  \n"

    // Aligned main loop: 64 bytes per iteration.
    "2:                                         \n"  // loop16w
    "pref       0, 96(%[src])                   \n"
    "lw         $t0, 0(%[src])                  \n"
    "bgtz       $v1, 3f                         \n"  // skip pref 30, 96(dst)
    " lw        $t1, 4(%[src])                  \n"
    "pref       30, 96(%[dst])                  \n"
    "3:                                         \n"  // skip_pref30_96
    "lw         $t2, 8(%[src])                  \n"
    "lw         $t3, 12(%[src])                 \n"
    "lw         $t4, 16(%[src])                 \n"
    "lw         $t5, 20(%[src])                 \n"
    "lw         $t6, 24(%[src])                 \n"
    "lw         $t7, 28(%[src])                 \n"
    "pref       0, 128(%[src])                  \n"  // next src line, addr 128
    "sw         $t0, 0(%[dst])                  \n"
    "sw         $t1, 4(%[dst])                  \n"
    "sw         $t2, 8(%[dst])                  \n"
    "sw         $t3, 12(%[dst])                 \n"
    "sw         $t4, 16(%[dst])                 \n"
    "sw         $t5, 20(%[dst])                 \n"
    "sw         $t6, 24(%[dst])                 \n"
    "sw         $t7, 28(%[dst])                 \n"
    "lw         $t0, 32(%[src])                 \n"
    "bgtz       $v1, 4f                         \n"  // skip pref 30, 128(dst)
    " lw        $t1, 36(%[src])                 \n"
    "pref       30, 128(%[dst])                 \n"
    "4:                                         \n"  // skip_pref30_128
    "lw         $t2, 40(%[src])                 \n"
    "lw         $t3, 44(%[src])                 \n"
    "lw         $t4, 48(%[src])                 \n"
    "lw         $t5, 52(%[src])                 \n"
    "lw         $t6, 56(%[src])                 \n"
    "lw         $t7, 60(%[src])                 \n"
    "pref       0, 160(%[src])                  \n"  // next src line, addr 160
    "sw         $t0, 32(%[dst])                 \n"
    "sw         $t1, 36(%[dst])                 \n"
    "sw         $t2, 40(%[dst])                 \n"
    "sw         $t3, 44(%[dst])                 \n"
    "sw         $t4, 48(%[dst])                 \n"
    "sw         $t5, 52(%[dst])                 \n"
    "sw         $t6, 56(%[dst])                 \n"
    "sw         $t7, 60(%[dst])                 \n"
    "addiu      %[dst], %[dst], 64              \n"
    "sgtu       $v1, %[dst], $t9                \n"  // refresh pref-30 flag
    "bne        %[dst], $a3, 2b                 \n"
    " addiu     %[src], %[src], 64              \n"
    "move       %[count], $t8                   \n"  // bytes left (< 64)

    // src and dst word-aligned, fewer than 64 bytes to go.
    "5:                                         \n"  // chk8w
    "pref       0, 0x0(%[src])                  \n"
    "andi       $t8, %[count], 0x1f             \n"  // t8 = bytes past 32B chunk
    "beq        %[count], $t8, 6f               \n"  // no 32-byte chunk
    " nop                                       \n"
    "lw         $t0, 0(%[src])                  \n"
    "lw         $t1, 4(%[src])                  \n"
    "lw         $t2, 8(%[src])                  \n"
    "lw         $t3, 12(%[src])                 \n"
    "lw         $t4, 16(%[src])                 \n"
    "lw         $t5, 20(%[src])                 \n"
    "lw         $t6, 24(%[src])                 \n"
    "lw         $t7, 28(%[src])                 \n"
    "addiu      %[src], %[src], 32              \n"
    "sw         $t0, 0(%[dst])                  \n"
    "sw         $t1, 4(%[dst])                  \n"
    "sw         $t2, 8(%[dst])                  \n"
    "sw         $t3, 12(%[dst])                 \n"
    "sw         $t4, 16(%[dst])                 \n"
    "sw         $t5, 20(%[dst])                 \n"
    "sw         $t6, 24(%[dst])                 \n"
    "sw         $t7, 28(%[dst])                 \n"
    "addiu      %[dst], %[dst], 32              \n"

    "6:                                         \n"  // chk1w
    "andi       %[count], $t8, 0x3              \n"  // bytes past whole words
    "beq        %[count], $t8, 8f               \n"  // no whole word left
    " subu      $a3, $t8, %[count]              \n"  // bytes in whole words
    "addu       $a3, %[dst], $a3                \n"  // dst after whole words
    "7:                                         \n"  // wordCopy_loop
    "lw         $t3, 0(%[src])                  \n"
    "addiu      %[src], %[src], 4               \n"
    "addiu      %[dst], %[dst], 4               \n"
    "bne        %[dst], $a3, 7b                 \n"
    " sw        $t3, -4(%[dst])                 \n"

    // Trailing bytes (fewer than 8).
    "8:                                         \n"  // last8
    "blez       %[count], 20f                   \n"
    " addu      $a3, %[dst], %[count]           \n"  // a3 = end of dst
    "9:                                         \n"  // last8loop
    "lb         $v1, 0(%[src])                  \n"
    "addiu      %[src], %[src], 1               \n"
    "addiu      %[dst], %[dst], 1               \n"
    "bne        %[dst], $a3, 9b                 \n"
    " sb        $v1, -1(%[dst])                 \n"
    "b          20f                             \n"  // aligned path finished
    " nop                                       \n"

    //
    // UNALIGNED case: src and dst differ in their low two address bits.
    //
    "10:                                        \n"  // unaligned
    // Entered with a3 = -dst (from the delay slot of the dispatching branch).
    "andi       $a3, $a3, 0x3                   \n"  // bytes to word-align dst
    "beqz       $a3, 11f                        \n"
    " subu      %[count], %[count], $a3         \n"  // bytes left afterwards
    "lwr        $v1, 0(%[src])                  \n"
    "lwl        $v1, 3(%[src])                  \n"
    "addu       %[src], %[src], $a3             \n"  // a3 is 1, 2 or 3 here
    "swr        $v1, 0(%[dst])                  \n"
    "addu       %[dst], %[dst], $a3             \n"  // dst is word-aligned now

    "11:                                        \n"  // ua_chk16w
    "andi       $t8, %[count], 0x3f             \n"  // t8 = bytes past 64B chunks
    "beq        %[count], $t8, 15f              \n"  // no whole 64-byte chunk
    " subu      $a3, %[count], $t8              \n"  // bytes in 64-byte chunks
    "addu       $a3, %[dst], $a3                \n"  // a3 = dst after 64B chunks
    "addu       $t0, %[dst], %[count]           \n"  // t0 = past-the-end of dst
    "subu       $t9, $t0, 160                   \n"  // last safe pref 30,128
    "pref       0, 0(%[src])                    \n"  // first line of src
    "pref       0, 32(%[src])                   \n"  // second line of src
    "pref       0, 64(%[src])                   \n"  // third line of src
    "pref       30, 32(%[dst])                  \n"  // safe: >= 64 bytes ahead
    // v1 = 1 when dst is past the limit; then "pref 30" is not used at all.
    "sgtu       $v1, %[dst], $t9                \n"
    "bgtz       $v1, 12f                        \n"
    " nop                                       \n"
    "pref       30, 64(%[dst])                  \n"

    // Unaligned main loop: 64 bytes per iteration, lwr/lwl loads, sw stores.
    "12:                                        \n"  // ua_loop16w
    "pref       0, 96(%[src])                   \n"
    "lwr        $t0, 0(%[src])                  \n"
    "lwl        $t0, 3(%[src])                  \n"
    "lwr        $t1, 4(%[src])                  \n"
    "bgtz       $v1, 13f                        \n"  // skip pref 30, 96(dst)
    " lwl       $t1, 7(%[src])                  \n"
    "pref       30, 96(%[dst])                  \n"
    "13:                                        \n"  // ua_skip_pref30_96
    "lwr        $t2, 8(%[src])                  \n"
    "lwl        $t2, 11(%[src])                 \n"
    "lwr        $t3, 12(%[src])                 \n"
    "lwl        $t3, 15(%[src])                 \n"
    "lwr        $t4, 16(%[src])                 \n"
    "lwl        $t4, 19(%[src])                 \n"
    "lwr        $t5, 20(%[src])                 \n"
    "lwl        $t5, 23(%[src])                 \n"
    "lwr        $t6, 24(%[src])                 \n"
    "lwl        $t6, 27(%[src])                 \n"
    "lwr        $t7, 28(%[src])                 \n"
    "lwl        $t7, 31(%[src])                 \n"
    "pref       0, 128(%[src])                  \n"  // next src line, addr 128
    "sw         $t0, 0(%[dst])                  \n"
    "sw         $t1, 4(%[dst])                  \n"
    "sw         $t2, 8(%[dst])                  \n"
    "sw         $t3, 12(%[dst])                 \n"
    "sw         $t4, 16(%[dst])                 \n"
    "sw         $t5, 20(%[dst])                 \n"
    "sw         $t6, 24(%[dst])                 \n"
    "sw         $t7, 28(%[dst])                 \n"
    "lwr        $t0, 32(%[src])                 \n"
    "lwl        $t0, 35(%[src])                 \n"
    "lwr        $t1, 36(%[src])                 \n"
    "bgtz       $v1, 14f                        \n"  // skip pref 30, 128(dst)
    " lwl       $t1, 39(%[src])                 \n"
    "pref       30, 128(%[dst])                 \n"
    "14:                                        \n"  // ua_skip_pref30_128
    "lwr        $t2, 40(%[src])                 \n"
    "lwl        $t2, 43(%[src])                 \n"
    "lwr        $t3, 44(%[src])                 \n"
    "lwl        $t3, 47(%[src])                 \n"
    "lwr        $t4, 48(%[src])                 \n"
    "lwl        $t4, 51(%[src])                 \n"
    "lwr        $t5, 52(%[src])                 \n"
    "lwl        $t5, 55(%[src])                 \n"
    "lwr        $t6, 56(%[src])                 \n"
    "lwl        $t6, 59(%[src])                 \n"
    "lwr        $t7, 60(%[src])                 \n"
    "lwl        $t7, 63(%[src])                 \n"
    "pref       0, 160(%[src])                  \n"  // next src line, addr 160
    "sw         $t0, 32(%[dst])                 \n"
    "sw         $t1, 36(%[dst])                 \n"
    "sw         $t2, 40(%[dst])                 \n"
    "sw         $t3, 44(%[dst])                 \n"
    "sw         $t4, 48(%[dst])                 \n"
    "sw         $t5, 52(%[dst])                 \n"
    "sw         $t6, 56(%[dst])                 \n"
    "sw         $t7, 60(%[dst])                 \n"
    "addiu      %[dst], %[dst], 64              \n"
    "sgtu       $v1, %[dst], $t9                \n"  // refresh pref-30 flag
    "bne        %[dst], $a3, 12b                \n"
    " addiu     %[src], %[src], 64              \n"
    "move       %[count], $t8                   \n"  // bytes left (< 64)

    // dst word-aligned (src is not), fewer than 64 bytes to go.
    "15:                                        \n"  // ua_chk8w
    "pref       0, 0x0(%[src])                  \n"
    "andi       $t8, %[count], 0x1f             \n"  // t8 = bytes past 32B chunk
    "beq        %[count], $t8, 16f              \n"  // no 32-byte chunk
    // Delay slot: first load of the chunk; t0 is unused if the branch is taken.
    " lwr       $t0, 0(%[src])                  \n"
    "lwl        $t0, 3(%[src])                  \n"
    "lwr        $t1, 4(%[src])                  \n"
    "lwl        $t1, 7(%[src])                  \n"
    "lwr        $t2, 8(%[src])                  \n"
    "lwl        $t2, 11(%[src])                 \n"
    "lwr        $t3, 12(%[src])                 \n"
    "lwl        $t3, 15(%[src])                 \n"
    "lwr        $t4, 16(%[src])                 \n"
    "lwl        $t4, 19(%[src])                 \n"
    "lwr        $t5, 20(%[src])                 \n"
    "lwl        $t5, 23(%[src])                 \n"
    "lwr        $t6, 24(%[src])                 \n"
    "lwl        $t6, 27(%[src])                 \n"
    "lwr        $t7, 28(%[src])                 \n"
    "lwl        $t7, 31(%[src])                 \n"
    "addiu      %[src], %[src], 32              \n"
    "sw         $t0, 0(%[dst])                  \n"
    "sw         $t1, 4(%[dst])                  \n"
    "sw         $t2, 8(%[dst])                  \n"
    "sw         $t3, 12(%[dst])                 \n"
    "sw         $t4, 16(%[dst])                 \n"
    "sw         $t5, 20(%[dst])                 \n"
    "sw         $t6, 24(%[dst])                 \n"
    "sw         $t7, 28(%[dst])                 \n"
    "addiu      %[dst], %[dst], 32              \n"

    "16:                                        \n"  // ua_chk1w
    "andi       %[count], $t8, 0x3              \n"  // bytes past whole words
    "beq        %[count], $t8, 18f              \n"  // no whole word left
    " subu      $a3, $t8, %[count]              \n"  // bytes in whole words
    "addu       $a3, %[dst], $a3                \n"  // dst after whole words
    "17:                                        \n"  // ua_wordCopy_loop
    "lwr        $v1, 0(%[src])                  \n"
    "lwl        $v1, 3(%[src])                  \n"
    "addiu      %[src], %[src], 4               \n"
    "addiu      %[dst], %[dst], 4               \n"  // dst stays word-aligned
    "bne        %[dst], $a3, 17b                \n"
    " sw        $v1, -4(%[dst])                 \n"

    // Fewer than 4 bytes (value in count) left to copy.
    "18:                                        \n"  // ua_smallCopy
    "beqz       %[count], 20f                   \n"
    " addu      $a3, %[dst], %[count]           \n"  // a3 = end of dst
    "19:                                        \n"  // ua_smallCopy_loop
    "lb         $v1, 0(%[src])                  \n"
    "addiu      %[src], %[src], 1               \n"
    "addiu      %[dst], %[dst], 1               \n"
    "bne        %[dst], $a3, 19b                \n"
    " sb        $v1, -1(%[dst])                 \n"

    // Common exit: fall out of the asm so the normal function epilogue runs.
    "20:                                        \n"  // done
    ".set pop                                   \n"
    : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
    :
    : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
      "t8", "t9", "a3", "v1", "at", "memory"
  );
}
#endif // HAS_COPYROW_MIPS
#endif // __mips__
// MIPS DSPR2 functions
#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment