Commit 6c1b2d38 authored by fbarchard@google.com's avatar fbarchard@google.com

MIPS port of libyuv. Includes functionality for convert, rotate, scale, and memcpy.

BUG=126
TESTED=tested on MIPS
Review URL: https://webrtc-codereview.appspot.com/930005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@449 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 1f399dfa
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 447 -> 449
License: BSD
License File: LICENSE
......
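For reviewers, a minimal caller-side sketch of how the new MIPS code paths get exercised. It is illustrative only and not part of the patch: it assumes the public I422ToARGB entry point declared in libyuv/convert_argb.h, an even width that is a multiple of 4, and 4-byte-aligned buffers and strides so the DSPR2 row function is selected by the dispatch added below.

// Illustrative only (not part of this patch). Assumes libyuv/convert_argb.h
// declares I422ToARGB and that width is even and a multiple of 4.
#include <stdint.h>
#include <vector>
#include "libyuv/convert_argb.h"

int ConvertI422FrameToARGB(int width, int height) {
  std::vector<uint8_t> y(width * height);
  std::vector<uint8_t> u((width / 2) * height);  // I422: full-height chroma
  std::vector<uint8_t> v((width / 2) * height);
  std::vector<uint8_t> argb(width * 4 * height);
  // With kCpuHasMIPS_DSPR2 set and the alignment checks below satisfied,
  // each row is converted by I422ToARGBRow_MIPS_DSPR2.
  return libyuv::I422ToARGB(&y[0], width,
                            &u[0], width / 2,
                            &v[0], width / 2,
                            &argb[0], width * 4,
                            width, height);
}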
...@@ -19,9 +19,11 @@ ...@@ -19,9 +19,11 @@
#include "libyuv/convert_from_argb.h" #include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h" #include "libyuv/format_conversion.h"
#include "libyuv/mjpeg_decoder.h"
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include "libyuv/rotate.h" #include "libyuv/rotate.h"
#include "libyuv/rotate_argb.h" #include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "libyuv/scale.h" #include "libyuv/scale.h"
#include "libyuv/scale_argb.h" #include "libyuv/scale_argb.h"
#include "libyuv/version.h" #include "libyuv/version.h"
......
...@@ -175,8 +175,14 @@ extern "C" { ...@@ -175,8 +175,14 @@ extern "C" {
// The following are available on Mips platforms // The following are available on Mips platforms
#if !defined(YUV_DISABLE_ASM) && defined(__mips__) #if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SPLITUV_MIPS_DSPR2 #define HAS_SPLITUV_MIPS_DSPR2
#define HAS_MIRRORROW_MIPS_DSPR2
#define HAS_MIRRORROWUV_MIPS_DSPR2
#define HAS_I422TOARGBROW_MIPS_DSPR2
#define HAS_I422TOBGRAROW_MIPS_DSPR2
#define HAS_I422TOABGRROW_MIPS_DSPR2
#endif #endif
#endif #endif
...@@ -282,6 +288,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); ...@@ -282,6 +288,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width); void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
void MirrorRowUV_MIPS_DSPR2(const uint8* src, uint8* dst_u, uint8* dst_v,
int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width); void MirrorRow_C(const uint8* src, uint8* dst, int width);
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width); void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
...@@ -321,6 +330,7 @@ void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -321,6 +330,7 @@ void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
void CopyRow_SSE2(const uint8* src, uint8* dst, int count); void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count); void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count); void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count); void CopyRow_C(const uint8* src, uint8* dst, int count);
void SetRow8_X86(uint8* dst, uint32 v32, int count); void SetRow8_X86(uint8* dst, uint32 v32, int count);
...@@ -694,6 +704,21 @@ void NV21ToARGBRow_Any_NEON(const uint8* y_buf, ...@@ -694,6 +704,21 @@ void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 447 #define LIBYUV_VERSION 449
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -75,11 +75,12 @@ ...@@ -75,11 +75,12 @@
'source/convert_from_argb.cc', 'source/convert_from_argb.cc',
'source/cpu_id.cc', 'source/cpu_id.cc',
'source/format_conversion.cc', 'source/format_conversion.cc',
'source/memcpy_mips.S', 'source/memcpy_mips.S', # TODO(fbarchard): Move into row_mips.cc
'source/mjpeg_decoder.cc', 'source/mjpeg_decoder.cc',
'source/planar_functions.cc', 'source/planar_functions.cc',
'source/rotate.cc', 'source/rotate.cc',
'source/rotate_argb.cc', 'source/rotate_argb.cc',
'source/rotate_mips.cc',
'source/rotate_neon.cc', 'source/rotate_neon.cc',
'source/row_common.cc', 'source/row_common.cc',
'source/row_mips.cc', 'source/row_mips.cc',
......
...@@ -132,6 +132,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -132,6 +132,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
...@@ -756,6 +764,11 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -756,6 +764,11 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
I422ToARGBRow = I422ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif #endif
SIMD_ALIGNED(uint8 rowy[kMaxStride]); SIMD_ALIGNED(uint8 rowy[kMaxStride]);
...@@ -829,6 +842,11 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -829,6 +842,11 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
I422ToARGBRow = I422ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif #endif
SIMD_ALIGNED(uint8 rowy[kMaxStride]); SIMD_ALIGNED(uint8 rowy[kMaxStride]);
......
...@@ -599,6 +599,14 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, ...@@ -599,6 +599,14 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
...@@ -652,6 +660,14 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, ...@@ -652,6 +660,14 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
I422ToBGRARow = I422ToBGRARow_NEON; I422ToBGRARow = I422ToBGRARow_NEON;
} }
} }
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
...@@ -909,6 +925,13 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -909,6 +925,13 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
} }
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif #endif
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
...@@ -975,6 +998,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, ...@@ -975,6 +998,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
} }
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif #endif
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
...@@ -1041,6 +1072,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, ...@@ -1041,6 +1072,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
} }
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif #endif
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
......
...@@ -174,7 +174,7 @@ int InitCpuFlags(void) { ...@@ -174,7 +174,7 @@ int InitCpuFlags(void) {
} }
} }
#endif #endif
// environment variable overrides for testing. // Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) { if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info_ &= ~kCpuHasX86; cpu_info_ &= ~kCpuHasX86;
} }
...@@ -197,7 +197,7 @@ int InitCpuFlags(void) { ...@@ -197,7 +197,7 @@ int InitCpuFlags(void) {
cpu_info_ &= ~kCpuHasAVX2; cpu_info_ &= ~kCpuHasAVX2;
} }
#elif defined(__mips__) && defined(__linux__) #elif defined(__mips__) && defined(__linux__)
// linux mips parse text file for dsp detect. // Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2) #if defined(__mips_dspr2)
cpu_info_ |= kCpuHasMIPS_DSPR2; cpu_info_ |= kCpuHasMIPS_DSPR2;
...@@ -215,7 +215,7 @@ int InitCpuFlags(void) { ...@@ -215,7 +215,7 @@ int InitCpuFlags(void) {
} }
#elif defined(__arm__) #elif defined(__arm__)
#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
// linux arm parse text file for neon detect. // Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__) #elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__ // gcc -mfpu=neon defines __ARM_NEON__
......
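The comment above notes that on Linux/MIPS the DSP ASE is detected by parsing a text file. A rough sketch of such a check follows; the helper name and the exact /proc/cpuinfo field are assumptions, and the real MipsCpuCaps() in cpu_id.cc may scan the file differently.

// Hypothetical sketch of /proc/cpuinfo based ASE detection (names assumed).
#include <stdio.h>
#include <string.h>

static int HasMipsAse(const char* ase) {  // e.g. ase = "dsp"
  FILE* f = fopen("/proc/cpuinfo", "r");
  if (!f) {
    return 0;
  }
  char line[512];
  int found = 0;
  while (fgets(line, sizeof(line), f)) {
    // Typical MIPS kernels list extensions on an "ASEs implemented" line.
    if (!strncmp(line, "ASEs implemented", 16) && strstr(line, ase)) {
      found = 1;
      break;
    }
  }
  fclose(f);
  return found;
}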
#if defined (__mips__)
#
# Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
.globl memcpy_MIPS;
.align 2;
.type memcpy_MIPS,@function;
.ent memcpy_MIPS,0;
memcpy_MIPS: memcpy_MIPS:
.frame $sp,0,$ra .frame $sp,0,$ra
.set noreorder .set noreorder
.set noat .set noat
slti $at,$a2,8 slti $at,$a2,8
bne $at,$zero,last8 bne $at,$zero,last8
move $v0,$a0 # memcpy returns the dst pointer move $v0,$a0 # memcpy returns the dst pointer
# Test if the src and dst are word-aligned, or can be made word-aligned # Test if the src and dst are word-aligned, or can be made word-aligned
xor $t8,$a1,$a0 xor $t8,$a1,$a0
andi $t8,$t8,0x3 # t8 is a0/a1 word-displacement andi $t8,$t8,0x3 # t8 is a0/a1 word-displacement
bne $t8,$zero,unaligned bne $t8,$zero,unaligned
negu $a3,$a0 negu $a3,$a0
andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is word-aligned
subu $a2,$a2,$a3 # now a2 is the remaining byte count
lwr $t8,0($a1) lwr $t8,0($a1)
addu $a1,$a1,$a3 addu $a1,$a1,$a3
swr $t8,0($a0) swr $t8,0($a0)
addu $a0,$a0,$a3 addu $a0,$a0,$a3
# Now the dst/src are mutually word-aligned with word-aligned addresses # Now the dst/src are mutually word-aligned with word-aligned addresses
chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks
beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it # There will be at most 1 32-byte chunk after it
subu $a3,$a2,$t8 # subtract from a2 the remainder subu $a3,$a2,$t8 # subtract from a2 the remainder
# Here a3 counts bytes in 16w chunks # Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks
addu $t0,$a0,$a2 # t0 is the "past the end" address
# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past # When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
# the "t0-32" address # the "t0-32" address
# This means: for x=128 the last "safe" a0 address is "t0-160" # This means: for x=128 the last "safe" a0 address is "t0-160"
# Alternatively, for x=64 the last "safe" a0 address is "t0-96" # Alternatively, for x=64 the last "safe" a0 address is "t0-96"
# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit # In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address
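# Note (derivation of the 160-byte margin): "pref 30" prepares a 32-byte line
# for store, so a prefetch at a0+128 touches bytes up to a0+159; requiring
# a0+128 <= t0-32 is the same as a0 <= t0-160, which is the t9 limit above.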
pref 0,0($a1) # bring the first line of src, addr 0 pref 0,0($a1) # bring the first line of src, addr 0
pref 0,32($a1) # bring the second line of src, addr 32 pref 0,32($a1) # bring the second line of src, addr 32
pref 0,64($a1) # bring the third line of src, addr 64 pref 0,64($a1) # bring the third line of src, addr 64
pref 30,32($a0) # safe, as we have at least 64 bytes ahead pref 30,32($a0) # safe, as we have at least 64 bytes ahead
# In case the a0 > t9 don't use "pref 30" at all # In case the a0 > t9 don't use "pref 30" at all
sgtu $v1,$a0,$t9 sgtu $v1,$a0,$t9
bgtz $v1,loop16w # skip "pref 30,64(a0)" for too short arrays bgtz $v1,loop16w # skip "pref 30,64(a0)" for too short arrays
nop nop
# otherwise, start with using pref30 # otherwise, start with using pref30
pref 30,64($a0) pref 30,64($a0)
loop16w: loop16w:
pref 0,96($a1) pref 0,96($a1)
lw $t0,0($a1) lw $t0,0($a1)
bgtz $v1,skip_pref30_96 # skip "pref 30,96(a0)" bgtz $v1,skip_pref30_96 # skip "pref 30,96(a0)"
lw $t1,4($a1) lw $t1,4($a1)
pref 30,96($a0) # continue setting up the dest, addr 96 pref 30,96($a0) # continue setting up the dest, addr 96
skip_pref30_96: skip_pref30_96:
lw $t2,8($a1) lw $t2,8($a1)
lw $t3,12($a1) lw $t3,12($a1)
lw $t4,16($a1) lw $t4,16($a1)
lw $t5,20($a1) lw $t5,20($a1)
lw $t6,24($a1) lw $t6,24($a1)
lw $t7,28($a1) lw $t7,28($a1)
pref 0,128($a1) # bring the next lines of src, addr 128 pref 0,128($a1) # bring the next lines of src, addr 128
sw $t0,0($a0) sw $t0,0($a0)
sw $t1,4($a0) sw $t1,4($a0)
sw $t2,8($a0) sw $t2,8($a0)
sw $t3,12($a0) sw $t3,12($a0)
sw $t4,16($a0) sw $t4,16($a0)
sw $t5,20($a0) sw $t5,20($a0)
sw $t6,24($a0) sw $t6,24($a0)
sw $t7,28($a0) sw $t7,28($a0)
lw $t0,32($a1) lw $t0,32($a1)
bgtz $v1,skip_pref30_128 # skip "pref 30,128(a0)" bgtz $v1,skip_pref30_128 # skip "pref 30,128(a0)"
lw $t1,36($a1) lw $t1,36($a1)
pref 30,128($a0) # continue setting up the dest, addr 128 pref 30,128($a0) # continue setting up the dest, addr 128
skip_pref30_128: skip_pref30_128:
lw $t2,40($a1) lw $t2,40($a1)
lw $t3,44($a1) lw $t3,44($a1)
lw $t4,48($a1) lw $t4,48($a1)
lw $t5,52($a1) lw $t5,52($a1)
lw $t6,56($a1) lw $t6,56($a1)
lw $t7,60($a1) lw $t7,60($a1)
pref 0, 160($a1) # bring the next lines of src, addr 160 pref 0, 160($a1) # bring the next lines of src, addr 160
sw $t0,32($a0) sw $t0,32($a0)
sw $t1,36($a0) sw $t1,36($a0)
sw $t2,40($a0) sw $t2,40($a0)
sw $t3,44($a0) sw $t3,44($a0)
sw $t4,48($a0) sw $t4,48($a0)
sw $t5,52($a0) sw $t5,52($a0)
sw $t6,56($a0) sw $t6,56($a0)
sw $t7,60($a0) sw $t7,60($a0)
addiu $a0,$a0,64 # adding 64 to dest addiu $a0,$a0,64 # adding 64 to dest
sgtu $v1,$a0,$t9 sgtu $v1,$a0,$t9
bne $a0,$a3,loop16w bne $a0,$a3,loop16w
addiu $a1,$a1,64 # adding 64 to src addiu $a1,$a1,64 # adding 64 to src
move $a2,$t8 move $a2,$t8
# Here we have src and dest word-aligned but less than 64-bytes to go # Here we have src and dest word-aligned but less than 64-bytes to go
chk8w: chk8w:
pref 0, 0x0($a1) pref 0, 0x0($a1)
andi $t8,$a2,0x1f # is there a 32-byte chunk? andi $t8,$a2,0x1f # is there a 32-byte chunk?
# t8 is the remainder count past 32 bytes # t8 is the remainder count past 32 bytes
beq $a2,$t8,chk1w # when a2=t8, no 32-byte chunk beq $a2,$t8,chk1w # when a2=t8, no 32-byte chunk
nop nop
lw $t0,0($a1) lw $t0,0($a1)
lw $t1,4($a1) lw $t1,4($a1)
lw $t2,8($a1) lw $t2,8($a1)
lw $t3,12($a1) lw $t3,12($a1)
lw $t4,16($a1) lw $t4,16($a1)
lw $t5,20($a1) lw $t5,20($a1)
lw $t6,24($a1) lw $t6,24($a1)
lw $t7,28($a1) lw $t7,28($a1)
addiu $a1,$a1,32 addiu $a1,$a1,32
sw $t0,0($a0) sw $t0,0($a0)
sw $t1,4($a0) sw $t1,4($a0)
sw $t2,8($a0) sw $t2,8($a0)
sw $t3,12($a0) sw $t3,12($a0)
sw $t4,16($a0) sw $t4,16($a0)
sw $t5,20($a0) sw $t5,20($a0)
sw $t6,24($a0) sw $t6,24($a0)
sw $t7,28($a0) sw $t7,28($a0)
addiu $a0,$a0,32 addiu $a0,$a0,32
chk1w: chk1w:
andi $a2,$t8,0x3 # now a2 is the remainder past 1w chunks andi $a2,$t8,0x3 # now a2 is the remainder past 1w chunks
beq $a2,$t8,last8 beq $a2,$t8,last8
subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks
addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks
# copying in words (4-byte chunks) # copying in words (4-byte chunks)
wordCopy_loop: wordCopy_loop:
lw $t3,0($a1) # the first t3 may be equal t0 ... optimize? lw $t3,0($a1) # the first t3 may be equal t0 ... optimize?
addiu $a1,$a1,4 addiu $a1,$a1,4
addiu $a0,$a0,4 addiu $a0,$a0,4
bne $a0,$a3,wordCopy_loop bne $a0,$a3,wordCopy_loop
sw $t3,-4($a0) sw $t3,-4($a0)
# For the last (<8) bytes # For the last (<8) bytes
last8: last8:
blez $a2,leave blez $a2,leave
addu $a3,$a0,$a2 # a3 is the last dst address addu $a3,$a0,$a2 # a3 is the last dst address
last8loop: last8loop:
lb $v1,0($a1) lb $v1,0($a1)
addiu $a1,$a1,1 addiu $a1,$a1,1
addiu $a0,$a0,1 addiu $a0,$a0,1
bne $a0,$a3,last8loop bne $a0,$a3,last8loop
sb $v1,-1($a0) sb $v1,-1($a0)
leave:
j $ra
nop
# #
# UNALIGNED case # UNALIGNED case
...@@ -173,174 +181,172 @@ leave: j $ra ...@@ -173,174 +181,172 @@ leave: j $ra
unaligned: unaligned:
# got here with a3="negu a0" # got here with a3="negu a0"
andi $a3,$a3,0x3 # test if the a0 is word aligned andi $a3,$a3,0x3 # test if the a0 is word aligned
beqz $a3,ua_chk16w beqz $a3,ua_chk16w
subu $a2,$a2,$a3 # bytes left after initial a3 bytes subu $a2,$a2,$a3 # bytes left after initial a3 bytes
lwr $v1,0($a1) lwr $v1,0($a1)
lwl $v1,3($a1) lwl $v1,3($a1)
addu $a1,$a1,$a3 # a3 may be here 1, 2 or 3 addu $a1,$a1,$a3 # a3 may be here 1, 2 or 3
swr $v1,0($a0) swr $v1,0($a0)
addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1) addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1)
ua_chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks
beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
subu $a3,$a2,$t8 # subtract from a2 the remainder
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks
addu $t0,$a0,$a2 # t0 is the "past the end" address
subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address
pref 0,0($a1) # bring the first line of src, addr 0 pref 0,0($a1) # bring the first line of src, addr 0
pref 0,32($a1) # bring the second line of src, addr 32 pref 0,32($a1) # bring the second line of src, addr 32
pref 0,64($a1) # bring the third line of src, addr 64 pref 0,64($a1) # bring the third line of src, addr 64
pref 30,32($a0) # safe, as we have at least 64 bytes ahead pref 30,32($a0) # safe, as we have at least 64 bytes ahead
# In case the a0 > t9 don't use "pref 30" at all # In case the a0 > t9 don't use "pref 30" at all
sgtu $v1,$a0,$t9 sgtu $v1,$a0,$t9
bgtz $v1,ua_loop16w # skip "pref 30,64(a0)" for too short arrays bgtz $v1,ua_loop16w # skip "pref 30,64(a0)" for too short arrays
nop nop
# otherwise, start with using pref30 # otherwise, start with using pref30
pref 30,64($a0) pref 30,64($a0)
ua_loop16w: ua_loop16w:
pref 0,96($a1) pref 0,96($a1)
lwr $t0,0($a1) lwr $t0,0($a1)
lwl $t0,3($a1) lwl $t0,3($a1)
lwr $t1,4($a1) lwr $t1,4($a1)
bgtz $v1,ua_skip_pref30_96 bgtz $v1,ua_skip_pref30_96
lwl $t1,7($a1) lwl $t1,7($a1)
pref 30,96($a0) # continue setting up the dest, addr 96 pref 30,96($a0) # continue setting up the dest, addr 96
ua_skip_pref30_96: ua_skip_pref30_96:
lwr $t2,8($a1) lwr $t2,8($a1)
lwl $t2,11($a1) lwl $t2,11($a1)
lwr $t3,12($a1) lwr $t3,12($a1)
lwl $t3,15($a1) lwl $t3,15($a1)
lwr $t4,16($a1) lwr $t4,16($a1)
lwl $t4,19($a1) lwl $t4,19($a1)
lwr $t5,20($a1) lwr $t5,20($a1)
lwl $t5,23($a1) lwl $t5,23($a1)
lwr $t6,24($a1) lwr $t6,24($a1)
lwl $t6,27($a1) lwl $t6,27($a1)
lwr $t7,28($a1) lwr $t7,28($a1)
lwl $t7,31($a1) lwl $t7,31($a1)
pref 0,128($a1) # bring the next lines of src, addr 128 pref 0,128($a1) # bring the next lines of src, addr 128
sw $t0,0($a0) sw $t0,0($a0)
sw $t1,4($a0) sw $t1,4($a0)
sw $t2,8($a0) sw $t2,8($a0)
sw $t3,12($a0) sw $t3,12($a0)
sw $t4,16($a0) sw $t4,16($a0)
sw $t5,20($a0) sw $t5,20($a0)
sw $t6,24($a0) sw $t6,24($a0)
sw $t7,28($a0) sw $t7,28($a0)
lwr $t0,32($a1) lwr $t0,32($a1)
lwl $t0,35($a1) lwl $t0,35($a1)
lwr $t1,36($a1) lwr $t1,36($a1)
bgtz $v1,ua_skip_pref30_128 bgtz $v1,ua_skip_pref30_128
lwl $t1,39($a1) lwl $t1,39($a1)
pref 30,128($a0) # continue setting up the dest, addr 128 pref 30,128($a0) # continue setting up the dest, addr 128
ua_skip_pref30_128: ua_skip_pref30_128:
lwr $t2,40($a1) lwr $t2,40($a1)
lwl $t2,43($a1) lwl $t2,43($a1)
lwr $t3,44($a1) lwr $t3,44($a1)
lwl $t3,47($a1) lwl $t3,47($a1)
lwr $t4,48($a1) lwr $t4,48($a1)
lwl $t4,51($a1) lwl $t4,51($a1)
lwr $t5,52($a1) lwr $t5,52($a1)
lwl $t5,55($a1) lwl $t5,55($a1)
lwr $t6,56($a1) lwr $t6,56($a1)
lwl $t6,59($a1) lwl $t6,59($a1)
lwr $t7,60($a1) lwr $t7,60($a1)
lwl $t7,63($a1) lwl $t7,63($a1)
pref 0, 160($a1) # bring the next lines of src, addr 160 pref 0, 160($a1) # bring the next lines of src, addr 160
sw $t0,32($a0) sw $t0,32($a0)
sw $t1,36($a0) sw $t1,36($a0)
sw $t2,40($a0) sw $t2,40($a0)
sw $t3,44($a0) sw $t3,44($a0)
sw $t4,48($a0) sw $t4,48($a0)
sw $t5,52($a0) sw $t5,52($a0)
sw $t6,56($a0) sw $t6,56($a0)
sw $t7,60($a0) sw $t7,60($a0)
addiu $a0,$a0,64 # adding 64 to dest addiu $a0,$a0,64 # adding 64 to dest
sgtu $v1,$a0,$t9 sgtu $v1,$a0,$t9
bne $a0,$a3,ua_loop16w bne $a0,$a3,ua_loop16w
addiu $a1,$a1,64 # adding 64 to src addiu $a1,$a1,64 # adding 64 to src
move $a2,$t8 move $a2,$t8
# Here we have src and dest word-aligned but less than 64-bytes to go # Here we have src and dest word-aligned but less than 64-bytes to go
ua_chk8w: ua_chk8w:
pref 0, 0x0($a1) pref 0, 0x0($a1)
andi $t8,$a2,0x1f # is there a 32-byte chunk? andi $t8,$a2,0x1f # is there a 32-byte chunk?
# t8 is the remainder count # t8 is the remainder count
beq $a2,$t8,ua_chk1w # when a2=t8, no 32-byte chunk beq $a2,$t8,ua_chk1w # when a2=t8, no 32-byte chunk
lwr $t0,0($a1) lwr $t0,0($a1)
lwl $t0,3($a1) lwl $t0,3($a1)
lwr $t1,4($a1) lwr $t1,4($a1)
lwl $t1,7($a1) lwl $t1,7($a1)
lwr $t2,8($a1) lwr $t2,8($a1)
lwl $t2,11($a1) lwl $t2,11($a1)
lwr $t3,12($a1) lwr $t3,12($a1)
lwl $t3,15($a1) lwl $t3,15($a1)
lwr $t4,16($a1) lwr $t4,16($a1)
lwl $t4,19($a1) lwl $t4,19($a1)
lwr $t5,20($a1) lwr $t5,20($a1)
lwl $t5,23($a1) lwl $t5,23($a1)
lwr $t6,24($a1) lwr $t6,24($a1)
lwl $t6,27($a1) lwl $t6,27($a1)
lwr $t7,28($a1) lwr $t7,28($a1)
lwl $t7,31($a1) lwl $t7,31($a1)
addiu $a1,$a1,32 addiu $a1,$a1,32
sw $t0,0($a0) sw $t0,0($a0)
sw $t1,4($a0) sw $t1,4($a0)
sw $t2,8($a0) sw $t2,8($a0)
sw $t3,12($a0) sw $t3,12($a0)
sw $t4,16($a0) sw $t4,16($a0)
sw $t5,20($a0) sw $t5,20($a0)
sw $t6,24($a0) sw $t6,24($a0)
sw $t7,28($a0) sw $t7,28($a0)
addiu $a0,$a0,32 addiu $a0,$a0,32
ua_chk1w: ua_chk1w:
andi $a2,$t8,0x3 # now a2 is the remainder past 1w chunks andi $a2,$t8,0x3 # now a2 is the remainder past 1w chunks
beq $a2,$t8,ua_smallCopy beq $a2,$t8,ua_smallCopy
subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks
addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks
# copying in words (4-byte chunks) # copying in words (4-byte chunks)
ua_wordCopy_loop: ua_wordCopy_loop:
lwr $v1,0($a1) lwr $v1,0($a1)
lwl $v1,3($a1) lwl $v1,3($a1)
addiu $a1,$a1,4 addiu $a1,$a1,4
addiu $a0,$a0,4 # note: dst=a0 is word aligned here, see NOTE1 addiu $a0,$a0,4 # note: dst=a0 is word aligned here, see NOTE1
bne $a0,$a3,ua_wordCopy_loop bne $a0,$a3,ua_wordCopy_loop
sw $v1,-4($a0) sw $v1,-4($a0)
# Now less than 4 bytes (value in a2) left to copy # Now less than 4 bytes (value in a2) left to copy
ua_smallCopy: ua_smallCopy:
beqz $a2,leave beqz $a2,leave
addu $a3,$a0,$a2 # a3 is the last dst address addu $a3,$a0,$a2 # a3 is the last dst address
ua_smallCopy_loop: ua_smallCopy_loop:
lb $v1,0($a1) lb $v1,0($a1)
addiu $a1,$a1,1 addiu $a1,$a1,1
addiu $a0,$a0,1 addiu $a0,$a0,1
bne $a0,$a3,ua_smallCopy_loop bne $a0,$a3,ua_smallCopy_loop
sb $v1,-1($a0) sb $v1,-1($a0)
j $ra j $ra
nop nop
.set at .set at
.set reorder .set reorder
.end memcpy_MIPS; .end memcpy_MIPS;
.size memcpy_MIPS,.-memcpy_MIPS .size memcpy_MIPS,.-memcpy_MIPS
#endif // if defined (__mips__) #endif // if defined (__mips__)
...@@ -46,6 +46,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y, ...@@ -46,6 +46,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_SSE2; CopyRow = CopyRow_SSE2;
} }
#endif #endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
}
#endif
// Copy plane // Copy plane
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
...@@ -424,6 +429,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, ...@@ -424,6 +429,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
} }
} }
} }
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
......
...@@ -56,6 +56,23 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, ...@@ -56,6 +56,23 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
int width); int width);
#endif // defined(__ARM_NEON__) #endif // defined(__ARM_NEON__)
#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif
#endif
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3 #define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
...@@ -794,6 +811,16 @@ void TransposePlane(const uint8* src, int src_stride, ...@@ -794,6 +811,16 @@ void TransposePlane(const uint8* src, int src_stride,
TransposeWx8 = TransposeWx8_FAST_SSSE3; TransposeWx8 = TransposeWx8_FAST_SSSE3;
} }
#endif #endif
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
} else {
TransposeWx8 = TransposeWx8_MIPS_DSPR2;
}
}
#endif
// Work across the source in 8x8 tiles // Work across the source in 8x8 tiles
int i = height; int i = height;
...@@ -856,6 +883,13 @@ void RotatePlane180(const uint8* src, int src_stride, ...@@ -856,6 +883,13 @@ void RotatePlane180(const uint8* src, int src_stride,
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
MirrorRow = MirrorRow_SSSE3; MirrorRow = MirrorRow_SSSE3;
} }
#endif
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif #endif
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
...@@ -952,6 +986,11 @@ void TransposeUV(const uint8* src, int src_stride, ...@@ -952,6 +986,11 @@ void TransposeUV(const uint8* src, int src_stride,
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
TransposeUVWx8 = TransposeUVWx8_SSE2; TransposeUVWx8 = TransposeUVWx8_SSE2;
} }
#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
}
#endif #endif
// Work through the source in 8x8 tiles. // Work through the source in 8x8 tiles.
...@@ -1021,6 +1060,11 @@ void RotateUV180(const uint8* src, int src_stride, ...@@ -1021,6 +1060,11 @@ void RotateUV180(const uint8* src, int src_stride,
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
MirrorRowUV = MirrorRowUV_SSSE3; MirrorRowUV = MirrorRowUV_SSSE3;
} }
#elif defined(HAS_MIRRORROWUV_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorRowUV = MirrorRowUV_MIPS_DSPR2;
}
#endif #endif
dst_a += dst_stride_a * (height - 1); dst_a += dst_stride_a * (height - 1);
......
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"andi $t0, %[dst], 0x3 \n"
"andi $t1, %[dst_stride], 0x3 \n"
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
"lbux $t9, $t3(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s0, $t8, $t0 \n"
"lbux $t0, $t4(%[src]) \n"
"lbux $t1, $t5(%[src]) \n"
"lbux $t8, $t6(%[src]) \n"
"lbux $t9, $t7(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s1, $t8, $t0 \n"
"sw $s0, 0(%[dst]) \n"
"addiu %[width], -1 \n"
"addiu %[src], 1 \n"
"sw $s1, 4(%[dst]) \n"
"bnez %[width], 1b \n"
" addu %[dst], %[dst], %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
"lbux $t9, $t3(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s0, $t8, $t0 \n"
"lbux $t0, $t4(%[src]) \n"
"lbux $t1, $t5(%[src]) \n"
"lbux $t8, $t6(%[src]) \n"
"lbux $t9, $t7(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s1, $t8, $t0 \n"
"swr $s0, 0(%[dst]) \n"
"swl $s0, 3(%[dst]) \n"
"addiu %[width], -1 \n"
"addiu %[src], 1 \n"
"swr $s1, 4(%[dst]) \n"
"swl $s1, 7(%[dst]) \n"
"bnez %[width], 11b \n"
"addu %[dst], %[dst], %[dst_stride] \n"
"2: \n"
".set pop \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1"
);
}
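For reference, a plain-C sketch of the contract this routine appears to implement (inferred from how TransposePlane tiles the work; the name is illustrative and this is not part of the patch): eight source rows are transposed into width destination rows of eight bytes each.

// Illustrative scalar equivalent (assumed contract, not part of this patch).
// Relies on libyuv's uint8 typedef from basic_types.h, included above.
static void TransposeWx8_Ref(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  for (int i = 0; i < width; ++i) {    // source column i ...
    for (int j = 0; j < 8; ++j) {      // ... becomes 8 bytes of dst row i
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}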
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set noat \n"
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"srl $AT, %[width], 0x2 \n"
"andi $t0, %[dst], 0x3 \n"
"andi $t1, %[dst_stride], 0x3 \n"
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
"addu $s2, $s1, %[dst_stride] \n"
"sw $s4, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $s6, 0($s0) \n"
"sw $t8, 4($s0) \n"
"sw $s5, 0($s1) \n"
"sw $t1, 4($s1) \n"
"sw $s7, 0($s2) \n"
"sw $t9, 4($s2) \n"
"addiu $AT, -1 \n"
"addiu %[src], 4 \n"
"bnez $AT, 1b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
"addu $s2, $s1, %[dst_stride] \n"
"swr $s4, 0(%[dst]) \n"
"swl $s4, 3(%[dst]) \n"
"swr $t0, 4(%[dst]) \n"
"swl $t0, 7(%[dst]) \n"
"swr $s6, 0($s0) \n"
"swl $s6, 3($s0) \n"
"swr $t8, 4($s0) \n"
"swl $t8, 7($s0) \n"
"swr $s5, 0($s1) \n"
"swl $s5, 3($s1) \n"
"swr $t1, 4($s1) \n"
"swl $t1, 7($s1) \n"
"swr $s7, 0($s2) \n"
"swl $s7, 3($s2) \n"
"swr $t9, 4($s2) \n"
"swl $t9, 7($s2) \n"
"addiu $AT, -1 \n"
"addiu %[src], 4 \n"
"bnez $AT, 11b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"2: \n"
".set pop \n"
".set at \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4",
"s5", "s6", "s7"
);
}
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"subu $t7, $t9, %[src_stride] \n"
"srl $t1, %[width], 1 \n"
// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
"andi $t0, %[dst_a], 0x3 \n"
"andi $t8, %[dst_b], 0x3 \n"
"or $t0, $t0, $t8 \n"
"andi $t8, %[dst_stride_a], 0x3 \n"
"andi $s5, %[dst_stride_b], 0x3 \n"
"or $t8, $t8, $s5 \n"
"or $t0, $t0, $t8 \n"
"bnez $t0, 11f \n"
" nop \n"
// dst + dst_stride word aligned (both, a & b dst addresses)
"1: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"sw $s3, 0($s5) \n"
"sw $s4, 0($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"sw $s3, 0(%[dst_a]) \n"
"sw $s4, 0(%[dst_b]) \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"sw $s3, 4($s5) \n"
"sw $s4, 4($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
"sll $t0, %[dst_stride_a], 1 \n"
"sll $t8, %[dst_stride_b], 1 \n"
"sw $s3, 4(%[dst_a]) \n"
"sw $s4, 4(%[dst_b]) \n"
"addu %[dst_a], %[dst_a], $t0 \n"
"bnez $t1, 1b \n"
" addu %[dst_b], %[dst_b], $t8 \n"
"b 2f \n"
" nop \n"
// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
"11: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"swr $s3, 0($s5) \n"
"swl $s3, 3($s5) \n"
"swr $s4, 0($s6) \n"
"swl $s4, 3($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"swr $s3, 0(%[dst_a]) \n"
"swl $s3, 3(%[dst_a]) \n"
"swr $s4, 0(%[dst_b]) \n"
"swl $s4, 3(%[dst_b]) \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"swr $s3, 4($s5) \n"
"swl $s3, 7($s5) \n"
"swr $s4, 4($s6) \n"
"swl $s4, 7($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
"sll $t0, %[dst_stride_a], 1 \n"
"sll $t8, %[dst_stride_b], 1 \n"
"swr $s3, 4(%[dst_a]) \n"
"swl $s3, 7(%[dst_a]) \n"
"swr $s4, 4(%[dst_b]) \n"
"swl $s4, 7(%[dst_b]) \n"
"addu %[dst_a], %[dst_a], $t0 \n"
"bnez $t1, 11b \n"
" addu %[dst_b], %[dst_b], $t8 \n"
"2: \n"
".set pop \n"
: [src] "+r" (src),
[dst_a] "+r" (dst_a),
[dst_b] "+r" (dst_b),
[width] "+r" (width),
[src_stride] "+r" (src_stride)
: [dst_stride_a] "r" (dst_stride_a),
[dst_stride_b] "r" (dst_stride_b)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
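And the analogous scalar sketch for the interleaved UV case (assumed semantics, illustrative name, not part of the patch): each source byte pair is split so dst_a receives the transposed first-byte plane and dst_b the second.

// Illustrative scalar equivalent (assumed semantics, not part of this patch).
static void TransposeUVWx8_Ref(const uint8* src, int src_stride,
                               uint8* dst_a, int dst_stride_a,
                               uint8* dst_b, int dst_stride_b, int width) {
  for (int i = 0; i < width; ++i) {    // width counts UV pairs
    for (int j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];
    }
  }
}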
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
...@@ -16,6 +16,13 @@ extern "C" { ...@@ -16,6 +16,13 @@ extern "C" {
#endif #endif
#if !defined(YUV_DISABLE_ASM) && defined(__mips__) #if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#if defined HAS_COPYROW_MIPS
extern "C" void memcpy_MIPS(uint8* dst, const uint8* src, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
memcpy_MIPS(dst, src, count);
}
#endif
#ifdef HAS_SPLITUV_MIPS_DSPR2 #ifdef HAS_SPLITUV_MIPS_DSPR2
void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
...@@ -166,6 +173,400 @@ void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, ...@@ -166,6 +173,400 @@ void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
} }
#endif // HAS_SPLITUV_MIPS_DSPR2 #endif // HAS_SPLITUV_MIPS_DSPR2
#ifdef HAS_MIRRORROW_MIPS_DSPR2
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"andi $t5, %[width], 0xf \n"
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
"lw $t2, -8(%[src]) \n" // |11|10|9|8|
"lw $t3, -4(%[src]) \n" // |15|14|13|12|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t1, $t1 \n" // |6|7|4|5|
"wsbh $t2, $t2 \n" // |10|11|8|9|
"wsbh $t3, $t3 \n" // |14|15|12|13|
"rotr $t0, $t0, 16 \n" // |0|1|2|3|
"rotr $t1, $t1, 16 \n" // |4|5|6|7|
"rotr $t2, $t2, 16 \n" // |8|9|10|11|
"rotr $t3, $t3, 16 \n" // |12|13|14|15|
"addiu %[src], %[src], -16 \n"
"addiu $t4, $t4, -1 \n"
"sw $t3, 0(%[dst]) \n" // |15|14|13|12|
"sw $t2, 4(%[dst]) \n" // |11|10|9|8|
"sw $t1, 8(%[dst]) \n" // |7|6|5|4|
"sw $t0, 12(%[dst]) \n" // |3|2|1|0|
"bgtz $t4, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"beqz $t5, 3f \n"
" nop \n"
"2: \n"
"lbu $t0, -1(%[src]) \n"
"addiu $t5, $t5, -1 \n"
"addiu %[src], %[src], -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgez $t5, 2b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src] "+r" (src), [dst] "+r" (dst)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4", "t5"
);
}
#endif // HAS_MIRRORROW_MIPS_DSPR2
#ifdef HAS_MIRRORROWUV_MIPS_DSPR2
void MirrorRowUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x = 0;
int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"addu $t4, %[width], %[width] \n"
"srl %[x], %[width], 4 \n"
"andi %[y], %[width], 0xf \n"
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
"lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
"lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
"lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
"lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
"lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
"lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
"rotr $t0, $t0, 16 \n" // |1|0|3|2|
"rotr $t1, $t1, 16 \n" // |5|4|7|6|
"rotr $t2, $t2, 16 \n" // |9|8|11|10|
"rotr $t3, $t3, 16 \n" // |13|12|15|14|
"rotr $t4, $t4, 16 \n" // |17|16|19|18|
"rotr $t6, $t6, 16 \n" // |21|20|23|22|
"rotr $t7, $t7, 16 \n" // |25|24|27|26|
"rotr $t8, $t8, 16 \n" // |29|28|31|30|
"precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
"precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
"precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
"precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
"precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
"precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
"precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
"precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
"addiu %[src_uv], %[src_uv], -32 \n"
"addiu %[x], %[x], -1 \n"
"swr $t4, 0(%[dst_u]) \n"
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
"swr $t6, 0(%[dst_v]) \n"
"swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
"swr $t3, 4(%[dst_v]) \n"
"swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
"swr $t0, 8(%[dst_u]) \n"
"swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
"swr $t1, 8(%[dst_v]) \n"
"swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
"swr $t9, 12(%[dst_u]) \n"
"swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
"swr $t5, 12(%[dst_v]) \n"
"swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz %[x], 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[y], 3f \n"
" nop \n"
"b 2f \n"
" nop \n"
"2: \n"
"lbu $t0, -2(%[src_uv]) \n"
"lbu $t1, -1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], -2 \n"
"addiu %[y], %[y], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[y], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9"
);
}
#endif // HAS_MIRRORROWUV_MIPS_DSPR2
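The two mirror routines above reduce to very small scalar loops; the assumed semantics are sketched here with illustrative names (not part of the patch):

// Illustrative scalar equivalents (assumed semantics, not part of this patch).
static void MirrorRow_Ref(const uint8* src, uint8* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];  // reverse the row
  }
}
static void MirrorRowUV_Ref(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int width) {
  for (int x = 0; x < width; ++x) {  // width counts UV pairs
    dst_u[x] = src_uv[(width - 1 - x) * 2 + 0];
    dst_v[x] = src_uv[(width - 1 - x) * 2 + 1];
  }
}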
// Convert 4 Y samples and 2 U/V samples (I422) to RGB and arrange the values into:
// t5 = | 0 | B0 | 0 | b0 |
// t4 = | 0 | B1 | 0 | b1 |
// t9 = | 0 | G0 | 0 | g0 |
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define I422ToTransientMipsRGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
"preceu.ph.qbr $t1, $t1 \n" \
"preceu.ph.qbr $t2, $t2 \n" \
"preceu.ph.qbra $t3, $t0 \n" \
"preceu.ph.qbla $t0, $t0 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t3, $t3, $s4 \n" \
"subu.ph $t0, $t0, $s4 \n" \
"mul.ph $t3, $t3, $s0 \n" \
"mul.ph $t0, $t0, $s0 \n" \
"shll.ph $t4, $t1, 0x7 \n" \
"subu.ph $t4, $t4, $t1 \n" \
"mul.ph $t6, $t1, $s1 \n" \
"mul.ph $t1, $t2, $s2 \n" \
"addq_s.ph $t5, $t4, $t3 \n" \
"addq_s.ph $t4, $t4, $t0 \n" \
"shra.ph $t5, $t5, 6 \n" \
"shra.ph $t4, $t4, 6 \n" \
"addiu %[u_buf], 2 \n" \
"addiu %[v_buf], 2 \n" \
"addu.ph $t6, $t6, $t1 \n" \
"mul.ph $t1, $t2, $s3 \n" \
"addu.ph $t9, $t6, $t3 \n" \
"addu.ph $t8, $t6, $t0 \n" \
"shra.ph $t9, $t9, 6 \n" \
"shra.ph $t8, $t8, 6 \n" \
"addu.ph $t2, $t1, $t3 \n" \
"addu.ph $t1, $t1, $t0 \n" \
"shra.ph $t2, $t2, 6 \n" \
"shra.ph $t1, $t1, 6 \n" \
"subu.ph $t5, $t5, $s5 \n" \
"subu.ph $t4, $t4, $s5 \n" \
"subu.ph $t9, $t9, $s5 \n" \
"subu.ph $t8, $t8, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"shll_s.ph $t5, $t5, 8 \n" \
"shll_s.ph $t4, $t4, 8 \n" \
"shll_s.ph $t9, $t9, 8 \n" \
"shll_s.ph $t8, $t8, 8 \n" \
"shll_s.ph $t2, $t2, 8 \n" \
"shll_s.ph $t1, $t1, 8 \n" \
"shra.ph $t5, $t5, 8 \n" \
"shra.ph $t4, $t4, 8 \n" \
"shra.ph $t9, $t9, 8 \n" \
"shra.ph $t8, $t8, 8 \n" \
"shra.ph $t2, $t2, 8 \n" \
"shra.ph $t1, $t1, 8 \n" \
"addu.ph $t5, $t5, $s5 \n" \
"addu.ph $t4, $t4, $s5 \n" \
"addu.ph $t9, $t9, $s5 \n" \
"addu.ph $t8, $t8, $s5 \n" \
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128| // clipping
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
"1: \n"
I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
"addiu %[width], -4 \n"
"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n\t"
".set noreorder \n\t"
"beqz %[width], 2f \n\t"
" repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n\t" // |0|16|0|16|
"repl.ph $s5, 128 \n\t" // |128|128|
"lui $s6, 0xff00 \n\t"
"ori $s6, 0xff00 \n\t" // |ff|00|ff|00|
"1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
"precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1|
"precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0|
"precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0|
"precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0|
"precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0|
"addiu %[width], -4 \n\t"
"addiu %[y_buf], 4 \n\t"
"preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0|
"preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0|
"or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0|
"or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0|
"precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1|
"precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1|
"sll $t9, $t9, 16 \n\t"
"sll $t8, $t8, 16 \n\t"
"packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0|
"packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n\t"
"sw $t0, 4(%[rgb_buf]) \n\t"
"sw $t1, 8(%[rgb_buf]) \n\t"
"sw $t3, 12(%[rgb_buf]) \n\t"
"bnez %[width], 1b \n\t"
" addiu %[rgb_buf], 16 \n\t"
"2: \n\t"
".set pop \n\t"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
"1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
"precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
"precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
"precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
"precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
"sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
"or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
"or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
"precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
"precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
"sll $t1, $t1, 16 \n"
"sll $t2, $t2, 16 \n"
"packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
"packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
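The three row converters above share one fixed-point YUV-to-RGB stage (the I422ToTransientMipsRGB macro) and differ only in how the resulting bytes are packed. As a rough C sketch, not part of the patch: the gains below are the repl.ph constants visible above (YG=74, UG=-25, VG=-52, VR=102, biases 16 and 128); the blue gain is assumed to be 127, matching libyuv's C path, since the actual value sits inside the macro and is not shown here. ClampByte and YuvPixelSketch are hypothetical names, and unsigned char stands in for libyuv's uint8.
// Sketch only: per-pixel math approximated by the DSPR2 row functions above.
static unsigned char ClampByte(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixelSketch(unsigned char y, unsigned char u, unsigned char v,
                           unsigned char* b, unsigned char* g, unsigned char* r) {
  int y1 = (y - 16) * 74;                                       // YG, Q6 gain
  *b = ClampByte((y1 + (u - 128) * 127) >> 6);                  // UB (assumed)
  *g = ClampByte((y1 - (u - 128) * 25 - (v - 128) * 52) >> 6);  // UG, VG
  *r = ClampByte((y1 + (v - 128) * 102) >> 6);                  // VR
}
ARGB, ABGR and BGRA then only reorder where the 0xff alpha and the B/G/R bytes land, which is what the precr/precrq/packrl sequences above do four pixels at a time.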
#endif // __mips__
#ifdef __cplusplus
@@ -1957,6 +1957,26 @@ void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
const unsigned char* src_ptr,
ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
#define HAS_SCALEROWDOWN4_MIPS_DSPR2
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN34_MIPS_DSPR2
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
#define HAS_SCALEROWDOWN38_MIPS_DSPR2
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
// CPU agnostic row functions
@@ -2331,7 +2351,7 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown2 = filtering ?
ScaleRowDown2Int_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
}
#endif
@@ -2368,6 +2388,13 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
}
#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Int_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
}
#endif
for (int y = 0; y < dst_height; ++y) {
@@ -2461,6 +2488,19 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_MIPS_DSPR2;
}
}
#endif
for (int y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
@@ -2541,6 +2581,18 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
}
}
#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Int_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_MIPS_DSPR2;
}
}
#endif
for (int y = 0; y < dst_height - 2; y += 3) {
@@ -173,6 +173,460 @@ void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 3 \n"
"beqz $t9, 2f \n"
" nop \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
"precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
"precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
"precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
"precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
"precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
"precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t1, 0(%[dst]) \n"
"sw $t5, 4(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"andi $t9, %[dst_width], 7 \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lbu $t1, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -1 \n"
"sb $t1, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
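For reference, the unfiltered 1/4 row scaler above point-samples every fourth byte (pixels 0, 4, 8, ... per the packing comments). A minimal C sketch, not part of the patch; the name is hypothetical and unsigned char stands in for uint8:
// Keep every fourth source byte; the residue tail matches the asm's "21:" loop.
static void ScaleRowDown4_Sketch(const unsigned char* src_ptr,
                                 unsigned char* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 4];
  }
}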
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
const uint8* s3 = s2 + stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 4(%[s1]) \n" // |23|22|21|20|
"lw $t6, 4(%[s2]) \n" // |27|26|25|24|
"lw $t7, 4(%[s3]) \n" // |31|30|29|28|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
"raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
"raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
"raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"add $t4, $t4, $t5 \n"
"add $t6, $t6, $t7 \n"
"add $t4, $t4, $t6 \n"
"shra_r.w $t0, $t0, 4 \n"
"shra_r.w $t4, $t4, 4 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[s1], %[s1], 8 \n"
"addiu %[s2], %[s2], 8 \n"
"addiu %[s3], %[s3], 8 \n"
"addiu $t9, $t9, -1 \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 2 \n"
"beqz $t8, 2f \n"
" nop \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"shra_r.w $t0, $t0, 4 \n"
"sb $t0, 0(%[dst]) \n"
"2: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[s1] "+r" (s1),
[s2] "+r" (s2),
[s3] "+r" (s3)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
}
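The filtered 1/4 variant averages a 4x4 block of source bytes per output, with shra_r.w by 4 giving round-to-nearest. A C sketch of the same arithmetic, not part of the patch (hypothetical name; ptrdiff_t comes from <stddef.h>):
// Rounded 4x4 box average per output byte.
static void ScaleRowDown4Box_Sketch(const unsigned char* src_ptr,
                                    ptrdiff_t src_stride,
                                    unsigned char* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += src_ptr[row * src_stride + x * 4 + col];
      }
    }
    dst[x] = (unsigned char)((sum + 8) >> 4);  // matches shra_r.w ..., 4
  }
}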
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
"precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
"precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
"addiu %[dst_width], %[dst_width], -24 \n"
"ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
"ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
"ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
"ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
"packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
"prepend $t1, $t2, 8 \n" // |4|3|1|0|
"prepend $t3, $t4, 24 \n" // |15|13|12|11|
"prepend $t5, $t6, 8 \n" // |20|19|17|16|
"prepend $t7, $t8, 24 \n" // |31|29|28|27|
"sw $t1, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t3, 8(%[dst]) \n"
"sw $t5, 12(%[dst]) \n"
"sw $t9, 16(%[dst]) \n"
"sw $t7, 20(%[dst]) \n"
"bnez %[dst_width], 1b \n"
" addiu %[dst], %[dst], 24 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
}
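The unfiltered 3/4 scaler keeps bytes 0, 1 and 3 of every group of 4, as the |4|3|1|0|-style comments above show; the caller guarantees dst_width is a multiple of 24. A C sketch, not part of the patch, with a hypothetical name:
// Keep source bytes 0, 1, 3 out of each group of 4.
static void ScaleRowDown34_Sketch(const unsigned char* src_ptr,
                                  unsigned char* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[x + 0] = src_ptr[0];
    dst[x + 1] = src_ptr[1];
    dst[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}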
void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
"rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
"muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
"muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
"andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
"andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t1, $t1 \n"
"shra_r.w $t0, $t0, 1 \n"
"shra_r.w $t1, $t1, 1 \n"
"preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
"preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
"rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
"rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
"addu.ph $t2, $t2, $t4 \n"
"addu.ph $t6, $t6, $t5 \n"
"sll $t5, $t0, 1 \n"
"add $t0, $t5, $t0 \n"
"shra_r.ph $t2, $t2, 2 \n"
"shra_r.ph $t6, $t6, 2 \n"
"shll.ph $t4, $t2, 1 \n"
"addq.ph $t4, $t4, $t2 \n"
"addu $t0, $t0, $t1 \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"shra_r.w $t0, $t0, 2 \n"
"addu.ph $t6, $t6, $t4 \n"
"shra_r.ph $t6, $t6, 2 \n"
"srl $t1, $t6, 16 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"sb $t1, 0(%[d]) \n"
"sb $t0, 1(%[d]) \n"
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
}
void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
"rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
"muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
"muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
"andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
"andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t1, $t1 \n"
"shra_r.w $t0, $t0, 1 \n"
"shra_r.w $t1, $t1, 1 \n"
"preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
"preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
"rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
"rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
"addu.ph $t4, $t4, $t3 \n"
"addu.ph $t6, $t6, $t5 \n"
"shra_r.ph $t6, $t6, 2 \n"
"shra_r.ph $t4, $t4, 2 \n"
"addu.ph $t6, $t6, $t4 \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"shra_r.ph $t6, $t6, 1 \n"
"addu $t0, $t0, $t1 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"shra_r.w $t0, $t0, 1 \n"
"srl $t1, $t6, 16 \n"
"sb $t1, 0(%[d]) \n"
"sb $t0, 1(%[d]) \n"
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
}
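Both filtered 3/4 variants first collapse each group of 4 source bytes into three horizontal taps, for the current row and for the row at src_stride, and then blend the two rows: 3:1 in the _0 variant, 1:1 in the _1 variant. A C sketch of the _0 arithmetic, not part of the patch (hypothetical name; the _1 variant replaces the last three stores with (a + b + 1) >> 1):
// 3/4 box filter with a 3:1 vertical blend, mirroring the shra_r rounding above.
static void ScaleRowDown34_0_Box_Sketch(const unsigned char* s,
                                        ptrdiff_t src_stride,
                                        unsigned char* d, int dst_width) {
  const unsigned char* t = s + src_stride;
  for (int x = 0; x < dst_width; x += 3, s += 4, t += 4, d += 3) {
    int a0 = (s[0] * 3 + s[1] + 2) >> 2, b0 = (t[0] * 3 + t[1] + 2) >> 2;
    int a1 = (s[1] + s[2] + 1) >> 1,     b1 = (t[1] + t[2] + 1) >> 1;
    int a2 = (s[2] + s[3] * 3 + 2) >> 2, b2 = (t[2] + t[3] * 3 + 2) >> 2;
    d[0] = (unsigned char)((a0 * 3 + b0 + 2) >> 2);
    d[1] = (unsigned char)((a1 * 3 + b1 + 2) >> 2);
    d[2] = (unsigned char)((a2 * 3 + b2 + 2) >> 2);
  }
}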
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t6, $t6 \n" // |26|27|24|25|
"srl $t0, $t0, 8 \n" // |X|2|3|0|
"srl $t3, $t3, 16 \n" // |X|X|15|14|
"srl $t5, $t5, 16 \n" // |X|X|23|22|
"srl $t7, $t7, 16 \n" // |X|X|31|30|
"ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
"ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
"ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
"ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
"prepend $t2, $t3, 24 \n" // |X|15|14|11|
"ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
"ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu %[dst_width], %[dst_width], -12 \n"
"addiu $t8,%[dst_width], -12 \n"
"sw $t1, 0(%[dst]) \n"
"sw $t4, 4(%[dst]) \n"
"sw $t6, 8(%[dst]) \n"
"bgez $t8, 1b \n"
" addiu %[dst], %[dst], 12 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
}
void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
"lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
"rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
"packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
"packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
"raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
"raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
"precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
"precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
"srl $t4, $t4, 2 \n" // t4 / 4
"srl $t6, $t6, 16 \n" // |0|0|S3|T3|
"raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
"addu $t6, $t5, $t6 \n"
"mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
"sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
"sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
"raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
"raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
"addu $t0, $t0, $t2 \n"
"mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[t], %[t], 8 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"addiu %[dst_ptr], %[dst_ptr], 3 \n"
"srl $t6, $t6, 16 \n"
"srl $t0, $t0, 16 \n"
"sb $t4, -1(%[dst_ptr]) \n"
"sb $t6, -2(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[t] "+r" (t),
[dst_width] "+r" (dst_width)
: [c] "r" (c)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6"
);
}
void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
const uint8* s2 = src_ptr + stride;
const int c1 = 0x1C71;
const int c2 = 0x2AAA;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
"lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
"lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
"lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
"rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
"packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
"raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
"packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
"raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
"sll $t8, $t5, 16 \n" // |R5|R4|0|0|
"raddu.w.qb $t8, $t8 \n" // R5+R4
"addu $t7, $t7, $t8 \n"
"srl $t8, $t5, 16 \n" // |0|0|R7|R6|
"raddu.w.qb $t8, $t8 \n" // R7 + R6
"addu $t6, $t6, $t8 \n"
"mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
"precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
"precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
"srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
"raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
"addu $t7, $t7, $t8 \n"
"mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
"sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
"sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
"sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t2, $t2 \n"
"raddu.w.qb $t4, $t4 \n"
"addu $t0, $t0, $t2 \n"
"addu $t0, $t0, $t4 \n"
"mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[s1], %[s1], 8 \n"
"addiu %[s2], %[s2], 8 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"addiu %[dst_ptr], %[dst_ptr], 3 \n"
"srl $t6, $t6, 16 \n"
"srl $t7, $t7, 16 \n"
"srl $t0, $t0, 16 \n"
"sb $t6, -1(%[dst_ptr]) \n"
"sb $t7, -2(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[s1] "+r" (s1),
[s2] "+r" (s2),
[dst_width] "+r" (dst_width)
: [c1] "r" (c1), [c2] "r" (c2)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
}
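The two filtered 3/8 variants average 3-, 3- and 2-column blocks over two or three source rows, replacing the divisions with 16.16 fixed-point multiplies: 0x2AAA is roughly 65536/6 and 0x1C71 roughly 65536/9. A C sketch of the three-row arithmetic, not part of the patch (hypothetical name); the two-row variant sums only src_ptr and src_ptr + src_stride and scales by 0x2AAA, 0x2AAA and >> 2 instead:
// 3x3, 3x3 and 2x3 box averages via truncating fixed-point multiplies.
static void ScaleRowDown38_3_Box_Sketch(const unsigned char* src_ptr,
                                        ptrdiff_t src_stride,
                                        unsigned char* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3, src_ptr += 8, dst_ptr += 3) {
    int sum0 = 0, sum1 = 0, sum2 = 0;
    for (int row = 0; row < 3; ++row) {
      const unsigned char* s = src_ptr + row * src_stride;
      sum0 += s[0] + s[1] + s[2];
      sum1 += s[3] + s[4] + s[5];
      sum2 += s[6] + s[7];
    }
    dst_ptr[0] = (unsigned char)((sum0 * 0x1C71) >> 16);  // /9
    dst_ptr[1] = (unsigned char)((sum1 * 0x1C71) >> 16);  // /9
    dst_ptr[2] = (unsigned char)((sum2 * 0x2AAA) >> 16);  // /6
  }
}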
void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
const unsigned char* src_ptr,
ptrdiff_t src_stride,
@@ -630,4 +630,70 @@ TEST_F(libyuvTest, TestAffine) {
#endif
}
TEST_F(libyuvTest, TestCopyPlane) {
int err = 0;
int yw = benchmark_width_;
int yh = benchmark_height_;
int b = 12;
int i, j;
int y_plane_size = (yw + b * 2) * (yh + b * 2);
srandom(time(NULL));
align_buffer_16(orig_y, y_plane_size);
align_buffer_16(dst_c, y_plane_size);
align_buffer_16(dst_opt, y_plane_size);
memset(orig_y, 0, y_plane_size);
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 0, y_plane_size);
// Fill image buffers with random data.
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
// Fill destination buffers with random data.
for (i = 0; i < y_plane_size; ++i) {
uint8 random_number = random() & 0x7f;
dst_c[i] = random_number;
dst_opt[i] = dst_c[i];
}
int y_off = b * (yw + b * 2) + b;
int y_st = yw + b * 2;
int stride = 8;
// Disable all optimizations.
MaskCpuFlags(0);
double c_time = get_time();
for (j = 0; j < benchmark_iterations_; j++) {
CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh);
}
c_time = (get_time() - c_time) / benchmark_iterations_;
// Enable optimizations.
MaskCpuFlags(-1);
double opt_time = get_time();
for (j = 0; j < benchmark_iterations_; j++) {
CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
}
opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf(" %8d us C - %8d us OPT\n",
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
for (i = 0; i < y_plane_size; ++i) {
if (dst_c[i] != dst_opt[i])
++err;
}
free_aligned_buffer_16(orig_y);
free_aligned_buffer_16(dst_c);
free_aligned_buffer_16(dst_opt);
EXPECT_EQ(0, err);
}
} // namespace libyuv