Commit 66fe097a authored by fbarchard@google.com's avatar fbarchard@google.com

Move compare modules into their own files, and scale for mips

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/920005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@434 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 74114559
......@@ -6,26 +6,35 @@ include $(CLEAR_VARS)
LOCAL_CPP_EXTENSION := .cc
LOCAL_SRC_FILES := \
source/compare.cc \
source/convert.cc \
source/convert_from.cc \
source/convert_from_argb.cc \
source/cpu_id.cc \
source/format_conversion.cc \
source/planar_functions.cc \
source/rotate.cc \
source/row_common.cc \
source/row_posix.cc \
source/scale.cc \
source/scale_argb.cc \
source/video_common.cc
source/compare.cc \
source/compare_common.cc \
source/compare_posix.cc \
source/convert.cc \
source/convert_argb.cc \
source/convert_from.cc \
source/convert_from_argb.cc \
source/cpu_id.cc \
source/format_conversion.cc \
source/planar_functions.cc \
source/rotate.cc \
source/rotate_argb.cc \
source/row_common.cc \
source/row_mips.cc \
source/row_posix.cc \
source/scale.cc \
source/scale_argb.cc \
source/scale_mips.cc \
source/video_common.cc \
# TODO(fbarchard): Enable mjpeg decoder.
# source/mjpeg_decoder.cc
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_CFLAGS += -DLIBYUV_NEON
LOCAL_SRC_FILES += \
source/compare_neon.cc.neon \
source/rotate_neon.cc.neon \
source/row_neon.cc.neon \
source/rotate_neon.cc.neon \
source/row_neon.cc.neon \
source/scale_neon.cc.neon
endif
......
......@@ -87,6 +87,7 @@
'source/row_win.cc',
'source/scale.cc',
'source/scale_argb.cc',
'source/scale_mips.cc',
'source/scale_neon.cc',
'source/video_common.cc',
],
......
......@@ -27,192 +27,15 @@ extern "C" {
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
// Portable djb2 hash: folds |count| bytes starting at |src| into |seed|,
// computing hash = hash * 33 + byte for every byte (32-bit wraparound).
// A non-positive |count| returns |seed| unchanged.
static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
  uint32 digest = seed;
  const uint8* cursor = src;
  int remaining = count;
  while (remaining > 0) {
    // (digest << 5) + digest == digest * 33.
    digest = (digest << 5) + digest + *cursor;
    ++cursor;
    --remaining;
  }
  return digest;
}
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
// This module is for Visual C x86
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41
static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static const uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
static const uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
static const uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
static const uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
// SSE4.1 djb2 hash for Visual C x86 (naked function, cdecl stack arguments).
// Each loop pass folds 16 source bytes into the running hash:
//   hash = hash * 33^16 + sum(src[i] * 33^(15 - i)), i = 0..15
// The loop body always runs at least once and consumes 16 bytes per pass, so
// callers presumably pass a positive count that is a multiple of 16 - confirm
// against the call site.
// pmulld is emitted as raw opcode bytes through the pmulld() macro above.
__declspec(naked) __declspec(align(16))
static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
align 16
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
movdqa xmm5, kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld(0xdd) // pmulld xmm3, xmm5
movdqa xmm5, kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld(0xe5) // pmulld xmm4, xmm5
movdqa xmm5, kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld(0xd5) // pmulld xmm2, xmm5
movdqa xmm5, kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16 // sets the flags consumed by jg below
paddd xmm1, xmm3
pshufd xmm2, xmm1, 14 // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 1 // dword 1 down to dword 0
paddd xmm1, xmm2 // dword 0 now holds the sum of all 16 products
paddd xmm0, xmm1
jg wloop // SSE ops above leave EFLAGS untouched
movd eax, xmm0 // return hash
ret
}
}
#elif !defined(YUV_DISABLE_ASM) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
#define HAS_HASHDJB2_SSE41
CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
CONST uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
CONST uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
CONST uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
CONST uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
// SSE4.1 djb2 hash (GCC inline asm). Each loop pass folds 16 source bytes
// into the running hash:
//   hash = hash * 33^16 + sum(src[i] * 33^(15 - i)), i = 0..15
// The loop body always runs at least once and consumes 16 bytes per pass, so
// callers presumably pass a positive count that is a multiple of 16 - confirm
// against the call site.
static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
uint32 hash;
asm volatile (
"movd %2,%%xmm0 \n" // xmm0 lane 0 = running hash (seed)
"pxor %%xmm7,%%xmm7 \n" // xmm7 = 0, used to zero-extend on unpack
"movdqa %4,%%xmm6 \n" // xmm6 = { 33^16, 0, 0, 0 }
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm1 \n" // src[0-15], unaligned load
"lea 0x10(%0),%0 \n"
"pmulld %%xmm6,%%xmm0 \n" // hash *= 33^16
"movdqa %5,%%xmm5 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm7,%%xmm2 \n" // src[0-7] as words
"movdqa %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm7,%%xmm3 \n" // src[0-3] as dwords
"pmulld %%xmm5,%%xmm3 \n" // * 33^15..33^12
"movdqa %6,%%xmm5 \n"
"movdqa %%xmm2,%%xmm4 \n"
"punpckhwd %%xmm7,%%xmm4 \n" // src[4-7] as dwords
"pmulld %%xmm5,%%xmm4 \n" // * 33^11..33^8
"movdqa %7,%%xmm5 \n"
"punpckhbw %%xmm7,%%xmm1 \n" // src[8-15] as words
"movdqa %%xmm1,%%xmm2 \n"
"punpcklwd %%xmm7,%%xmm2 \n" // src[8-11] as dwords
"pmulld %%xmm5,%%xmm2 \n" // * 33^7..33^4
"movdqa %8,%%xmm5 \n"
"punpckhwd %%xmm7,%%xmm1 \n" // src[12-15] as dwords
"pmulld %%xmm5,%%xmm1 \n" // * 33^3..33^0
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
"sub $0x10,%1 \n" // sets the flags consumed by jg below
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n" // upper 2 dwords down to the lower 2
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n" // dword 1 down to dword 0
"paddd %%xmm2,%%xmm1 \n" // dword 0 = sum of all 16 products
"paddd %%xmm1,%%xmm0 \n"
"jg 1b \n" // SSE ops above leave EFLAGS untouched
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
"=g"(hash) // %3
: "m"(kHash16x33), // %4
"m"(kHashMul0), // %5
"m"(kHashMul1), // %6
"m"(kHashMul2), // %7
"m"(kHashMul3) // %8
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
return hash;
}
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#endif // HAS_HASHDJB2_SSE41
// hash seed of 5381 recommended.
......
......@@ -24,6 +24,16 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
// Portable djb2 hash: folds |count| bytes starting at |src| into |seed|,
// computing hash = hash * 33 + byte for every byte (32-bit wraparound).
// A non-positive |count| returns |seed| unchanged.
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
  uint32 digest = seed;
  const uint8* cursor = src;
  int remaining = count;
  while (remaining > 0) {
    // (digest << 5) + digest == digest * 33.
    digest = (digest << 5) + digest + *cursor;
    ++cursor;
    --remaining;
  }
  return digest;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -9,6 +9,7 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
......@@ -63,6 +64,100 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
#endif // defined(__x86_64__) || defined(__i386__)
#if !defined(YUV_DISABLE_ASM) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
#define HAS_HASHDJB2_SSE41
CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
CONST uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
CONST uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
CONST uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
CONST uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
// SSE4.1 djb2 hash (GCC inline asm). Each loop pass folds 16 source bytes
// into the running hash:
//   hash = hash * 33^16 + sum(src[i] * 33^(15 - i)), i = 0..15
// The loop body always runs at least once and consumes 16 bytes per pass, so
// callers presumably pass a positive count that is a multiple of 16 - confirm
// against the call site.
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
uint32 hash;
asm volatile (
"movd %2,%%xmm0 \n" // xmm0 lane 0 = running hash (seed)
"pxor %%xmm7,%%xmm7 \n" // xmm7 = 0, used to zero-extend on unpack
"movdqa %4,%%xmm6 \n" // xmm6 = { 33^16, 0, 0, 0 }
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm1 \n" // src[0-15], unaligned load
"lea 0x10(%0),%0 \n"
"pmulld %%xmm6,%%xmm0 \n" // hash *= 33^16
"movdqa %5,%%xmm5 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm7,%%xmm2 \n" // src[0-7] as words
"movdqa %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm7,%%xmm3 \n" // src[0-3] as dwords
"pmulld %%xmm5,%%xmm3 \n" // * 33^15..33^12
"movdqa %6,%%xmm5 \n"
"movdqa %%xmm2,%%xmm4 \n"
"punpckhwd %%xmm7,%%xmm4 \n" // src[4-7] as dwords
"pmulld %%xmm5,%%xmm4 \n" // * 33^11..33^8
"movdqa %7,%%xmm5 \n"
"punpckhbw %%xmm7,%%xmm1 \n" // src[8-15] as words
"movdqa %%xmm1,%%xmm2 \n"
"punpcklwd %%xmm7,%%xmm2 \n" // src[8-11] as dwords
"pmulld %%xmm5,%%xmm2 \n" // * 33^7..33^4
"movdqa %8,%%xmm5 \n"
"punpckhwd %%xmm7,%%xmm1 \n" // src[12-15] as dwords
"pmulld %%xmm5,%%xmm1 \n" // * 33^3..33^0
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
"sub $0x10,%1 \n" // sets the flags consumed by jg below
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n" // upper 2 dwords down to the lower 2
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n" // dword 1 down to dword 0
"paddd %%xmm2,%%xmm1 \n" // dword 0 = sum of all 16 products
"paddd %%xmm1,%%xmm0 \n"
"jg 1b \n" // SSE ops above leave EFLAGS untouched
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
"=g"(hash) // %3
: "m"(kHash16x33), // %4
"m"(kHashMul0), // %5
"m"(kHashMul1), // %6
"m"(kHashMul2), // %7
"m"(kHashMul3) // %8
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
return hash;
}
#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -9,6 +9,7 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
......@@ -55,6 +56,91 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
}
}
#define HAS_HASHDJB2_SSE41
static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static const uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
static const uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
static const uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
static const uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
// SSE4.1 djb2 hash for Visual C x86 (naked function, cdecl stack arguments).
// Each loop pass folds 16 source bytes into the running hash:
//   hash = hash * 33^16 + sum(src[i] * 33^(15 - i)), i = 0..15
// The loop body always runs at least once and consumes 16 bytes per pass, so
// callers presumably pass a positive count that is a multiple of 16 - confirm
// against the call site.
// pmulld is emitted as raw opcode bytes through the pmulld() macro above.
__declspec(naked) __declspec(align(16))
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
align 16
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
movdqa xmm5, kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld(0xdd) // pmulld xmm3, xmm5
movdqa xmm5, kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld(0xe5) // pmulld xmm4, xmm5
movdqa xmm5, kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld(0xd5) // pmulld xmm2, xmm5
movdqa xmm5, kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16 // sets the flags consumed by jg below
paddd xmm1, xmm3
pshufd xmm2, xmm1, 14 // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 1 // dword 1 down to dword 0
paddd xmm1, xmm2 // dword 0 now holds the sum of all 16 products
paddd xmm0, xmm1
jg wloop // SSE ops above leave EFLAGS untouched
movd eax, xmm0 // return hash
ret
}
}
#endif // _M_IX86
#ifdef __cplusplus
......
......@@ -1947,224 +1947,16 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
#endif // defined(__x86_64__) || defined(__i386__)
#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SCALEROWDOWN2_MIPS_DSPR2
// Point-samples one row down by 2 using MIPS DSPR2: keeps source bytes
// 0, 2, 4, ... and writes |dst_width| bytes to |dst|. The stride argument is
// unused because only a single source row is read.
// The main loop produces 16 output bytes per pass from 32 source bytes using
// word loads/stores, so |src_ptr| and |dst| presumably need 4-byte alignment
// when dst_width >= 16 - TODO confirm with callers. The remaining
// (dst_width & 15) pixels are copied one byte at a time.
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                              uint8* dst, int dst_width) {
  __asm__ __volatile__(
    ".set push \n"
    ".set noreorder \n"
    "srl $t9, %[dst_width], 4 \n"  // passes of 16 output pixels
    "beqz $t9, 2f \n"
    " nop \n"
    "1: \n"
    "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
    "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
    "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
    "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
    "lw $t4, 16(%[src_ptr]) \n"  // |19|18|17|16|
    "lw $t5, 20(%[src_ptr]) \n"  // |23|22|21|20|
    "lw $t6, 24(%[src_ptr]) \n"  // |27|26|25|24|
    "lw $t7, 28(%[src_ptr]) \n"  // |31|30|29|28|
    "precr.qb.ph $t8, $t1, $t0 \n"  // |6|4|2|0|
    "precr.qb.ph $t0, $t3, $t2 \n"  // |14|12|10|8|
    "precr.qb.ph $t1, $t5, $t4 \n"  // |22|20|18|16|
    "precr.qb.ph $t2, $t7, $t6 \n"  // |30|28|26|24|
    "addiu %[src_ptr], %[src_ptr], 32 \n"
    "addiu $t9, $t9, -1 \n"
    "sw $t8, 0(%[dst]) \n"
    "sw $t0, 4(%[dst]) \n"
    "sw $t1, 8(%[dst]) \n"
    "sw $t2, 12(%[dst]) \n"
    "bgtz $t9, 1b \n"
    " addiu %[dst], %[dst], 16 \n"  // branch delay slot
    "2: \n"
    "andi $t9, %[dst_width], 0xf \n"  // residue
    "beqz $t9, 3f \n"
    " nop \n"
    "21: \n"
    "lbu $t0, 0(%[src_ptr]) \n"
    "addiu %[src_ptr], %[src_ptr], 2 \n"
    "addiu $t9, $t9, -1 \n"
    "sb $t0, 0(%[dst]) \n"
    "bgtz $t9, 21b \n"
    " addiu %[dst], %[dst], 1 \n"  // branch delay slot
    "3: \n"
    ".set pop \n"
    : [src_ptr] "+r" (src_ptr),
      [dst] "+r" (dst)
    : [dst_width] "r" (dst_width)
    // "memory": the asm reads and writes buffers that are not listed as
    // operands, so the compiler must not cache or reorder around it.
    : "memory", "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9"
  );
}
uint8* dst, int dst_width);
// Box-filters two rows down by 2 using MIPS DSPR2: every output byte is the
// rounded average (sum + 2) >> 2 of a 2x2 block taken from the row at
// |src_ptr| and the row |src_stride| bytes after it.
// The main loop produces 8 output bytes per pass using word loads, so both
// source rows presumably need 4-byte alignment when dst_width >= 8 - TODO
// confirm with callers.
// NOTE(review): the residue loop emits 2 output bytes per pass, so an odd
// residue (dst_width & 7) stores one byte beyond dst_width.
void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                 uint8* dst, int dst_width) {
  const uint8* t = src_ptr + src_stride;
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "srl $t9, %[dst_width], 3 \n"  // iterations -> step 8
    // srl never yields a negative value, so the skip test must be "== 0";
    // otherwise dst_width < 8 would still run one full 8-pixel pass.
    "beqz $t9, 2f \n"
    " nop \n"
    "1: \n"
    "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
    "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
    "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
    "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
    "lw $t4, 0(%[t]) \n"  // |19|18|17|16|
    "lw $t5, 4(%[t]) \n"  // |23|22|21|20|
    "lw $t6, 8(%[t]) \n"  // |27|26|25|24|
    "lw $t7, 12(%[t]) \n"  // |31|30|29|28|
    "addiu $t9, $t9, -1 \n"
    "srl $t8, $t0, 16 \n"  // |X|X|3|2|
    "ins $t0, $t4, 16, 16 \n"  // |17|16|1|0|
    "ins $t4, $t8, 0, 16 \n"  // |19|18|3|2|
    "raddu.w.qb $t0, $t0 \n"  // |17+16+1+0|
    "raddu.w.qb $t4, $t4 \n"  // |19+18+3+2|
    "shra_r.w $t0, $t0, 2 \n"  // |t0+2|>>2
    "shra_r.w $t4, $t4, 2 \n"  // |t4+2|>>2
    "srl $t8, $t1, 16 \n"  // |X|X|7|6|
    "ins $t1, $t5, 16, 16 \n"  // |21|20|5|4|
    "ins $t5, $t8, 0, 16 \n"  // |23|22|7|6|
    "raddu.w.qb $t1, $t1 \n"  // |21+20+5+4|
    "raddu.w.qb $t5, $t5 \n"  // |23+22+7+6|
    "shra_r.w $t1, $t1, 2 \n"  // |t1+2|>>2
    "shra_r.w $t5, $t5, 2 \n"  // |t5+2|>>2
    "srl $t8, $t2, 16 \n"  // |X|X|11|10|
    "ins $t2, $t6, 16, 16 \n"  // |25|24|9|8|
    "ins $t6, $t8, 0, 16 \n"  // |27|26|11|10|
    "raddu.w.qb $t2, $t2 \n"  // |25+24+9+8|
    "raddu.w.qb $t6, $t6 \n"  // |27+26+11+10|
    "shra_r.w $t2, $t2, 2 \n"  // |t2+2|>>2
    "shra_r.w $t6, $t6, 2 \n"  // |t6+2|>>2
    "srl $t8, $t3, 16 \n"  // |X|X|15|14|
    "ins $t3, $t7, 16, 16 \n"  // |29|28|13|12|
    "ins $t7, $t8, 0, 16 \n"  // |31|30|15|14|
    "raddu.w.qb $t3, $t3 \n"  // |29+28+13+12|
    "raddu.w.qb $t7, $t7 \n"  // |31+30+15+14|
    "shra_r.w $t3, $t3, 2 \n"  // |t3+2|>>2
    "shra_r.w $t7, $t7, 2 \n"  // |t7+2|>>2
    "addiu %[src_ptr], %[src_ptr], 16 \n"
    "addiu %[t], %[t], 16 \n"
    "sb $t0, 0(%[dst]) \n"
    "sb $t4, 1(%[dst]) \n"
    "sb $t1, 2(%[dst]) \n"
    "sb $t5, 3(%[dst]) \n"
    "sb $t2, 4(%[dst]) \n"
    "sb $t6, 5(%[dst]) \n"
    "sb $t3, 6(%[dst]) \n"
    "sb $t7, 7(%[dst]) \n"
    "bgtz $t9, 1b \n"
    " addiu %[dst], %[dst], 8 \n"  // branch delay slot
    "2: \n"
    "andi $t9, %[dst_width], 0x7 \n"  // x = residue
    "beqz $t9, 3f \n"
    " nop \n"
    "21: \n"
    "lwr $t1, 0(%[src_ptr]) \n"  // lwr/lwl pair: unaligned word load
    "lwl $t1, 3(%[src_ptr]) \n"
    "lwr $t2, 0(%[t]) \n"
    "lwl $t2, 3(%[t]) \n"
    "srl $t8, $t1, 16 \n"
    "ins $t1, $t2, 16, 16 \n"
    "ins $t2, $t8, 0, 16 \n"
    "raddu.w.qb $t1, $t1 \n"
    "raddu.w.qb $t2, $t2 \n"
    "shra_r.w $t1, $t1, 2 \n"
    "shra_r.w $t2, $t2, 2 \n"
    "sb $t1, 0(%[dst]) \n"
    "sb $t2, 1(%[dst]) \n"
    "addiu %[src_ptr], %[src_ptr], 4 \n"
    "addiu $t9, $t9, -2 \n"
    "addiu %[t], %[t], 4 \n"
    "bgtz $t9, 21b \n"
    " addiu %[dst], %[dst], 2 \n"  // branch delay slot
    "3: \n"
    ".set pop \n"
    : [src_ptr] "+r" (src_ptr),
      [dst] "+r" (dst), [t] "+r" (t)
    : [dst_width] "r" (dst_width)
    // "memory": the asm reads and writes buffers that are not listed as
    // operands, so the compiler must not cache or reorder around it.
    : "memory", "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9"
  );
}
uint8* dst, int dst_width);
#define HAS_SCALEFILTERROWS_MIPS_DSPR2
// Vertically blends two rows using MIPS DSPR2. For every byte:
//   dst = (row0 * (256 - source_y_fraction) + row1 * source_y_fraction) >> 8
// where row0 starts at |src_ptr| and row1 is |src_stride| bytes after it.
// The loop has no entry check and handles 8 pixels per pass with word
// loads/stores, so |dst_width| is presumably a positive multiple of 8 and the
// buffers 4-byte aligned - TODO confirm with callers.
// After the loop the last produced byte is duplicated into the next dst
// position, so one byte more than the processed width is written.
static void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
                                       const unsigned char* src_ptr,
                                       ptrdiff_t src_stride,
                                       int dst_width, int source_y_fraction) {
  int y0_fraction = 256 - source_y_fraction;
  const unsigned char* src_ptr1 = src_ptr + src_stride;
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "replv.ph $t0, %[y0_fraction] \n"  // row0 weight in both halfwords
    "replv.ph $t1, %[source_y_fraction] \n"  // row1 weight in both halfwords
    "1: \n"
    "lw $t2, 0(%[src_ptr]) \n"
    "lw $t3, 0(%[src_ptr1]) \n"
    "lw $t4, 4(%[src_ptr]) \n"
    "lw $t5, 4(%[src_ptr1]) \n"
    "muleu_s.ph.qbl $t6, $t2, $t0 \n"  // row0 bytes 3,2 * weight
    "muleu_s.ph.qbr $t7, $t2, $t0 \n"  // row0 bytes 1,0 * weight
    "muleu_s.ph.qbl $t8, $t3, $t1 \n"  // row1 bytes 3,2 * weight
    "muleu_s.ph.qbr $t9, $t3, $t1 \n"  // row1 bytes 1,0 * weight
    "muleu_s.ph.qbl $t2, $t4, $t0 \n"
    "muleu_s.ph.qbr $t3, $t4, $t0 \n"
    "muleu_s.ph.qbl $t4, $t5, $t1 \n"
    "muleu_s.ph.qbr $t5, $t5, $t1 \n"
    "addq.ph $t6, $t6, $t8 \n"
    "addq.ph $t7, $t7, $t9 \n"
    "addq.ph $t2, $t2, $t4 \n"
    "addq.ph $t3, $t3, $t5 \n"
    "shra.ph $t6, $t6, 8 \n"
    "shra.ph $t7, $t7, 8 \n"
    "shra.ph $t2, $t2, 8 \n"
    "shra.ph $t3, $t3, 8 \n"
    "precr.qb.ph $t6, $t6, $t7 \n"  // repack low bytes: pixels 3..0
    "precr.qb.ph $t2, $t2, $t3 \n"  // repack low bytes: pixels 7..4
    "addiu %[src_ptr], %[src_ptr], 8 \n"
    "addiu %[src_ptr1], %[src_ptr1], 8 \n"
    "addiu %[dst_width], %[dst_width], -8 \n"
    "sw $t6, 0(%[dst_ptr]) \n"
    "sw $t2, 4(%[dst_ptr]) \n"
    "bgtz %[dst_width], 1b \n"
    " addiu %[dst_ptr], %[dst_ptr], 8 \n"  // branch delay slot
    "lbu $t0, -1(%[dst_ptr]) \n"  // duplicate the last output byte
    "sb $t0, 0(%[dst_ptr]) \n"
    ".set pop \n"
    : [dst_ptr] "+r" (dst_ptr),
      [src_ptr1] "+r" (src_ptr1),
      [src_ptr] "+r" (src_ptr),
      [dst_width] "+r" (dst_width)
    : [source_y_fraction] "r" (source_y_fraction),
      [y0_fraction] "r" (y0_fraction)
    // "memory": the asm reads and writes buffers that are not listed as
    // operands, so the compiler must not cache or reorder around it.
    : "memory", "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9"
  );
}
void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
const unsigned char* src_ptr,
ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
#endif // if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
// CPU agnostic row functions
......
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC MIPS DSPR2
#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
// Point-samples one row down by 2 using MIPS DSPR2: keeps source bytes
// 0, 2, 4, ... and writes |dst_width| bytes to |dst|. The stride argument is
// unused because only a single source row is read.
// The main loop produces 16 output bytes per pass from 32 source bytes using
// word loads/stores, so |src_ptr| and |dst| presumably need 4-byte alignment
// when dst_width >= 16 - TODO confirm with callers. The remaining
// (dst_width & 15) pixels are copied one byte at a time.
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                              uint8* dst, int dst_width) {
  __asm__ __volatile__(
    ".set push \n"
    ".set noreorder \n"
    "srl $t9, %[dst_width], 4 \n"  // passes of 16 output pixels
    "beqz $t9, 2f \n"
    " nop \n"
    "1: \n"
    "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
    "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
    "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
    "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
    "lw $t4, 16(%[src_ptr]) \n"  // |19|18|17|16|
    "lw $t5, 20(%[src_ptr]) \n"  // |23|22|21|20|
    "lw $t6, 24(%[src_ptr]) \n"  // |27|26|25|24|
    "lw $t7, 28(%[src_ptr]) \n"  // |31|30|29|28|
    "precr.qb.ph $t8, $t1, $t0 \n"  // |6|4|2|0|
    "precr.qb.ph $t0, $t3, $t2 \n"  // |14|12|10|8|
    "precr.qb.ph $t1, $t5, $t4 \n"  // |22|20|18|16|
    "precr.qb.ph $t2, $t7, $t6 \n"  // |30|28|26|24|
    "addiu %[src_ptr], %[src_ptr], 32 \n"
    "addiu $t9, $t9, -1 \n"
    "sw $t8, 0(%[dst]) \n"
    "sw $t0, 4(%[dst]) \n"
    "sw $t1, 8(%[dst]) \n"
    "sw $t2, 12(%[dst]) \n"
    "bgtz $t9, 1b \n"
    " addiu %[dst], %[dst], 16 \n"  // branch delay slot
    "2: \n"
    "andi $t9, %[dst_width], 0xf \n"  // residue
    "beqz $t9, 3f \n"
    " nop \n"
    "21: \n"
    "lbu $t0, 0(%[src_ptr]) \n"
    "addiu %[src_ptr], %[src_ptr], 2 \n"
    "addiu $t9, $t9, -1 \n"
    "sb $t0, 0(%[dst]) \n"
    "bgtz $t9, 21b \n"
    " addiu %[dst], %[dst], 1 \n"  // branch delay slot
    "3: \n"
    ".set pop \n"
    : [src_ptr] "+r" (src_ptr),
      [dst] "+r" (dst)
    : [dst_width] "r" (dst_width)
    // "memory": the asm reads and writes buffers that are not listed as
    // operands, so the compiler must not cache or reorder around it.
    : "memory", "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9"
  );
}
// Box-filters two rows down by 2 using MIPS DSPR2: every output byte is the
// rounded average (sum + 2) >> 2 of a 2x2 block taken from the row at
// |src_ptr| and the row |src_stride| bytes after it.
// The main loop produces 8 output bytes per pass using word loads, so both
// source rows presumably need 4-byte alignment when dst_width >= 8 - TODO
// confirm with callers.
// NOTE(review): the residue loop emits 2 output bytes per pass, so an odd
// residue (dst_width & 7) stores one byte beyond dst_width.
void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                 uint8* dst, int dst_width) {
  const uint8* t = src_ptr + src_stride;
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "srl $t9, %[dst_width], 3 \n"  // iterations -> step 8
    // srl never yields a negative value, so the skip test must be "== 0";
    // otherwise dst_width < 8 would still run one full 8-pixel pass.
    "beqz $t9, 2f \n"
    " nop \n"
    "1: \n"
    "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
    "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
    "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
    "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
    "lw $t4, 0(%[t]) \n"  // |19|18|17|16|
    "lw $t5, 4(%[t]) \n"  // |23|22|21|20|
    "lw $t6, 8(%[t]) \n"  // |27|26|25|24|
    "lw $t7, 12(%[t]) \n"  // |31|30|29|28|
    "addiu $t9, $t9, -1 \n"
    "srl $t8, $t0, 16 \n"  // |X|X|3|2|
    "ins $t0, $t4, 16, 16 \n"  // |17|16|1|0|
    "ins $t4, $t8, 0, 16 \n"  // |19|18|3|2|
    "raddu.w.qb $t0, $t0 \n"  // |17+16+1+0|
    "raddu.w.qb $t4, $t4 \n"  // |19+18+3+2|
    "shra_r.w $t0, $t0, 2 \n"  // |t0+2|>>2
    "shra_r.w $t4, $t4, 2 \n"  // |t4+2|>>2
    "srl $t8, $t1, 16 \n"  // |X|X|7|6|
    "ins $t1, $t5, 16, 16 \n"  // |21|20|5|4|
    "ins $t5, $t8, 0, 16 \n"  // |23|22|7|6|
    "raddu.w.qb $t1, $t1 \n"  // |21+20+5+4|
    "raddu.w.qb $t5, $t5 \n"  // |23+22+7+6|
    "shra_r.w $t1, $t1, 2 \n"  // |t1+2|>>2
    "shra_r.w $t5, $t5, 2 \n"  // |t5+2|>>2
    "srl $t8, $t2, 16 \n"  // |X|X|11|10|
    "ins $t2, $t6, 16, 16 \n"  // |25|24|9|8|
    "ins $t6, $t8, 0, 16 \n"  // |27|26|11|10|
    "raddu.w.qb $t2, $t2 \n"  // |25+24+9+8|
    "raddu.w.qb $t6, $t6 \n"  // |27+26+11+10|
    "shra_r.w $t2, $t2, 2 \n"  // |t2+2|>>2
    "shra_r.w $t6, $t6, 2 \n"  // |t6+2|>>2
    "srl $t8, $t3, 16 \n"  // |X|X|15|14|
    "ins $t3, $t7, 16, 16 \n"  // |29|28|13|12|
    "ins $t7, $t8, 0, 16 \n"  // |31|30|15|14|
    "raddu.w.qb $t3, $t3 \n"  // |29+28+13+12|
    "raddu.w.qb $t7, $t7 \n"  // |31+30+15+14|
    "shra_r.w $t3, $t3, 2 \n"  // |t3+2|>>2
    "shra_r.w $t7, $t7, 2 \n"  // |t7+2|>>2
    "addiu %[src_ptr], %[src_ptr], 16 \n"
    "addiu %[t], %[t], 16 \n"
    "sb $t0, 0(%[dst]) \n"
    "sb $t4, 1(%[dst]) \n"
    "sb $t1, 2(%[dst]) \n"
    "sb $t5, 3(%[dst]) \n"
    "sb $t2, 4(%[dst]) \n"
    "sb $t6, 5(%[dst]) \n"
    "sb $t3, 6(%[dst]) \n"
    "sb $t7, 7(%[dst]) \n"
    "bgtz $t9, 1b \n"
    " addiu %[dst], %[dst], 8 \n"  // branch delay slot
    "2: \n"
    "andi $t9, %[dst_width], 0x7 \n"  // x = residue
    "beqz $t9, 3f \n"
    " nop \n"
    "21: \n"
    "lwr $t1, 0(%[src_ptr]) \n"  // lwr/lwl pair: unaligned word load
    "lwl $t1, 3(%[src_ptr]) \n"
    "lwr $t2, 0(%[t]) \n"
    "lwl $t2, 3(%[t]) \n"
    "srl $t8, $t1, 16 \n"
    "ins $t1, $t2, 16, 16 \n"
    "ins $t2, $t8, 0, 16 \n"
    "raddu.w.qb $t1, $t1 \n"
    "raddu.w.qb $t2, $t2 \n"
    "shra_r.w $t1, $t1, 2 \n"
    "shra_r.w $t2, $t2, 2 \n"
    "sb $t1, 0(%[dst]) \n"
    "sb $t2, 1(%[dst]) \n"
    "addiu %[src_ptr], %[src_ptr], 4 \n"
    "addiu $t9, $t9, -2 \n"
    "addiu %[t], %[t], 4 \n"
    "bgtz $t9, 21b \n"
    " addiu %[dst], %[dst], 2 \n"  // branch delay slot
    "3: \n"
    ".set pop \n"
    : [src_ptr] "+r" (src_ptr),
      [dst] "+r" (dst), [t] "+r" (t)
    : [dst_width] "r" (dst_width)
    // "memory": the asm reads and writes buffers that are not listed as
    // operands, so the compiler must not cache or reorder around it.
    : "memory", "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9"
  );
}
// Vertically blends two rows using MIPS DSPR2. For every byte:
//   dst = (row0 * (256 - source_y_fraction) + row1 * source_y_fraction) >> 8
// where row0 starts at |src_ptr| and row1 is |src_stride| bytes after it.
// The loop has no entry check and handles 8 pixels per pass with word
// loads/stores, so |dst_width| is presumably a positive multiple of 8 and the
// buffers 4-byte aligned - TODO confirm with callers.
// After the loop the last produced byte is duplicated into the next dst
// position, so one byte more than the processed width is written.
void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
                                const unsigned char* src_ptr,
                                ptrdiff_t src_stride,
                                int dst_width, int source_y_fraction) {
  int y0_fraction = 256 - source_y_fraction;
  const unsigned char* src_ptr1 = src_ptr + src_stride;
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "replv.ph $t0, %[y0_fraction] \n"  // row0 weight in both halfwords
    "replv.ph $t1, %[source_y_fraction] \n"  // row1 weight in both halfwords
    "1: \n"
    "lw $t2, 0(%[src_ptr]) \n"
    "lw $t3, 0(%[src_ptr1]) \n"
    "lw $t4, 4(%[src_ptr]) \n"
    "lw $t5, 4(%[src_ptr1]) \n"
    "muleu_s.ph.qbl $t6, $t2, $t0 \n"  // row0 bytes 3,2 * weight
    "muleu_s.ph.qbr $t7, $t2, $t0 \n"  // row0 bytes 1,0 * weight
    "muleu_s.ph.qbl $t8, $t3, $t1 \n"  // row1 bytes 3,2 * weight
    "muleu_s.ph.qbr $t9, $t3, $t1 \n"  // row1 bytes 1,0 * weight
    "muleu_s.ph.qbl $t2, $t4, $t0 \n"
    "muleu_s.ph.qbr $t3, $t4, $t0 \n"
    "muleu_s.ph.qbl $t4, $t5, $t1 \n"
    "muleu_s.ph.qbr $t5, $t5, $t1 \n"
    "addq.ph $t6, $t6, $t8 \n"
    "addq.ph $t7, $t7, $t9 \n"
    "addq.ph $t2, $t2, $t4 \n"
    "addq.ph $t3, $t3, $t5 \n"
    "shra.ph $t6, $t6, 8 \n"
    "shra.ph $t7, $t7, 8 \n"
    "shra.ph $t2, $t2, 8 \n"
    "shra.ph $t3, $t3, 8 \n"
    "precr.qb.ph $t6, $t6, $t7 \n"  // repack low bytes: pixels 3..0
    "precr.qb.ph $t2, $t2, $t3 \n"  // repack low bytes: pixels 7..4
    "addiu %[src_ptr], %[src_ptr], 8 \n"
    "addiu %[src_ptr1], %[src_ptr1], 8 \n"
    "addiu %[dst_width], %[dst_width], -8 \n"
    "sw $t6, 0(%[dst_ptr]) \n"
    "sw $t2, 4(%[dst_ptr]) \n"
    "bgtz %[dst_width], 1b \n"
    " addiu %[dst_ptr], %[dst_ptr], 8 \n"  // branch delay slot
    "lbu $t0, -1(%[dst_ptr]) \n"  // duplicate the last output byte
    "sb $t0, 0(%[dst_ptr]) \n"
    ".set pop \n"
    : [dst_ptr] "+r" (dst_ptr),
      [src_ptr1] "+r" (src_ptr1),
      [src_ptr] "+r" (src_ptr),
      [dst_width] "+r" (dst_width)
    : [source_y_fraction] "r" (source_y_fraction),
      [y0_fraction] "r" (y0_fraction)
    // "memory": the asm reads and writes buffers that are not listed as
    // operands, so the compiler must not cache or reorder around it.
    : "memory", "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9"
  );
}
#endif // if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment