Commit b5491759 authored by fbarchard@google.com

NEON-optimized ARGB filter row for bilinear scale and Effects Interpolate.

BUG=none
TEST=./libyuv_unittest --gtest_filter=*ARGBScale*
Review URL: https://webrtc-codereview.appspot.com/964017

git-svn-id: http://libyuv.googlecode.com/svn/trunk@497 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 958a0b0c
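
For context, every row kernel in this change blends two ARGB rows by an 8-bit fraction: 0 keeps the first row, 255 is (almost) the second row, and 64/128/192 are special-cased. A minimal C sketch of the per-byte math the general-purpose NEON loop computes (vmull/vmlal followed by a rounding narrow); the function name and width_bytes parameter are illustrative, not part of this change:

#include <stdint.h>

// Reference blend of one row of bytes: dst = src0*(256-f) + src1*f, rounded, >> 8.
// For ARGB, width_bytes is dst_width * 4 because every channel is blended alike.
static void InterpolateRowRef(uint8_t* dst, const uint8_t* src0,
                              const uint8_t* src1, int width_bytes, int f) {
  for (int i = 0; i < width_bytes; ++i) {
    dst[i] = (uint8_t)((src0[i] * (256 - f) + src1[i] * f + 128) >> 8);
  }
}

The SSSE3 versions work with the fraction halved to 0..127, so their output may differ from this sketch within the 1-bit tolerance used by the tests below.
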
@@ -217,6 +217,9 @@ extern "C" {
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
#define HAS_ARGBMIRRORROW_NEON
// Effects
#define HAS_ARGBINTERPOLATEROW_NEON
#endif
// The following are available on Mips platforms
@@ -1241,6 +1244,9 @@ void ARGBInterpolateRow_C(uint8* dst_argb, const uint8* src_argb,
void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride_argb, int dst_width,
int source_y_fraction);
void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride_argb, int dst_width,
int source_y_fraction);
#ifdef __cplusplus
} // extern "C"
...
@@ -1121,6 +1121,9 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
}
// Interpolate 2 ARGB images by specified amount (0 to 255).
// TODO(fbarchard): Check width is multiple of 16. Do Any version.
// TODO(fbarchard): Consider selecting a specialized interpolator so
// interpolation doesn't need to be checked on each row.
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
@@ -1136,8 +1139,8 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
dst_stride_argb = -dst_stride_argb;
}
void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = ARGBInterpolateRow_C;
#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
@@ -1145,6 +1148,11 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
}
#elif defined(HAS_ARGBINTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
ARGBInterpolateRow = ARGBInterpolateRow_NEON;
}
#endif
for (int y = 0; y < height; ++y) {
ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
...
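
A usage sketch of the ARGBInterpolate entry point whose dispatch is shown above; the call shape matches the unit tests added later in this change, while the include path, buffer sizes and the fraction of 64 are illustrative assumptions:

#include "libyuv/planar_functions.h"

// Blend two 64x32 ARGB frames one quarter of the way toward frame_b
// (interpolation fraction 64 out of 256). The NEON row kernel is picked when
// dst_argb and dst_stride_argb are 4-byte aligned; otherwise the C row runs.
int BlendQuarter(const uint8* frame_a, const uint8* frame_b, uint8* frame_out) {
  const int kWidth = 64;
  const int kHeight = 32;
  const int kStride = kWidth * 4;  // ARGB: 4 bytes per pixel, no row padding.
  return libyuv::ARGBInterpolate(frame_a, kStride,
                                 frame_b, kStride,
                                 frame_out, kStride,
                                 kWidth, kHeight, 64);
}
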
@@ -2275,6 +2275,89 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
}
#endif // HAS_RAWTOYROW_NEON
// 4x2 -> 4x1
// Same as ScaleARGBFilterRows_NEON but with last pixel not duplicated.
void ARGBInterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 100f \n"
"add %2, %1 \n"
"cmp %4, #64 \n"
"beq 75f \n"
"cmp %4, #128 \n"
"beq 50f \n"
"cmp %4, #192 \n"
"beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #4 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #4 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #4 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n"
"subs %3, #4 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #4 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
#endif // __ARM_NEON__
#ifdef __cplusplus
...
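
The fraction dispatch above replaces the multiply loop with averaging shortcuts: 0 copies the source row, 128 is one rounding average, and 64/192 are two chained rounding averages. A scalar sketch of the same idea, using the rounding average that vrhadd.u8 (and pavgb in the x86 paths) computes; the helper names are illustrative, and the chained 25/75 cases can differ from an exact blend by at most 1:

#include <stdint.h>

// Rounding average of two bytes, as vrhadd.u8 / pavgb compute it.
static inline uint8_t RoundAvg(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

// Per-byte blend with the same special cases as the assembly above.
static uint8_t BlendByte(uint8_t s0, uint8_t s1, int fraction) {
  if (fraction == 0) return s0;                                // copy row 0.
  if (fraction == 128) return RoundAvg(s0, s1);                // 50 / 50.
  if (fraction == 192) return RoundAvg(RoundAvg(s0, s1), s1);  // 25 / 75.
  if (fraction == 64) return RoundAvg(RoundAvg(s1, s0), s0);   // 75 / 25.
  return (uint8_t)((s0 * (256 - fraction) + s1 * fraction + 128) >> 8);
}
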
@@ -4241,17 +4241,23 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
}
#endif // HAS_ARGBAFFINEROW_SSE2
// Bilinear image filtering.
// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 100f \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n" "cmp $0x40,%3 \n"
"je 3f \n" "je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"movd %3,%%xmm0 \n" "movd %3,%%xmm0 \n"
"neg %3 \n" "neg %3 \n"
"add $0x80,%3 \n" "add $0x80,%3 \n"
...@@ -4259,6 +4265,8 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -4259,6 +4265,8 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"punpcklbw %%xmm0,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n" "punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n"
// General purpose row blend.
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
...@@ -4275,28 +4283,60 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -4275,28 +4283,60 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movdqa %%xmm0,(%1,%0,1) \n" "movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"jg 1b \n" "jg 1b \n"
"jmp 4f \n" "jmp 99f \n"
// Blend 25 / 75.
".p2align 4 \n" ".p2align 4 \n"
"2: \n" "25: \n"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n" "movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"jg 2b \n" "jg 25b \n"
"jmp 4f \n" "jmp 99f \n"
// Blend 50 / 50.
".p2align 4 \n" ".p2align 4 \n"
"3: \n" "50: \n"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
"pavgb (%1,%4,1),%%xmm0 \n" "movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
".p2align 4 \n"
"75: \n"
"movdqa (%1),%%xmm1 \n"
"movdqa (%1,%4,1),%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n" "movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"jg 3b \n" "jg 75b \n"
"4: \n" "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
".p2align 4 \n" ".p2align 4 \n"
: "+r"(dst_ptr), // %0 "100: \n"
"+r"(src_ptr), // %1 "movdqa (%1),%%xmm0 \n"
"+r"(dst_width), // %2 "sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 100b \n"
// Extrude last pixel.
"99: \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3 "+r"(source_y_fraction) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4 : "r"(static_cast<intptr_t>(src_stride)) // %4
: "memory", "cc" : "memory", "cc"
...@@ -4306,6 +4346,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -4306,6 +4346,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
); );
} }
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) { uint8* dst_uv, int pix) {
asm volatile ( asm volatile (
......
@@ -3580,8 +3580,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 1 pixel loop until destination pointer is aligned.
alignloop1:
// test edx, 15 // aligned?
// je alignloop1b
movd xmm3, [eax]
lea eax, [eax + 4]
movdqa xmm0, xmm3 // src argb
@@ -4439,25 +4439,31 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
}
#endif // HAS_ARGBAFFINEROW_SSE2
// Bilinear image filtering.
// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
__declspec(naked) __declspec(align(16))
void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
__asm {
push esi
push edi
mov edi, [esp + 8 + 4] // dst_argb
mov esi, [esp + 8 + 8] // src_argb
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
shr eax, 1
cmp eax, 0 // dispatch to specialized filters if applicable.
je xloop100
cmp eax, 32
je xloop75
cmp eax, 64
je xloop50
cmp eax, 96
je xloop25
movd xmm0, eax // high fraction 0..127
neg eax
add eax, 128
@@ -4482,32 +4488,57 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop
jmp xloop99
// Blend 25 / 75.
align 16
xloop25:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop25
jmp xloop99
// Blend 50 / 50.
align 16
xloop50:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop50
jmp xloop99
// Blend 75 / 25.
align 16
xloop75:
movdqa xmm1, [esi]
movdqa xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
align 16
xloop100:
movdqa xmm0, [esi]
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop100
// Extrude last pixel.
xloop99:
pop edi
pop esi
ret
...
@@ -30,20 +30,21 @@ extern "C" {
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBFILTERROWS_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
#endif
/**
@@ -964,6 +965,7 @@ static void ScaleARGBBilinear(int src_width, int src_height,
ptrdiff_t src_stride,
int dst_width, int source_y_fraction) =
ScaleARGBFilterRows_C;
// TODO(fbarchard): Check aligned width.
#if defined(HAS_SCALEARGBFILTERROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
@@ -975,6 +977,11 @@ static void ScaleARGBBilinear(int src_width, int src_height,
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERROWS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
}
#endif
int dx = (src_width << 16) / dst_width;
int dy = (src_height << 16) / dst_height;
...
@@ -136,6 +136,90 @@ void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride,
: "memory", "cc", "r12", "q0", "q1", "q2", "q3"
);
}
// 4x2 -> 4x1
void ScaleARGBFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 100f \n"
"add %2, %1 \n"
"cmp %4, #64 \n"
"beq 75f \n"
"cmp %4, #128 \n"
"beq 50f \n"
"cmp %4, #192 \n"
"beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #4 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #4 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #4 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n"
"subs %3, #4 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #4 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"vst1.u32 {d1[1]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
#endif // __ARM_NEON__
#ifdef __cplusplus
...
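
ScaleARGBFilterRows_NEON above differs from ARGBInterpolateRow_NEON only in the trailing vst1.u32 {d1[1]}, [%0], which writes one extra copy of the last output pixel past the end of the row. A scalar sketch of that step, assuming (consistent with the "last pixel duplicated" comments in this change) that the duplicate lets the scaler's horizontal filter read dst[x + 1] safely at the right edge; the helper name is illustrative:

#include <stdint.h>
#include <string.h>

// Duplicate the final ARGB pixel just past dst_width so a following
// horizontal bilinear pass can read one pixel beyond the row end.
static void ExtrudeLastARGBPixel(uint8_t* dst_argb, int dst_width) {
  memcpy(dst_argb + dst_width * 4, dst_argb + (dst_width - 1) * 4, 4);
}
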
@@ -585,6 +585,142 @@ TEST_F(libyuvTest, TestInterpolate) {
}
}
#define TESTTERP(FMT_A, BPP_A, STRIDE_A, \
FMT_B, BPP_B, STRIDE_B, \
W1280, TERP, DIFF, N, NEG, OFF) \
TEST_F(libyuvTest, ARGBInterpolate##TERP##N) { \
const int kWidth = W1280; \
const int kHeight = benchmark_height_; \
const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
align_buffer_64(src_argb_a, kStrideA * kHeight + OFF); \
align_buffer_64(src_argb_b, kStrideA * kHeight + OFF); \
align_buffer_64(dst_argb_c, kStrideB * kHeight); \
align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
srandom(time(NULL)); \
for (int i = 0; i < kStrideA * kHeight; ++i) { \
src_argb_a[i + OFF] = (random() & 0xff); \
src_argb_b[i + OFF] = (random() & 0xff); \
} \
MaskCpuFlags(0); \
ARGBInterpolate(src_argb_a + OFF, kStrideA, \
src_argb_b + OFF, kStrideA, \
dst_argb_c, kStrideB, \
kWidth, NEG kHeight, TERP); \
MaskCpuFlags(-1); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
ARGBInterpolate(src_argb_a + OFF, kStrideA, \
src_argb_b + OFF, kStrideA, \
dst_argb_opt, kStrideB, \
kWidth, NEG kHeight, TERP); \
} \
int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeight; ++i) { \
int abs_diff = \
abs(static_cast<int>(dst_argb_c[i]) - \
static_cast<int>(dst_argb_opt[i])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_64(src_argb_a) \
free_aligned_buffer_64(src_argb_b) \
free_aligned_buffer_64(dst_argb_c) \
free_aligned_buffer_64(dst_argb_opt) \
}
#define TESTINTERPOLATE(TERP) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
benchmark_width_ - 4, TERP, 1, _Any, +, 0) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
benchmark_width_, TERP, 1, _Unaligned, +, 1) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
benchmark_width_, TERP, 1, _Invert, -, 0) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
benchmark_width_, TERP, 1, _Opt, +, 0)
TESTINTERPOLATE(0)
TESTINTERPOLATE(64)
TESTINTERPOLATE(128)
TESTINTERPOLATE(192)
TESTINTERPOLATE(255)
static int TestBlend(int kWidth, int kHeight, int benchmark_iterations,
int NEG, int OFF) {
const int BPP_A = 4;
const int STRIDE_A = 1;
const int BPP_B = 4;
const int STRIDE_B = 1;
const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;
const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;
align_buffer_64(src_argb_a, kStrideA * kHeight + OFF);
align_buffer_64(src_argb_b, kStrideA * kHeight + OFF);
align_buffer_64(dst_argb_c, kStrideB * kHeight);
align_buffer_64(dst_argb_opt, kStrideB * kHeight);
srandom(time(NULL));
for (int i = 0; i < kStrideA * kHeight; ++i) {
src_argb_a[i + OFF] = (random() & 0xff);
src_argb_b[i + OFF] = (random() & 0xff);
}
ARGBAttenuate(src_argb_a, kStrideA, src_argb_a, kStrideA, kWidth, kHeight);
ARGBAttenuate(src_argb_b, kStrideA, src_argb_b, kStrideA, kWidth, kHeight);
memset(dst_argb_c, 255, kStrideB * kHeight);
memset(dst_argb_opt, 255, kStrideB * kHeight);
MaskCpuFlags(0);
ARGBBlend(src_argb_a + OFF, kStrideA,
src_argb_b + OFF, kStrideA,
dst_argb_c, kStrideB,
kWidth, NEG * kHeight);
MaskCpuFlags(-1);
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBBlend(src_argb_a + OFF, kStrideA,
src_argb_b + OFF, kStrideA,
dst_argb_opt, kStrideB,
kWidth, NEG * kHeight);
}
int max_diff = 0;
for (int i = 0; i < kStrideB * kHeight; ++i) {
int abs_diff =
abs(static_cast<int>(dst_argb_c[i]) -
static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_64(src_argb_a)
free_aligned_buffer_64(src_argb_b)
free_aligned_buffer_64(dst_argb_c)
free_aligned_buffer_64(dst_argb_opt)
return max_diff;
}
TEST_F(libyuvTest, ARGBBlend_Any) {
int max_diff = TestBlend(benchmark_width_ - 4, benchmark_height_,
benchmark_iterations_, +1, 0);
EXPECT_LE(max_diff, 1);
}
// TODO(fbarchard): Enable unaligned blend test.
// TEST_F(libyuvTest, ARGBBlend_Unaligned) {
// int max_diff = TestBlend(benchmark_width_, benchmark_height_,
// benchmark_iterations_, +1, 1);
// EXPECT_LE(max_diff, 1);
// }
TEST_F(libyuvTest, ARGBBlend_Invert) {
int max_diff = TestBlend(benchmark_width_, benchmark_height_,
benchmark_iterations_, -1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(libyuvTest, ARGBBlend_Opt) {
int max_diff = TestBlend(benchmark_width_, benchmark_height_,
benchmark_iterations_, +1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(libyuvTest, TestAffine) {
SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]);
...