Commit 66d16f41 authored by fbarchard@google.com's avatar fbarchard@google.com

argb scale 2x upsample with specialization for 25/75%

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/938014

git-svn-id: http://libyuv.googlecode.com/svn/trunk@486 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f25ab6d8
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 485
Version: 486
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 485
#define LIBYUV_VERSION 486
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
This diff is collapsed.
......@@ -289,12 +289,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
shr eax, 1
cmp eax, 0
je xloop1
cmp eax, 0 // dispatch to specialized filters if applicable.
je xloop100
sub edi, esi
cmp eax, 32
je xloop75
cmp eax, 64
je xloop2
je xloop50
cmp eax, 96
je xloop25
movd xmm0, eax // high fraction 0..127
neg eax
add eax, 128
......@@ -319,36 +324,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop
jmp xloop99
shufps xmm0, xmm0, 0xff
movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
pop edi
pop esi
ret
// Blend 25 / 75.
align 16
xloop25:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop25
jmp xloop99
// Blend 50 / 50.
align 16
xloop1:
xloop50:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop1
jg xloop50
jmp xloop99
shufps xmm0, xmm0, 0xff
// Blend 75 / 25.
align 16
xloop75:
movdqa xmm1, [esi]
movdqa xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
lea esi, [esi + 16]
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
align 16
xloop2:
xloop100:
movdqa xmm0, [esi]
pavgb xmm0, [esi + edx]
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop2
jg xloop100
// Extrude last pixel.
xloop99:
shufps xmm0, xmm0, 0xff
movdqa [esi + edi], xmm0
pop edi
......@@ -585,12 +611,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 2f \n"
"je 100f \n"
"sub %1,%0 \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
"je 3f \n"
"je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x80,%3 \n"
......@@ -598,6 +629,8 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
// General purpose row blend.
".p2align 4 \n"
"1: \n"
"movdqa (%1),%%xmm0 \n"
......@@ -614,30 +647,62 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 1b \n"
"jmp 4f \n"
"jmp 99f \n"
// Blend 25 / 75.
".p2align 4 \n"
"2: \n"
"25: \n"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 2b \n"
"jmp 4f \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
".p2align 4 \n"
"3: \n"
"50: \n"
"movdqa (%1),%%xmm0 \n"
"pavgb (%1,%4,1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 3b \n"
"4: \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
".p2align 4 \n"
"75: \n"
"movdqa (%1),%%xmm1 \n"
"movdqa (%1,%4,1),%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
".p2align 4 \n"
"100: \n"
"movdqa (%1),%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 100b \n"
// Extrude last pixel.
"99: \n"
"shufps $0xff,%%xmm0,%%xmm0 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
: "memory", "cc"
......@@ -645,6 +710,7 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
, "xmm0", "xmm1", "xmm2", "xmm5"
#endif
);
}
#endif // defined(__x86_64__) || defined(__i386__)
......
......@@ -477,14 +477,19 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"beq 100f \n"
"add %2, %1 \n"
"cmp %4, #64 \n"
"beq 75f \n"
"cmp %4, #128 \n"
"beq 3f \n"
"beq 50f \n"
"cmp %4, #192 \n"
"beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
......@@ -497,23 +502,48 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"b 99f \n"
"2: \n"
// Blend 25 / 75.
"25: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"bgt 25b \n"
"b 99f \n"
"3: \n"
// Blend 50 / 50.
"50: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment