Commit 66d16f41 authored by fbarchard@google.com

argb scale 2x upsample with specialization for 25/75%

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/938014

git-svn-id: http://libyuv.googlecode.com/svn/trunk@486 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f25ab6d8
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 485 Version: 486
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 485 #define LIBYUV_VERSION 486
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
This diff is collapsed.
...@@ -289,12 +289,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -289,12 +289,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255) mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
shr eax, 1 shr eax, 1
cmp eax, 0 cmp eax, 0 // dispatch to specialized filters if applicable.
je xloop1 je xloop100
sub edi, esi
cmp eax, 32
je xloop75
cmp eax, 64 cmp eax, 64
je xloop2 je xloop50
cmp eax, 96
je xloop25
movd xmm0, eax // high fraction 0..127 movd xmm0, eax // high fraction 0..127
neg eax neg eax
add eax, 128 add eax, 128
...@@ -319,36 +324,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -319,36 +324,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
jg xloop jg xloop
jmp xloop99
shufps xmm0, xmm0, 0xff // Blend 25 / 75.
movdqa [esi + edi], xmm0 // duplicate last pixel for filtering align 16
pop edi xloop25:
pop esi movdqa xmm0, [esi]
ret movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop25
jmp xloop99
// Blend 50 / 50.
align 16 align 16
xloop1: xloop50:
movdqa xmm0, [esi] movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
sub ecx, 4 sub ecx, 4
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
jg xloop1 jg xloop50
jmp xloop99
shufps xmm0, xmm0, 0xff // Blend 75 / 25.
align 16
xloop75:
movdqa xmm1, [esi]
movdqa xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
pop edi lea esi, [esi + 16]
pop esi jg xloop75
ret jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
align 16 align 16
xloop2: xloop100:
movdqa xmm0, [esi] movdqa xmm0, [esi]
pavgb xmm0, [esi + edx]
sub ecx, 4 sub ecx, 4
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
jg xloop2 jg xloop100
// Extrude last pixel.
xloop99:
shufps xmm0, xmm0, 0xff shufps xmm0, xmm0, 0xff
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
pop edi pop edi
...@@ -585,12 +611,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -585,12 +611,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width, ptrdiff_t src_stride, int dst_width,
int source_y_fraction) { int source_y_fraction) {
asm volatile ( asm volatile (
"sub %1,%0 \n"
"shr %3 \n" "shr %3 \n"
"cmp $0x0,%3 \n" "cmp $0x0,%3 \n"
"je 2f \n" "je 100f \n"
"sub %1,%0 \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n" "cmp $0x40,%3 \n"
"je 3f \n" "je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"movd %3,%%xmm0 \n" "movd %3,%%xmm0 \n"
"neg %3 \n" "neg %3 \n"
"add $0x80,%3 \n" "add $0x80,%3 \n"
...@@ -598,6 +629,8 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -598,6 +629,8 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"punpcklbw %%xmm0,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n" "punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n"
// General purpose row blend.
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
...@@ -614,25 +647,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -614,25 +647,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movdqa %%xmm0,(%1,%0,1) \n" "movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"jg 1b \n" "jg 1b \n"
"jmp 4f \n" "jmp 99f \n"
// Blend 25 / 75.
".p2align 4 \n" ".p2align 4 \n"
"2: \n" "25: \n"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n" "movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"jg 2b \n" "jg 25b \n"
"jmp 4f \n" "jmp 99f \n"
// Blend 50 / 50.
".p2align 4 \n" ".p2align 4 \n"
"3: \n" "50: \n"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
"pavgb (%1,%4,1),%%xmm0 \n" "movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n" "movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"jg 3b \n" "jg 50b \n"
"4: \n" "jmp 99f \n"
// Blend 75 / 25.
".p2align 4 \n"
"75: \n"
"movdqa (%1),%%xmm1 \n"
"movdqa (%1,%4,1),%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
".p2align 4 \n" ".p2align 4 \n"
"100: \n"
"movdqa (%1),%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 100b \n"
// Extrude last pixel.
"99: \n"
"shufps $0xff,%%xmm0,%%xmm0 \n" "shufps $0xff,%%xmm0,%%xmm0 \n"
"movdqa %%xmm0,(%1,%0,1) \n" "movdqa %%xmm0,(%1,%0,1) \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -645,6 +710,7 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -645,6 +710,7 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
, "xmm0", "xmm1", "xmm2", "xmm5" , "xmm0", "xmm1", "xmm2", "xmm5"
#endif #endif
); );
} }
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
......
...@@ -477,14 +477,19 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -477,14 +477,19 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
int dst_width, int source_y_fraction) { int dst_width, int source_y_fraction) {
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %4, #0 \n"
"beq 2f \n" "beq 100f \n"
"add %2, %1 \n" "add %2, %1 \n"
"cmp %4, #64 \n"
"beq 75f \n"
"cmp %4, #128 \n" "cmp %4, #128 \n"
"beq 3f \n" "beq 50f \n"
"cmp %4, #192 \n"
"beq 25f \n"
"vdup.8 d5, %4 \n" "vdup.8 d5, %4 \n"
"rsb %4, #256 \n" "rsb %4, #256 \n"
"vdup.8 d4, %4 \n" "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n" "1: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.u8 {q1}, [%2]! \n"
...@@ -497,23 +502,48 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -497,23 +502,48 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vrshrn.u16 d1, q14, #8 \n" "vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n" "bgt 1b \n"
"b 4f \n" "b 99f \n"
"2: \n" // Blend 25 / 75.
"25: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n" "subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n" "bgt 25b \n"
"b 4f \n" "b 99f \n"
"3: \n" // Blend 50 / 50.
"50: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n" "subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n" "bgt 50b \n"
"4: \n" "b 99f \n"
// Blend 75 / 25.
"75: \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"vst1.u8 {d1[7]}, [%0] \n" "vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment