Commit 66d16f41 authored by fbarchard@google.com

argb scale 2x upsample with specialization for 25/75%

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/938014

git-svn-id: http://libyuv.googlecode.com/svn/trunk@486 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f25ab6d8
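Reviewer note: the 25%/75% specializations pay off because a centered 2x upsample samples each source row pair at fractions 1/4 and 3/4, and those quarter blends can be built from two rounding averages (pavgb on SSE2, vrhadd.u8 on NEON) instead of the general multiply path. A minimal scalar sketch of the identity the new xloop25/xloop75 loops rely on (helper names are illustrative, not from this patch):

#include <cstdint>
#include <cstdio>

// Rounding average of two bytes: the scalar equivalent of SSE2 pavgb
// and NEON vrhadd.u8, i.e. (a + b + 1) >> 1.
static uint8_t RoundingAverage(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((a + b + 1) >> 1);
}

// 25/75 blend from two rounding averages, the trick behind the new
// specialized loops: avg(avg(a, b), b) ~= (a + 3 * b) / 4, with a small
// upward rounding bias.
static uint8_t Blend25(uint8_t a, uint8_t b) {
  return RoundingAverage(RoundingAverage(a, b), b);
}

int main() {
  // Exact value is (64 + 3 * 192) / 4 = 160; the double average lands on it.
  std::printf("%u\n", Blend25(64, 192));  // prints 160
  return 0;
}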
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 485
+Version: 486
 License: BSD
 License File: LICENSE
......
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 485
+#define LIBYUV_VERSION 486
 #endif  // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -1035,24 +1035,26 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
     shr        eax, 1
     cmp        eax, 0  // dispatch to specialized filters if applicable.
     je         xloop100
-    sub        edi, esi
     cmp        eax, 32
     je         xloop75
     cmp        eax, 64
     je         xloop50
     cmp        eax, 96
     je         xloop25
-    movd       xmm0, eax  // high fraction 0..127
+    movd       xmm0, eax  // high fraction 1..127.
     neg        eax
     add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    movd       xmm5, eax  // low fraction 127..1.
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
+    // General purpose row blend.
     align      16
   xloop:
     movdqa     xmm0, [esi]
@@ -1069,52 +1071,20 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
+    jmp        xloop99
-    punpckhbw  xmm0, xmm0  // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-    // Blend 100 / 0 - Copy row unchanged.
+    // Blend 25 / 75.
     align      16
-  xloop100:
+  xloop25:
     movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-    // Blend 75 / 25.
-    align      16
-  xloop75:
-    movdqa     xmm1, [esi]
-    movdqa     xmm0, [esi + edx]
+    movdqa     xmm1, [esi + edx]
     pavgb      xmm0, xmm1
     pavgb      xmm0, xmm1
     sub        ecx, 16
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    jg         xloop75
+    jg         xloop25
+    jmp        xloop99
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
     // Blend 50 / 50.
     align      16
@@ -1126,27 +1096,32 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop50
+    jmp        xloop99
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-    // Blend 25 / 75.
+    // Blend 75 / 25.
     align      16
-  xloop25:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop25
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqa     xmm0, [esi]
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+    // Extrude last pixel.
+  xloop99:
     punpckhbw  xmm0, xmm0
     pshufhw    xmm0, xmm0, 0xff
     punpckhqdq xmm0, xmm0
@@ -1154,7 +1129,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     pop        edi
     pop        esi
     ret
   }
 }
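For fractions that miss the specialized cases, the general xloop interleaves the two rows byte-wise and feeds them to pmaddubsw against xmm5, which holds the weight pair (128 - f, f) replicated across the register; the psrlw/packuswb narrowing sits in context the diff elides. A scalar model of that weighting, assuming the usual shift-by-7 at the end:

#include <cstdint>

// Scalar model of the general-purpose row blend (assumed semantics).
// source_y_fraction (0..255) is halved to f -- "shr eax, 1" -- and each
// output byte is (row0 * (128 - f) + row1 * f) >> 7, the per-byte weight
// pair that punpcklbw/punpcklwd/pshufd replicate into xmm5 for pmaddubsw.
void BlendRow(const uint8_t* row0, const uint8_t* row1, uint8_t* dst,
              int width_bytes, int source_y_fraction) {
  const int f = source_y_fraction >> 1;  // 1..127 after dispatch
  const int lo = 128 - f;                // weight of the top row
  for (int i = 0; i < width_bytes; ++i) {
    dst[i] = static_cast<uint8_t>((row0[i] * lo + row1[i] * f) >> 7);
  }
}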
@@ -1171,29 +1145,31 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
     shr        eax, 1
-    cmp        eax, 0
+    cmp        eax, 0  // dispatch to specialized filters if applicable.
     je         xloop100
-    sub        edi, esi
     cmp        eax, 32
     je         xloop75
     cmp        eax, 64
     je         xloop50
     cmp        eax, 96
     je         xloop25
-    movd       xmm0, eax  // high fraction 0..127
+    movd       xmm0, eax  // high fraction 1..127.
     neg        eax
     add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    movd       xmm5, eax  // low fraction 127..1.
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
+    // General purpose row blend.
     align      16
   xloop:
     movdqu     xmm0, [esi]
     movdqu     xmm2, [esi + edx]
-    movdqu     xmm1, xmm0
+    movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
     pmaddubsw  xmm0, xmm5
@@ -1205,52 +1181,20 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
+    jmp        xloop99
-    punpckhbw  xmm0, xmm0  // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-    // Blend 100 / 0 - Copy row unchanged.
+    // Blend 25 / 75.
     align      16
-  xloop100:
+  xloop25:
     movdqu     xmm0, [esi]
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-    // Blend 75 / 25.
-    align      16
-  xloop75:
-    movdqu     xmm1, [esi]
-    movdqu     xmm0, [esi + edx]
+    movdqu     xmm1, [esi + edx]
     pavgb      xmm0, xmm1
     pavgb      xmm0, xmm1
     sub        ecx, 16
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    jg         xloop75
+    jg         xloop25
+    jmp        xloop99
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
     // Blend 50 / 50.
     align      16
@@ -1262,27 +1206,32 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop50
+    jmp        xloop99
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-    // Blend 25 / 75.
+    // Blend 75 / 25.
     align      16
-  xloop25:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop25
+  xloop75:
+    movdqu     xmm1, [esi]
+    movdqu     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqu     xmm0, [esi]
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+    // Extrude last pixel.
+  xloop99:
     punpckhbw  xmm0, xmm0
     pshufhw    xmm0, xmm0, 0xff
     punpckhqdq xmm0, xmm0
@@ -1290,7 +1239,6 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     pop        edi
     pop        esi
     ret
   }
 }
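Both Windows variants now share the same dispatch ladder: the halved fraction is compared against the phases a 2x upsample actually produces, and everything else falls through to the pmaddubsw loop. A hypothetical scalar dispatcher mirroring the cmp/je chain (the enum and function names are mine, not libyuv's):

#include <cstdint>

// Hypothetical dispatcher mirroring "shr eax, 1" plus the cmp/je ladder.
enum class RowFilter { kCopy, kBlend75_25, kBlend50_50, kBlend25_75, kGeneral };

RowFilter SelectRowFilter(int source_y_fraction) {
  switch (source_y_fraction >> 1) {
    case 0:  return RowFilter::kCopy;        // 100% row0, row unchanged
    case 32: return RowFilter::kBlend75_25;  // 75% row0, 25% row1
    case 64: return RowFilter::kBlend50_50;  // single pavgb
    case 96: return RowFilter::kBlend25_75;  // 25% row0, 75% row1
    default: return RowFilter::kGeneral;     // pmaddubsw path
  }
}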
@@ -2068,9 +2016,13 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "sub        %1,%0                          \n"
     "shr        %3                             \n"
     "cmp        $0x0,%3                        \n"
-    "je         2f                             \n"
+    "je         100f                           \n"
+    "cmp        $0x20,%3                       \n"
+    "je         75f                            \n"
     "cmp        $0x40,%3                       \n"
-    "je         3f                             \n"
+    "je         50f                            \n"
+    "cmp        $0x60,%3                       \n"
+    "je         25f                            \n"
     "movd       %3,%%xmm0                      \n"
     "neg        %3                             \n"
     "add        $0x80,%3                       \n"
@@ -2078,6 +2030,8 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "punpcklbw  %%xmm0,%%xmm5                  \n"
     "punpcklwd  %%xmm5,%%xmm5                  \n"
     "pshufd     $0x0,%%xmm5,%%xmm5             \n"
+    // General purpose row blend.
     ".p2align   4                              \n"
   "1:                                          \n"
     "movdqa     (%1),%%xmm0                    \n"
@@ -2094,25 +2048,57 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "movdqa     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
     "jg         1b                             \n"
-    "jmp        4f                             \n"
+    "jmp        99f                            \n"
+    // Blend 25 / 75.
     ".p2align   4                              \n"
-  "2:                                          \n"
+  "25:                                         \n"
     "movdqa     (%1),%%xmm0                    \n"
+    "movdqa     (%1,%4,1),%%xmm1               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
     "sub        $0x10,%2                       \n"
     "movdqa     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
-    "jg         2b                             \n"
-    "jmp        4f                             \n"
+    "jg         25b                            \n"
+    "jmp        99f                            \n"
+    // Blend 50 / 50.
     ".p2align   4                              \n"
-  "3:                                          \n"
+  "50:                                         \n"
     "movdqa     (%1),%%xmm0                    \n"
-    "pavgb      (%1,%4,1),%%xmm0               \n"
+    "movdqa     (%1,%4,1),%%xmm1               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
     "sub        $0x10,%2                       \n"
     "movdqa     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
-    "jg         3b                             \n"
+    "jg         50b                            \n"
+    "jmp        99f                            \n"
+    // Blend 75 / 25.
     ".p2align   4                              \n"
-  "4:                                          \n"
+  "75:                                         \n"
+    "movdqa     (%1),%%xmm1                    \n"
+    "movdqa     (%1,%4,1),%%xmm0               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "sub        $0x10,%2                       \n"
+    "movdqa     %%xmm0,(%1,%0,1)               \n"
+    "lea        0x10(%1),%1                    \n"
+    "jg         75b                            \n"
+    "jmp        99f                            \n"
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align   4                              \n"
+  "100:                                        \n"
+    "movdqa     (%1),%%xmm0                    \n"
+    "sub        $0x10,%2                       \n"
+    "movdqa     %%xmm0,(%1,%0,1)               \n"
+    "lea        0x10(%1),%1                    \n"
+    "jg         100b                           \n"
+    // Extrude last pixel.
+  "99:                                         \n"
     "punpckhbw  %%xmm0,%%xmm0                  \n"
     "pshufhw    $0xff,%%xmm0,%%xmm0            \n"
     "punpckhqdq %%xmm0,%%xmm0                  \n"
@@ -2137,9 +2123,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     "sub        %1,%0                          \n"
     "shr        %3                             \n"
     "cmp        $0x0,%3                        \n"
-    "je         2f                             \n"
+    "je         100f                           \n"
+    "cmp        $0x20,%3                       \n"
+    "je         75f                            \n"
     "cmp        $0x40,%3                       \n"
-    "je         3f                             \n"
+    "je         50f                            \n"
+    "cmp        $0x60,%3                       \n"
+    "je         25f                            \n"
     "movd       %3,%%xmm0                      \n"
     "neg        %3                             \n"
     "add        $0x80,%3                       \n"
@@ -2147,11 +2137,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     "punpcklbw  %%xmm0,%%xmm5                  \n"
     "punpcklwd  %%xmm5,%%xmm5                  \n"
     "pshufd     $0x0,%%xmm5,%%xmm5             \n"
+    // General purpose row blend.
     ".p2align   4                              \n"
   "1:                                          \n"
     "movdqu     (%1),%%xmm0                    \n"
     "movdqu     (%1,%4,1),%%xmm2               \n"
-    "movdqu     %%xmm0,%%xmm1                  \n"
+    "movdqa     %%xmm0,%%xmm1                  \n"
     "punpcklbw  %%xmm2,%%xmm0                  \n"
     "punpckhbw  %%xmm2,%%xmm1                  \n"
     "pmaddubsw  %%xmm5,%%xmm0                  \n"
@@ -2163,25 +2155,57 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     "movdqu     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
     "jg         1b                             \n"
-    "jmp        4f                             \n"
+    "jmp        99f                            \n"
+    // Blend 25 / 75.
     ".p2align   4                              \n"
-  "2:                                          \n"
+  "25:                                         \n"
    "movdqu     (%1),%%xmm0                    \n"
+    "movdqu     (%1,%4,1),%%xmm1               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
     "sub        $0x10,%2                       \n"
     "movdqu     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
-    "jg         2b                             \n"
-    "jmp        4f                             \n"
+    "jg         25b                            \n"
+    "jmp        99f                            \n"
+    // Blend 50 / 50.
     ".p2align   4                              \n"
-  "3:                                          \n"
+  "50:                                         \n"
     "movdqu     (%1),%%xmm0                    \n"
-    "pavgb      (%1,%4,1),%%xmm0               \n"
+    "movdqu     (%1,%4,1),%%xmm1               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
     "sub        $0x10,%2                       \n"
     "movdqu     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
-    "jg         3b                             \n"
+    "jg         50b                            \n"
+    "jmp        99f                            \n"
+    // Blend 75 / 25.
     ".p2align   4                              \n"
-  "4:                                          \n"
+  "75:                                         \n"
+    "movdqu     (%1),%%xmm1                    \n"
+    "movdqu     (%1,%4,1),%%xmm0               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "sub        $0x10,%2                       \n"
+    "movdqu     %%xmm0,(%1,%0,1)               \n"
+    "lea        0x10(%1),%1                    \n"
+    "jg         75b                            \n"
+    "jmp        99f                            \n"
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align   4                              \n"
+  "100:                                        \n"
+    "movdqu     (%1),%%xmm0                    \n"
+    "sub        $0x10,%2                       \n"
+    "movdqu     %%xmm0,(%1,%0,1)               \n"
+    "lea        0x10(%1),%1                    \n"
+    "jg         100b                           \n"
+    // Extrude last pixel.
+  "99:                                         \n"
     "punpckhbw  %%xmm0,%%xmm0                  \n"
     "pshufhw    $0xff,%%xmm0,%%xmm0            \n"
     "punpckhqdq %%xmm0,%%xmm0                  \n"
......
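The GCC blocks rename the bare numeric labels 1/2/3/4 to 25/50/75/99/100 so they read as percentages, while still relying on the assembler's local-label rules: a numeric label may be defined repeatedly, and a reference like 25b or 99f binds to the nearest definition backward or forward. A tiny standalone illustration (x86 GNU inline asm; the function is hypothetical, not from libyuv):

// Hypothetical demo of GNU assembler numeric local labels, the mechanism
// behind "jg 25b" and "jmp 99f" in this patch.
int CountDown(int n) {
  __asm__(
      "25:              \n\t"  // numeric local label as loop head
      "subl $1, %0      \n\t"
      "jg   25b         \n\t"  // branch back to the nearest "25:"
      : "+r"(n)
      :
      : "cc");
  return n;  // 0 for any positive input
}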
@@ -289,12 +289,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
     shr        eax, 1
-    cmp        eax, 0
-    je         xloop1
-    sub        edi, esi
+    cmp        eax, 0  // dispatch to specialized filters if applicable.
+    je         xloop100
+    cmp        eax, 32
+    je         xloop75
     cmp        eax, 64
-    je         xloop2
+    je         xloop50
+    cmp        eax, 96
+    je         xloop25
     movd       xmm0, eax  // high fraction 0..127
     neg        eax
     add        eax, 128
@@ -319,36 +324,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
+    jmp        xloop99
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0  // duplicate last pixel for filtering
-    pop        edi
-    pop        esi
-    ret
+    // Blend 25 / 75.
+    align      16
+  xloop25:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+    // Blend 50 / 50.
     align      16
-  xloop1:
+  xloop50:
     movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
     sub        ecx, 4
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    jg         xloop1
+    jg         xloop50
+    jmp        xloop99
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+    // Blend 100 / 0 - Copy row unchanged.
     align      16
-  xloop2:
+  xloop100:
     movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
     sub        ecx, 4
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    jg         xloop2
+    jg         xloop100
+    // Extrude last pixel.
+  xloop99:
     shufps     xmm0, xmm0, 0xff
     movdqa     [esi + edi], xmm0
     pop        edi
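Note the only structural difference from the planar loops: the ARGB version still moves 16 bytes per iteration but decrements the counter by 4, because dst_width counts 4-byte ARGB pixels. A scalar model of the 50/50 case under that convention (function name is illustrative):

#include <cstdint>

// Scalar model of the ARGB 50/50 row blend: dst_width is in 4-byte ARGB
// pixels, so each 16-byte SIMD iteration covers 4 pixels ("sub ecx, 4").
// Channels blend independently with the pavgb-style rounding average.
void BlendArgbRows50(const uint8_t* row0, const uint8_t* row1,
                     uint8_t* dst, int dst_width_pixels) {
  for (int i = 0; i < dst_width_pixels * 4; ++i) {
    dst[i] = static_cast<uint8_t>((row0[i] + row1[i] + 1) >> 1);
  }
}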
@@ -585,12 +611,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                ptrdiff_t src_stride, int dst_width,
                                int source_y_fraction) {
   asm volatile (
+    "sub        %1,%0                          \n"
     "shr        %3                             \n"
     "cmp        $0x0,%3                        \n"
-    "je         2f                             \n"
-    "sub        %1,%0                          \n"
+    "je         100f                           \n"
+    "cmp        $0x20,%3                       \n"
+    "je         75f                            \n"
     "cmp        $0x40,%3                       \n"
-    "je         3f                             \n"
+    "je         50f                            \n"
+    "cmp        $0x60,%3                       \n"
+    "je         25f                            \n"
     "movd       %3,%%xmm0                      \n"
     "neg        %3                             \n"
     "add        $0x80,%3                       \n"
@@ -598,6 +629,8 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     "punpcklbw  %%xmm0,%%xmm5                  \n"
     "punpcklwd  %%xmm5,%%xmm5                  \n"
     "pshufd     $0x0,%%xmm5,%%xmm5             \n"
+    // General purpose row blend.
     ".p2align   4                              \n"
   "1:                                          \n"
     "movdqa     (%1),%%xmm0                    \n"
@@ -614,30 +647,62 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     "movdqa     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
     "jg         1b                             \n"
-    "jmp        4f                             \n"
+    "jmp        99f                            \n"
+    // Blend 25 / 75.
     ".p2align   4                              \n"
-  "2:                                          \n"
+  "25:                                         \n"
     "movdqa     (%1),%%xmm0                    \n"
+    "movdqa     (%1,%4,1),%%xmm1               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
     "sub        $0x4,%2                        \n"
     "movdqa     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
-    "jg         2b                             \n"
-    "jmp        4f                             \n"
+    "jg         25b                            \n"
+    "jmp        99f                            \n"
+    // Blend 50 / 50.
     ".p2align   4                              \n"
-  "3:                                          \n"
+  "50:                                         \n"
     "movdqa     (%1),%%xmm0                    \n"
-    "pavgb      (%1,%4,1),%%xmm0               \n"
+    "movdqa     (%1,%4,1),%%xmm1               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
     "sub        $0x4,%2                        \n"
     "movdqa     %%xmm0,(%1,%0,1)               \n"
     "lea        0x10(%1),%1                    \n"
-    "jg         3b                             \n"
-  "4:                                          \n"
+    "jg         50b                            \n"
+    "jmp        99f                            \n"
+    // Blend 75 / 25.
     ".p2align   4                              \n"
+  "75:                                         \n"
+    "movdqa     (%1),%%xmm1                    \n"
+    "movdqa     (%1,%4,1),%%xmm0               \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
+    "sub        $0x4,%2                        \n"
+    "movdqa     %%xmm0,(%1,%0,1)               \n"
+    "lea        0x10(%1),%1                    \n"
+    "jg         75b                            \n"
+    "jmp        99f                            \n"
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align   4                              \n"
+  "100:                                        \n"
+    "movdqa     (%1),%%xmm0                    \n"
+    "sub        $0x4,%2                        \n"
+    "movdqa     %%xmm0,(%1,%0,1)               \n"
+    "lea        0x10(%1),%1                    \n"
+    "jg         100b                           \n"
+    // Extrude last pixel.
+  "99:                                         \n"
     "shufps     $0xff,%%xmm0,%%xmm0            \n"
     "movdqa     %%xmm0,(%1,%0,1)               \n"
   : "+r"(dst_ptr),    // %0
     "+r"(src_ptr),    // %1
     "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"(static_cast<intptr_t>(src_stride))  // %4
   : "memory", "cc"
@@ -645,6 +710,7 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     , "xmm0", "xmm1", "xmm2", "xmm5"
 #endif
   );
 }
 #endif  // defined(__x86_64__) || defined(__i386__)
......
@@ -477,14 +477,19 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
                           int dst_width, int source_y_fraction) {
   asm volatile (
     "cmp        %4, #0                         \n"
-    "beq        2f                             \n"
+    "beq        100f                           \n"
     "add        %2, %1                         \n"
+    "cmp        %4, #64                        \n"
+    "beq        75f                            \n"
     "cmp        %4, #128                       \n"
-    "beq        3f                             \n"
+    "beq        50f                            \n"
+    "cmp        %4, #192                       \n"
+    "beq        25f                            \n"
     "vdup.8     d5, %4                         \n"
     "rsb        %4, #256                       \n"
     "vdup.8     d4, %4                         \n"
+    // General purpose row blend.
   "1:                                          \n"
     "vld1.u8    {q0}, [%1]!                    \n"
     "vld1.u8    {q1}, [%2]!                    \n"
@@ -497,23 +502,48 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
     "vrshrn.u16 d1, q14, #8                    \n"
     "vst1.u8    {q0}, [%0]!                    \n"
     "bgt        1b                             \n"
-    "b          4f                             \n"
-  "2:                                          \n"
+    "b          99f                            \n"
+    // Blend 25 / 75.
+  "25:                                         \n"
     "vld1.u8    {q0}, [%1]!                    \n"
+    "vld1.u8    {q1}, [%2]!                    \n"
     "subs       %3, #16                        \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vrhadd.u8  q0, q1                         \n"
     "vst1.u8    {q0}, [%0]!                    \n"
-    "bgt        2b                             \n"
-    "b          4f                             \n"
-  "3:                                          \n"
+    "bgt        25b                            \n"
+    "b          99f                            \n"
+    // Blend 50 / 50.
+  "50:                                         \n"
     "vld1.u8    {q0}, [%1]!                    \n"
     "vld1.u8    {q1}, [%2]!                    \n"
     "subs       %3, #16                        \n"
     "vrhadd.u8  q0, q1                         \n"
     "vst1.u8    {q0}, [%0]!                    \n"
-    "bgt        3b                             \n"
-  "4:                                          \n"
+    "bgt        50b                            \n"
+    "b          99f                            \n"
+    // Blend 75 / 25.
+  "75:                                         \n"
+    "vld1.u8    {q1}, [%1]!                    \n"
+    "vld1.u8    {q0}, [%2]!                    \n"
+    "subs       %3, #16                        \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vst1.u8    {q0}, [%0]!                    \n"
+    "bgt        75b                            \n"
+    "b          99f                            \n"
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    "vld1.u8    {q0}, [%1]!                    \n"
+    "subs       %3, #16                        \n"
+    "vst1.u8    {q0}, [%0]!                    \n"
+    "bgt        100b                           \n"
+  "99:                                         \n"
     "vst1.u8    {d1[7]}, [%0]                  \n"
   : "+r"(dst_ptr),  // %0
     "+r"(src_ptr),  // %1
......
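All variants now funnel every path into a single 99/xloop99 epilogue that duplicates the last output pixel one position past dst_width (the job the removed per-loop punpckhbw/shufps tails each did on their own, per the old "duplicate last pixel for filtering" comment), presumably so a following horizontal filter can read one sample beyond the row. A scalar sketch of that assumed semantics for the ARGB case; the SIMD code actually stores a full 16-byte vector of the duplicated pixel:

#include <cstdint>
#include <cstring>

// Sketch of the "extrude last pixel" epilogue (assumed semantics): copy
// the last written ARGB pixel to one slot past dst_width so edge reads
// by a subsequent filter stay in initialized memory. Requires dst_width >= 1.
void ExtrudeLastArgbPixel(uint8_t* dst_argb, int dst_width) {
  std::memcpy(dst_argb + 4 * dst_width,        // one past the last pixel
              dst_argb + 4 * (dst_width - 1),  // the last written pixel
              4);
}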