Commit 4e218671 authored by fbarchard@google.com

AddRows improvements for the general-purpose downsize box filter. The scale
SSE functions avoid pushad, and sub ecx is moved before the store so the
loop branch does not wait on the store.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/405007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@191 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 0b9a65b0
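The recurring loop-tail change in the hunks below hoists the sub that
decrements the pixel counter above the final store(s), so the flags consumed
by the ja at the bottom of the loop are computed early rather than immediately
before the branch. Stores and lea do not write EFLAGS, so the reordering is
behavior-preserving. A minimal sketch of the pattern (illustrative, not a
verbatim excerpt from this diff):

    wloop:
      // ... produce 16 output pixels in xmm0 ...
      sub     ecx, 16          // set flags early; nothing below alters them
      movdqa  [edx], xmm0      // store does not touch EFLAGS
      lea     edx, [edx + 16]  // neither does lea
      ja      wloop            // branches on the flags from sub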
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 190
+Version: 191
 License: BSD
 License File: LICENSE
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 190
+#define LIBYUV_VERSION 191
 #endif // INCLUDE_LIBYUV_VERSION_H_
@@ -80,7 +80,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
 __declspec(naked)
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
 __asm {
     push      edi
     push      esi
     push      ebp
@@ -154,9 +154,9 @@ __asm {
     movq      qword ptr [edx], xmm3
     movdqa    xmm7, xmm3
     palignr   xmm7, xmm7, 8
+    sub       ecx, 8
     movq      qword ptr [edx + esi], xmm7
     lea       edx, [edx + 2 * esi]
-    sub       ecx, 8
     ja        convertloop
     pop       ebp
...@@ -172,7 +172,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -172,7 +172,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int w) { int w) {
__asm { __asm {
push ebx push ebx
push esi push esi
push edi push edi
@@ -278,11 +278,11 @@ __asm {
     movlpd    qword ptr [edx], xmm3
     movhpd    qword ptr [ebx], xmm3
     punpckhdq xmm0, xmm7
+    sub       ecx, 8
     movlpd    qword ptr [edx + esi], xmm0
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    sub       ecx, 8
     ja        convertloop
     mov       esp, [esp + 16]
@@ -365,9 +365,9 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
     "movq %%xmm3,(%1) \n"
     "movdqa %%xmm3,%%xmm7 \n"
     "palignr $0x8,%%xmm7,%%xmm7 \n"
+    "sub $0x8,%2 \n"
     "movq %%xmm7,(%1,%4) \n"
     "lea (%1,%4,2),%1 \n"
-    "sub $0x8,%2 \n"
     "ja 1b \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
@@ -490,11 +490,11 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     "movlpd %xmm3,(%edx) \n"
     "movhpd %xmm3,(%ebx) \n"
     "punpckhdq %xmm7,%xmm0 \n"
+    "sub $0x8,%ecx \n"
     "movlpd %xmm0,(%edx,%esi,1) \n"
     "lea (%edx,%esi,2),%edx \n"
     "movhpd %xmm0,(%ebx,%ebp,1) \n"
     "lea (%ebx,%ebp,2),%ebx \n"
-    "sub $0x8,%ecx \n"
     "ja 1b \n"
     "mov 0x10(%esp),%esp \n"
     "pop %ebp \n"
@@ -628,9 +628,9 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
     "movq %%xmm11,(%1) \n"
     "movdqa %%xmm11,%%xmm15 \n"
     "palignr $0x8,%%xmm15,%%xmm15 \n"
+    "sub $0x10,%2 \n"
     "movq %%xmm15,(%1,%4) \n"
     "lea (%1,%4,2),%1 \n"
-    "sub $0x10,%2 \n"
     "ja 1b \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
@@ -734,11 +734,11 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     "movlpd %%xmm3,(%1) \n"
     "movhpd %%xmm3,(%2) \n"
     "punpckhdq %%xmm7,%%xmm8 \n"
+    "sub $0x8,%3 \n"
     "movlpd %%xmm8,(%1,%5) \n"
     "lea (%1,%5,2),%1 \n"
     "movhpd %%xmm8,(%2,%6) \n"
     "lea (%2,%6,2),%2 \n"
-    "sub $0x8,%3 \n"
     "ja 1b \n"
   : "+r"(src),    // %0
     "+r"(dst_a),  // %1
@@ -1023,11 +1023,11 @@ __asm {
     movdqa    xmm0, [eax]
     lea       eax, [eax - 16]
     pshufb    xmm0, xmm5
+    sub       ecx, 8
     movlpd    qword ptr [edx], xmm0
-    movhpd    qword ptr [edi], xmm0
     lea       edx, [edx + 8]
+    movhpd    qword ptr [edi], xmm0
     lea       edi, [edi + 8]
-    sub       ecx, 8
     ja        convertloop
     pop       edi
     ret
@@ -1042,18 +1042,18 @@ void MirrorRowUV_SSSE3(const uint8* src,
                        int width) {
   intptr_t temp_width = static_cast<intptr_t>(width);
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "lea -16(%0,%3,2),%0 \n"
     "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "lea -16(%0),%0 \n"
     "pshufb %%xmm5,%%xmm0 \n"
+    "sub $8,%3 \n"
     "movlpd %%xmm0,(%1) \n"
-    "movhpd %%xmm0,(%2) \n"
     "lea 8(%1),%1 \n"
+    "movhpd %%xmm0,(%2) \n"
     "lea 8(%2),%2 \n"
-    "sub $8,%3 \n"
     "ja 1b \n"
   : "+r"(src),    // %0
     "+r"(dst_a),  // %1
     "+r"(dst_b),  // %2
@@ -692,9 +692,9 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
     pand      xmm0, xmm5
     pand      xmm1, xmm5
     packuswb  xmm0, xmm1
+    sub       ecx, 16
     movdqa    [edx], xmm0
     lea       edx, [edx + 16]
-    sub       ecx, 16
     ja        wloop
     ret
@@ -733,9 +733,9 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
     pavgw     xmm1, xmm3
     packuswb  xmm0, xmm1
+    sub       ecx, 16
     movdqa    [edx], xmm0
     lea       edx, [edx + 16]
-    sub       ecx, 16
     ja        wloop
     pop       esi
@@ -750,28 +750,26 @@ __declspec(naked)
 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
+    mov       eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    mov       edx, [esp + 12]        // dst_ptr
+    mov       ecx, [esp + 16]        // dst_width
     pcmpeqb   xmm5, xmm5             // generate mask 0x000000ff
     psrld     xmm5, 24
   wloop:
-    movdqa    xmm0, [esi]
-    movdqa    xmm1, [esi + 16]
-    lea       esi, [esi + 32]
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + 16]
+    lea       eax, [eax + 32]
     pand      xmm0, xmm5
     pand      xmm1, xmm5
     packuswb  xmm0, xmm1
     packuswb  xmm0, xmm0
-    movq      qword ptr [edi], xmm0
-    lea       edi, [edi + 8]
     sub       ecx, 8
+    movq      qword ptr [edx], xmm0
+    lea       edx, [edx + 8]
     ja        wloop
-    popad
     ret
   }
 }
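The pushad removals follow one pattern: pushad saves all eight general-purpose
registers (32 bytes), so arguments sat at [esp + 32 + N]; the rewritten
prologues push only the callee-saved registers a function actually uses and
adjust the offsets by the number of bytes pushed. A sketch of the convention,
assuming the cdecl stack layout used throughout this file (illustrative, not a
verbatim excerpt):

    __asm {
      push    esi                  // two callee-saved registers = 8 bytes
      push    edi
      mov     eax, [esp + 8 + 4]   // arg 1: 8 pushed bytes + return address
      mov     esi, [esp + 8 + 8]   // arg 2
      // ... eax, ecx, edx are caller-saved and may be clobbered freely ...
      pop     edi
      pop     esi
      ret
    }

ScaleRowDown4_SSE2 above is the extreme case: with src_ptr in eax and dst_ptr
in edx (both caller-saved), it no longer needs to push anything at all.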
@@ -782,27 +780,28 @@ __declspec(naked)
 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                   uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
-    mov       ebx, [esp + 32 + 8]    // src_stride
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    push      esi
+    push      edi
+    mov       eax, [esp + 8 + 4]     // src_ptr
+    mov       esi, [esp + 8 + 8]     // src_stride
+    mov       edx, [esp + 8 + 12]    // dst_ptr
+    mov       ecx, [esp + 8 + 16]    // dst_width
+    lea       edi, [esi + esi * 2]   // src_stride * 3
     pcmpeqb   xmm7, xmm7             // generate mask 0x00ff00ff
     psrlw     xmm7, 8
-    lea       edx, [ebx + ebx * 2]   // src_stride * 3
   wloop:
-    movdqa    xmm0, [esi]
-    movdqa    xmm1, [esi + 16]
-    movdqa    xmm2, [esi + ebx]
-    movdqa    xmm3, [esi + ebx + 16]
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + 16]
+    movdqa    xmm2, [eax + esi]
+    movdqa    xmm3, [eax + esi + 16]
     pavgb     xmm0, xmm2             // average rows
     pavgb     xmm1, xmm3
-    movdqa    xmm2, [esi + ebx * 2]
-    movdqa    xmm3, [esi + ebx * 2 + 16]
-    movdqa    xmm4, [esi + edx]
-    movdqa    xmm5, [esi + edx + 16]
-    lea       esi, [esi + 32]
+    movdqa    xmm2, [eax + esi * 2]
+    movdqa    xmm3, [eax + esi * 2 + 16]
+    movdqa    xmm4, [eax + edi]
+    movdqa    xmm5, [eax + edi + 16]
+    lea       eax, [eax + 32]
     pavgb     xmm2, xmm4
     pavgb     xmm3, xmm5
     pavgb     xmm0, xmm2
@@ -824,12 +823,13 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
     pavgw     xmm0, xmm2
     packuswb  xmm0, xmm0
-    movq      qword ptr [edi], xmm0
-    lea       edi, [edi + 8]
     sub       ecx, 8
+    movq      qword ptr [edx], xmm0
+    lea       edx, [edx + 8]
     ja        wloop
-    popad
+    pop       edi
+    pop       esi
     ret
   }
 }
@@ -841,29 +841,27 @@ __declspec(naked)
 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
+    mov       eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    mov       edx, [esp + 12]        // dst_ptr
+    mov       ecx, [esp + 16]        // dst_width
     pcmpeqb   xmm5, xmm5             // generate mask isolating 1 src 8 bytes
     psrlq     xmm5, 56
   wloop:
-    movdqa    xmm0, [esi]
-    movdqa    xmm1, [esi + 16]
-    lea       esi, [esi + 32]
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + 16]
+    lea       eax, [eax + 32]
     pand      xmm0, xmm5
     pand      xmm1, xmm5
     packuswb  xmm0, xmm1             // 32->16
     packuswb  xmm0, xmm0             // 16->8
     packuswb  xmm0, xmm0             // 8->4
-    movd      dword ptr [edi], xmm0
-    lea       edi, [edi + 4]
     sub       ecx, 4
+    movd      dword ptr [edx], xmm0
+    lea       edx, [edx + 4]
     ja        wloop
-    popad
     ret
   }
 }
@@ -874,27 +872,29 @@ __declspec(naked)
 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                   uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
-    mov       ebx, [esp + 32 + 8]    // src_stride
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
-    lea       edx, [ebx + ebx * 2]   // src_stride * 3
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 12 + 4]    // src_ptr
+    mov       esi, [esp + 12 + 8]    // src_stride
+    mov       edx, [esp + 12 + 12]   // dst_ptr
+    mov       ecx, [esp + 12 + 16]   // dst_width
+    lea       edi, [esi + esi * 2]   // src_stride * 3
     pxor      xmm7, xmm7
   wloop:
-    movdqa    xmm0, [esi]            // average 8 rows to 1
-    movdqa    xmm1, [esi + 16]
-    movdqa    xmm2, [esi + ebx]
-    movdqa    xmm3, [esi + ebx + 16]
+    movdqa    xmm0, [eax]            // average 8 rows to 1
+    movdqa    xmm1, [eax + 16]
+    movdqa    xmm2, [eax + esi]
+    movdqa    xmm3, [eax + esi + 16]
     pavgb     xmm0, xmm2
     pavgb     xmm1, xmm3
-    movdqa    xmm2, [esi + ebx * 2]
-    movdqa    xmm3, [esi + ebx * 2 + 16]
-    movdqa    xmm4, [esi + edx]
-    movdqa    xmm5, [esi + edx + 16]
-    lea       ebp, [esi + ebx * 4]
-    lea       esi, [esi + 32]
+    movdqa    xmm2, [eax + esi * 2]
+    movdqa    xmm3, [eax + esi * 2 + 16]
+    movdqa    xmm4, [eax + edi]
+    movdqa    xmm5, [eax + edi + 16]
+    lea       ebp, [eax + esi * 4]
+    lea       eax, [eax + 32]
     pavgb     xmm2, xmm4
     pavgb     xmm3, xmm5
     pavgb     xmm0, xmm2
@@ -902,15 +902,15 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
     movdqa    xmm2, [ebp]
     movdqa    xmm3, [ebp + 16]
-    movdqa    xmm4, [ebp + ebx]
-    movdqa    xmm5, [ebp + ebx + 16]
+    movdqa    xmm4, [ebp + esi]
+    movdqa    xmm5, [ebp + esi + 16]
     pavgb     xmm2, xmm4
     pavgb     xmm3, xmm5
-    movdqa    xmm4, [ebp + ebx * 2]
-    movdqa    xmm5, [ebp + ebx * 2 + 16]
-    movdqa    xmm6, [ebp + edx]
+    movdqa    xmm4, [ebp + esi * 2]
+    movdqa    xmm5, [ebp + esi * 2 + 16]
+    movdqa    xmm6, [ebp + edi]
     pavgb     xmm4, xmm6
-    movdqa    xmm6, [ebp + edx + 16]
+    movdqa    xmm6, [ebp + edi + 16]
     pavgb     xmm5, xmm6
     pavgb     xmm2, xmm4
     pavgb     xmm3, xmm5
@@ -925,13 +925,15 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
     psrlw     xmm0, 3
     packuswb  xmm0, xmm0
     packuswb  xmm0, xmm0
-    movd      dword ptr [edi], xmm0
-    lea       edi, [edi + 4]
     sub       ecx, 4
+    movd      dword ptr [edx], xmm0
+    lea       edx, [edx + 4]
     ja        wloop
-    popad
+    pop       ebp
+    pop       edi
+    pop       esi
     ret
   }
 }
@@ -947,32 +949,30 @@ __declspec(naked)
 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
+    mov       eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    mov       edx, [esp + 12]        // dst_ptr
+    mov       ecx, [esp + 16]        // dst_width
     movdqa    xmm3, _shuf0
     movdqa    xmm4, _shuf1
     movdqa    xmm5, _shuf2
   wloop:
-    movdqa    xmm0, [esi]
-    movdqa    xmm1, [esi + 16]
-    lea       esi, [esi + 32]
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + 16]
+    lea       eax, [eax + 32]
     movdqa    xmm2, xmm1
     palignr   xmm1, xmm0, 8
     pshufb    xmm0, xmm3
     pshufb    xmm1, xmm4
     pshufb    xmm2, xmm5
-    movq      qword ptr [edi], xmm0
-    movq      qword ptr [edi + 8], xmm1
-    movq      qword ptr [edi + 16], xmm2
-    lea       edi, [edi + 24]
+    movq      qword ptr [edx], xmm0
+    movq      qword ptr [edx + 8], xmm1
+    movq      qword ptr [edx + 16], xmm2
+    lea       edx, [edx + 24]
     sub       ecx, 24
     ja        wloop
-    popad
     ret
   }
 }
@@ -997,11 +997,11 @@ __declspec(naked)
 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                        uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
-    mov       ebx, [esp + 32 + 8]    // src_stride
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    push      esi
+    mov       eax, [esp + 4 + 4]     // src_ptr
+    mov       esi, [esp + 4 + 8]     // src_stride
+    mov       edx, [esp + 4 + 12]    // dst_ptr
+    mov       ecx, [esp + 4 + 16]    // dst_width
     movdqa    xmm2, _shuf01
     movdqa    xmm3, _shuf11
     movdqa    xmm4, _shuf21
@@ -1010,27 +1010,27 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa    xmm7, _round34
   wloop:
-    movdqa    xmm0, [esi]            // pixels 0..7
-    movdqa    xmm1, [esi + ebx]
+    movdqa    xmm0, [eax]            // pixels 0..7
+    movdqa    xmm1, [eax + esi]
     pavgb     xmm0, xmm1
     pshufb    xmm0, xmm2
     pmaddubsw xmm0, xmm5
     paddsw    xmm0, xmm7
     psrlw     xmm0, 2
     packuswb  xmm0, xmm0
-    movq      qword ptr [edi], xmm0
-    movdqu    xmm0, [esi + 8]        // pixels 8..15
-    movdqu    xmm1, [esi + ebx + 8]
+    movq      qword ptr [edx], xmm0
+    movdqu    xmm0, [eax + 8]        // pixels 8..15
+    movdqu    xmm1, [eax + esi + 8]
     pavgb     xmm0, xmm1
     pshufb    xmm0, xmm3
     pmaddubsw xmm0, xmm6
     paddsw    xmm0, xmm7
     psrlw     xmm0, 2
     packuswb  xmm0, xmm0
-    movq      qword ptr [edi + 8], xmm0
-    movdqa    xmm0, [esi + 16]       // pixels 16..23
-    movdqa    xmm1, [esi + ebx + 16]
-    lea       esi, [esi + 32]
+    movq      qword ptr [edx + 8], xmm0
+    movdqa    xmm0, [eax + 16]       // pixels 16..23
+    movdqa    xmm1, [eax + esi + 16]
+    lea       eax, [eax + 32]
     pavgb     xmm0, xmm1
     pshufb    xmm0, xmm4
     movdqa    xmm1, _madd21
@@ -1038,12 +1038,12 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     paddsw    xmm0, xmm7
     psrlw     xmm0, 2
     packuswb  xmm0, xmm0
-    movq      qword ptr [edi + 16], xmm0
-    lea       edi, [edi + 24]
     sub       ecx, 24
+    movq      qword ptr [edx + 16], xmm0
+    lea       edx, [edx + 24]
     ja        wloop
-    popad
+    pop       esi
     ret
   }
 }
@@ -1054,11 +1054,11 @@ __declspec(naked)
 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                        uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
-    mov       ebx, [esp + 32 + 8]    // src_stride
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    push      esi
+    mov       eax, [esp + 4 + 4]     // src_ptr
+    mov       esi, [esp + 4 + 8]     // src_stride
+    mov       edx, [esp + 4 + 12]    // dst_ptr
+    mov       ecx, [esp + 4 + 16]    // dst_width
     movdqa    xmm2, _shuf01
     movdqa    xmm3, _shuf11
     movdqa    xmm4, _shuf21
@@ -1067,8 +1067,8 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa    xmm7, _round34
   wloop:
-    movdqa    xmm0, [esi]            // pixels 0..7
-    movdqa    xmm1, [esi + ebx]
+    movdqa    xmm0, [eax]            // pixels 0..7
+    movdqa    xmm1, [eax + esi]
     pavgb     xmm1, xmm0
     pavgb     xmm0, xmm1
     pshufb    xmm0, xmm2
@@ -1076,9 +1076,9 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     paddsw    xmm0, xmm7
     psrlw     xmm0, 2
     packuswb  xmm0, xmm0
-    movq      qword ptr [edi], xmm0
-    movdqu    xmm0, [esi + 8]        // pixels 8..15
-    movdqu    xmm1, [esi + ebx + 8]
+    movq      qword ptr [edx], xmm0
+    movdqu    xmm0, [eax + 8]        // pixels 8..15
+    movdqu    xmm1, [eax + esi + 8]
     pavgb     xmm1, xmm0
     pavgb     xmm0, xmm1
     pshufb    xmm0, xmm3
@@ -1086,10 +1086,10 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     paddsw    xmm0, xmm7
     psrlw     xmm0, 2
     packuswb  xmm0, xmm0
-    movq      qword ptr [edi + 8], xmm0
-    movdqa    xmm0, [esi + 16]       // pixels 16..23
-    movdqa    xmm1, [esi + ebx + 16]
-    lea       esi, [esi + 32]
+    movq      qword ptr [edx + 8], xmm0
+    movdqa    xmm0, [eax + 16]       // pixels 16..23
+    movdqa    xmm1, [eax + esi + 16]
+    lea       eax, [eax + 32]
     pavgb     xmm1, xmm0
     pavgb     xmm0, xmm1
     pshufb    xmm0, xmm4
@@ -1098,12 +1098,12 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     paddsw    xmm0, xmm7
     psrlw     xmm0, 2
     packuswb  xmm0, xmm0
-    movq      qword ptr [edi + 16], xmm0
-    lea       edi, [edi + 24]
     sub       ecx, 24
+    movq      qword ptr [edx + 16], xmm0
+    lea       edx, [edx + 24]
     ja        wloop
-    popad
+    pop       esi
     ret
   }
 }
@@ -1116,30 +1116,28 @@ __declspec(naked)
 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
-    mov       edx, [esp + 32 + 8]    // src_stride
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    mov       eax, [esp + 4]         // src_ptr
+                                     // src_stride ignored
+    mov       edx, [esp + 12]        // dst_ptr
+    mov       ecx, [esp + 16]        // dst_width
     movdqa    xmm4, _shuf38a
     movdqa    xmm5, _shuf38b
   xloop:
-    movdqa    xmm0, [esi]            // 16 pixels -> 0,1,2,3,4,5
-    movdqa    xmm1, [esi + 16]       // 16 pixels -> 6,7,8,9,10,11
-    lea       esi, [esi + 32]
+    movdqa    xmm0, [eax]            // 16 pixels -> 0,1,2,3,4,5
+    movdqa    xmm1, [eax + 16]       // 16 pixels -> 6,7,8,9,10,11
+    lea       eax, [eax + 32]
     pshufb    xmm0, xmm4
     pshufb    xmm1, xmm5
     paddusb   xmm0, xmm1
-    movq      qword ptr [edi], xmm0  // write 12 pixels
-    movhlps   xmm1, xmm0
-    movd      [edi + 8], xmm1
-    lea       edi, [edi + 12]
     sub       ecx, 12
+    movq      qword ptr [edx], xmm0  // write 12 pixels
+    movhlps   xmm1, xmm0
+    movd      [edx + 8], xmm1
+    lea       edx, [edx + 12]
     ja        xloop
-    popad
     ret
   }
 }
@@ -1149,19 +1147,20 @@ __declspec(naked)
 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                        uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
-    mov       edx, [esp + 32 + 8]    // src_stride
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    push      esi
+    push      ebx
+    mov       eax, [esp + 8 + 4]     // src_ptr
+    mov       esi, [esp + 8 + 8]     // src_stride
+    mov       edx, [esp + 8 + 12]    // dst_ptr
+    mov       ecx, [esp + 8 + 16]    // dst_width
     movdqa    xmm4, _shufac0
     movdqa    xmm5, _shufac3
     movdqa    xmm6, _scaleac3
     pxor      xmm7, xmm7
   xloop:
-    movdqa    xmm0, [esi]            // sum up 3 rows into xmm0/1
-    movdqa    xmm2, [esi + edx]
+    movdqa    xmm0, [eax]            // sum up 3 rows into xmm0/1
+    movdqa    xmm2, [eax + esi]
     movhlps   xmm1, xmm0
     movhlps   xmm3, xmm2
     punpcklbw xmm0, xmm7
@@ -1170,8 +1169,8 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
     punpcklbw xmm3, xmm7
     paddusw   xmm0, xmm2
     paddusw   xmm1, xmm3
-    movdqa    xmm2, [esi + edx * 2]
-    lea       esi, [esi + 16]
+    movdqa    xmm2, [eax + esi * 2]
+    lea       eax, [eax + 16]
     movhlps   xmm3, xmm2
     punpcklbw xmm2, xmm7
     punpcklbw xmm3, xmm7
@@ -1196,14 +1195,15 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
     pmulhuw   xmm2, xmm6             // divide by 9,9,6, 9,9,6
     packuswb  xmm2, xmm2
-    movd      [edi], xmm2            // write 6 pixels
-    pextrw    eax, xmm2, 2
-    mov       [edi + 4], ax
-    lea       edi, [edi + 6]
     sub       ecx, 6
+    movd      [edx], xmm2            // write 6 pixels
+    pextrw    ebx, xmm2, 2
+    mov       [edx + 4], bx
+    lea       edx, [edx + 6]
     ja        xloop
-    popad
+    pop       ebx
+    pop       esi
     ret
   }
 }
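In ScaleRowDown38_3_Int_SSSE3 above and ScaleRowDown38_2_Int_SSSE3 below, the
pextrw scratch register changes from ax to bx because eax now holds src_ptr;
ebx is callee-saved, hence the added push/pop ebx. The 6-pixel store sequence
after the change (excerpted from the hunk above, with comments added here for
exposition):

    sub       ecx, 6
    movd      [edx], xmm2      // write the first 4 of 6 output pixels
    pextrw    ebx, xmm2, 2     // word 2 carries the remaining 2 pixels
    mov       [edx + 4], bx
    lea       edx, [edx + 6]
    ja        xloop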
@@ -1213,20 +1213,21 @@ __declspec(naked)
 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                        uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
-    mov       edx, [esp + 32 + 8]    // src_stride
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
+    push      esi
+    push      ebx
+    mov       eax, [esp + 8 + 4]     // src_ptr
+    mov       esi, [esp + 8 + 8]     // src_stride
+    mov       edx, [esp + 8 + 12]    // dst_ptr
+    mov       ecx, [esp + 8 + 16]    // dst_width
     movdqa    xmm4, _shufab0
     movdqa    xmm5, _shufab1
     movdqa    xmm6, _shufab2
     movdqa    xmm7, _scaleab2
   xloop:
-    movdqa    xmm2, [esi]            // average 2 rows into xmm2
-    pavgb     xmm2, [esi + edx]
-    lea       esi, [esi + 16]
+    movdqa    xmm2, [eax]            // average 2 rows into xmm2
+    pavgb     xmm2, [eax + esi]
+    lea       eax, [eax + 16]
     movdqa    xmm0, xmm2             // 16 pixels -> 0,1,2,3,4,5 of xmm0
     pshufb    xmm0, xmm4
@@ -1239,65 +1240,72 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
     pmulhuw   xmm0, xmm7             // divide by 3,3,2, 3,3,2
     packuswb  xmm0, xmm0
-    movd      [edi], xmm0            // write 6 pixels
-    pextrw    eax, xmm0, 2
-    mov       [edi + 4], ax
-    lea       edi, [edi + 6]
     sub       ecx, 6
+    movd      [edx], xmm0            // write 6 pixels
+    pextrw    ebx, xmm0, 2
+    mov       [edx + 4], bx
+    lea       edx, [edx + 6]
     ja        xloop
-    popad
+    pop       ebx
+    pop       esi
     ret
   }
 }
 #define HAS_SCALEADDROWS_SSE2
-// Reads 8xN bytes and produces 16 shorts at a time.
+// Reads 16xN bytes and produces 16 shorts at a time.
 __declspec(naked)
 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                               uint16* dst_ptr, int src_width,
                               int src_height) {
   __asm {
-    pushad
-    mov       esi, [esp + 32 + 4]    // src_ptr
-    mov       edx, [esp + 32 + 8]    // src_stride
-    mov       edi, [esp + 32 + 12]   // dst_ptr
-    mov       ecx, [esp + 32 + 16]   // dst_width
-    mov       ebx, [esp + 32 + 20]   // height
-    pxor      xmm5, xmm5
+    push      esi
+    push      edi
+    push      ebx
+    push      ebp
+    mov       esi, [esp + 16 + 4]    // src_ptr
+    mov       edx, [esp + 16 + 8]    // src_stride
+    mov       edi, [esp + 16 + 12]   // dst_ptr
+    mov       ecx, [esp + 16 + 16]   // dst_width
+    mov       ebx, [esp + 16 + 20]   // height
+    pxor      xmm4, xmm4
     dec       ebx
   xloop:
     // first row
-    movdqa    xmm2, [esi]
+    movdqa    xmm0, [esi]
     lea       eax, [esi + edx]
-    movhlps   xmm3, xmm2
+    movdqa    xmm1, xmm0
+    punpcklbw xmm0, xmm4
+    punpckhbw xmm1, xmm4
+    lea       esi, [esi + 16]
     mov       ebp, ebx
-    punpcklbw xmm2, xmm5
-    punpcklbw xmm3, xmm5
     // sum remaining rows
   yloop:
-    movdqa    xmm0, [eax]            // read 16 pixels
+    movdqa    xmm2, [eax]            // read 16 pixels
     lea       eax, [eax + edx]       // advance to next row
-    movhlps   xmm1, xmm0
-    punpcklbw xmm0, xmm5
-    punpcklbw xmm1, xmm5
-    paddusw   xmm2, xmm0             // sum 16 words
-    paddusw   xmm3, xmm1
+    movdqa    xmm3, xmm2
+    punpcklbw xmm2, xmm4
+    punpckhbw xmm3, xmm4
+    paddusw   xmm0, xmm2             // sum 16 words
+    paddusw   xmm1, xmm3
     sub       ebp, 1
     ja        yloop
-    movdqa    [edi], xmm2
-    movdqa    [edi + 16], xmm3
+    movdqa    [edi], xmm0
+    movdqa    [edi + 16], xmm1
     lea       edi, [edi + 32]
-    lea       esi, [esi + 16]
     sub       ecx, 16
    ja        xloop
-    popad
+    pop       ebp
+    pop       ebx
+    pop       edi
+    pop       esi
     ret
   }
 }
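ScaleAddRows_SSE2 produces the vertical sums the box filter divides later: it
adds src_height rows of 16 source bytes into 16-bit column totals. The rewrite
also switches from movhlps plus two punpcklbw to punpcklbw/punpckhbw on a full
16-byte register, widening all 16 pixels per iteration. A scalar C++ reference
for what the loop computes (an illustrative equivalent written for this note,
not code from the commit; the SSE2 version uses saturating adds, paddusw):

    static void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
                               uint16* dst_ptr, int src_width, int src_height) {
      for (int x = 0; x < src_width; ++x) {
        uint16 sum = 0;
        const uint8* src = src_ptr + x;
        for (int y = 0; y < src_height; ++y) {
          sum += *src;       // paddusw saturates at 65535; plain add shown here
          src += src_stride;
        }
        dst_ptr[x] = sum;    // the SSE2 loop emits 16 such sums per iteration
      }
    }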
@@ -1508,9 +1516,9 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     paddsw    xmm0, xmm1
     psrlw     xmm0, 2
     packuswb  xmm0, xmm0
+    sub       ecx, 24
     movq      qword ptr [edx+16], xmm0
     lea       edx, [edx+24]
-    sub       ecx, 24
     ja        wloop
     ret
   }
@@ -1527,7 +1535,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
-    "1:"
+    "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
     "lea 0x20(%0),%0 \n"
@@ -1551,7 +1559,7 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
-    "1:"
+    "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
     "movdqa (%0,%3,1),%%xmm2 \n"
@@ -1586,7 +1594,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrld $0x18,%%xmm5 \n"
-    "1:"
+    "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
     "lea 0x20(%0),%0 \n"
@@ -1613,7 +1621,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
     "pcmpeqb %%xmm7,%%xmm7 \n"
     "psrlw $0x8,%%xmm7 \n"
     "lea (%4,%4,2),%3 \n"
-    "1:"
+    "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
     "movdqa (%0,%4,1),%%xmm2 \n"
@@ -1663,27 +1671,72 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlq $0x38,%%xmm5 \n"
-    "1:"
+    "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
     "lea 0x20(%0),%0 \n"
     "pand %%xmm5,%%xmm0 \n"
     "pand %%xmm5,%%xmm1 \n"
     "packuswb %%xmm1,%%xmm0 \n"
     "packuswb %%xmm0,%%xmm0 \n"
     "packuswb %%xmm0,%%xmm0 \n"
     "movd %%xmm0,(%1) \n"
     "lea 0x4(%1),%1 \n"
     "sub $0x4,%2 \n"
     "ja 1b \n"
   : "+r"(src_ptr),   // %0
     "+r"(dst_ptr),   // %1
     "+r"(dst_width)  // %2
   :
   : "memory", "cc"
   );
 }
+#define HAS_SCALEADDROWS_SSE2
+static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+                              uint16* dst_ptr, int src_width, int src_height) {
+  int tmp_height = 0;
+  intptr_t tmp_src = 0;
+  asm volatile (
+    "pxor %%xmm4,%%xmm4 \n"
+    "sub $0x1,%3 \n"
+    "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "lea (%0,%6,1),%5 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "punpcklbw %%xmm4,%%xmm0 \n"
+    "punpckhbw %%xmm4,%%xmm1 \n"
+    "lea 0x10(%0),%0 \n"
+    "mov %3,%4 \n"
+    "2: \n"
+    "movdqa (%5),%%xmm2 \n"
+    "lea (%5,%6,1),%5 \n"
+    "movdqa %%xmm2,%%xmm3 \n"
+    "punpcklbw %%xmm4,%%xmm2 \n"
+    "punpckhbw %%xmm4,%%xmm3 \n"
+    "paddusw %%xmm2,%%xmm0 \n"
+    "paddusw %%xmm3,%%xmm1 \n"
+    "sub $0x1,%4 \n"
+    "ja 2b \n"
+    "movdqa %%xmm0,(%1) \n"
+    "movdqa %%xmm1,0x10(%1) \n"
+    "lea 0x20(%1),%1 \n"
+    "sub $0x10,%2 \n"
+    "ja 1b \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+rm"(src_width),  // %2
+    "+rm"(src_height), // %3
+    "+r"(tmp_height),  // %4
+    "+r"(tmp_src)      // %5
+  : "r"(static_cast<intptr_t>(src_stride))  // %6
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
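This new portable inline-asm ScaleAddRows_SSE2 replaces the two hard-coded
variants deleted further down: the i386 pusha version and the x86-64 version
that pinned r10/r11. Letting the compiler allocate tmp_height and tmp_src lets
one body serve both 32-bit and 64-bit builds. A hypothetical call site for a
4x vertical box sum (buffer names and the rounding step are illustrative, not
from this commit; src_width must be a multiple of 16 and both pointers 16-byte
aligned, since the loop uses movdqa loads and stores):

    // Sum 4 rows of 64 pixels into 16-bit totals, then finish the average.
    uint16 row_sums[64];  // caller must ensure 16-byte alignment
    ScaleAddRows_SSE2(src, src_stride, row_sums, 64, 4);
    for (int x = 0; x < 64; ++x) {
      dst[x] = static_cast<uint8>((row_sums[x] + 2) >> 2);  // rounded /4
    }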
 #if defined(__i386__)
@@ -1740,9 +1793,9 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
     "psrlw $0x3,%xmm0 \n"
     "packuswb %xmm0,%xmm0 \n"
     "packuswb %xmm0,%xmm0 \n"
+    "sub $0x4,%ecx \n"
     "movd %xmm0,(%edi) \n"
     "lea 0x4(%edi),%edi \n"
-    "sub $0x4,%ecx \n"
     "ja 1b \n"
     "popa \n"
     "ret \n"
@@ -1827,9 +1880,9 @@ extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     "paddsw %xmm7,%xmm0 \n"
     "psrlw $0x2,%xmm0 \n"
     "packuswb %xmm0,%xmm0 \n"
+    "sub $0x18,%ecx \n"
     "movq %xmm0,0x10(%edi) \n"
     "lea 0x18(%edi),%edi \n"
-    "sub $0x18,%ecx \n"
     "ja 1b \n"
     "popa \n"
@@ -1884,9 +1937,9 @@ extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     "paddsw %xmm7,%xmm0 \n"
     "psrlw $0x2,%xmm0 \n"
     "packuswb %xmm0,%xmm0 \n"
+    "sub $0x18,%ecx \n"
     "movq %xmm0,0x10(%edi) \n"
     "lea 0x18(%edi),%edi \n"
-    "sub $0x18,%ecx \n"
     "ja 1b \n"
     "popa \n"
     "ret \n"
@@ -1914,9 +1967,9 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
     "paddusb %xmm1,%xmm0 \n"
     "movq %xmm0,(%edi) \n"
     "movhlps %xmm0,%xmm1 \n"
+    "sub $0xc,%ecx \n"
     "movd %xmm1,0x8(%edi) \n"
     "lea 0xc(%edi),%edi \n"
-    "sub $0xc,%ecx \n"
     "ja 1b \n"
     "popa \n"
     "ret \n"
@@ -2017,49 +2070,6 @@ extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
   );
 #endif // __PIC__
-#define HAS_SCALEADDROWS_SSE2
-extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint16* dst_ptr, int src_width,
-                                  int src_height);
-  asm(
-    DECLARE_FUNCTION(ScaleAddRows_SSE2)
-    "pusha \n"
-    "mov 0x24(%esp),%esi \n"
-    "mov 0x28(%esp),%edx \n"
-    "mov 0x2c(%esp),%edi \n"
-    "mov 0x30(%esp),%ecx \n"
-    "mov 0x34(%esp),%ebx \n"
-    "pxor %xmm5,%xmm5 \n"
-    "1:"
-    "movdqa (%esi),%xmm2 \n"
-    "lea (%esi,%edx,1),%eax \n"
-    "movhlps %xmm2,%xmm3 \n"
-    "lea -0x1(%ebx),%ebp \n"
-    "punpcklbw %xmm5,%xmm2 \n"
-    "punpcklbw %xmm5,%xmm3 \n"
-    "2:"
-    "movdqa (%eax),%xmm0 \n"
-    "lea (%eax,%edx,1),%eax \n"
-    "movhlps %xmm0,%xmm1 \n"
-    "punpcklbw %xmm5,%xmm0 \n"
-    "punpcklbw %xmm5,%xmm1 \n"
-    "paddusw %xmm0,%xmm2 \n"
-    "paddusw %xmm1,%xmm3 \n"
-    "sub $0x1,%ebp \n"
-    "ja 2b \n"
-    "movdqa %xmm2,(%edi) \n"
-    "movdqa %xmm3,0x10(%edi) \n"
-    "lea 0x20(%edi),%edi \n"
-    "lea 0x10(%esi),%esi \n"
-    "sub $0x10,%ecx \n"
-    "ja 1b \n"
-    "popa \n"
-    "ret \n"
-  );
 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
 #define HAS_SCALEFILTERROWS_SSE2
 extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
@@ -2554,46 +2564,6 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
   );
 }
-#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
-                              uint16* dst_ptr, int src_width,
-                              int src_height) {
-  asm volatile (
-    "pxor %%xmm5,%%xmm5 \n"
-    "1:"
-    "movdqa (%0),%%xmm2 \n"
-    "lea (%0,%4,1),%%r10 \n"
-    "movhlps %%xmm2,%%xmm3 \n"
-    "lea -0x1(%3),%%r11 \n"
-    "punpcklbw %%xmm5,%%xmm2 \n"
-    "punpcklbw %%xmm5,%%xmm3 \n"
-    "2:"
-    "movdqa (%%r10),%%xmm0 \n"
-    "lea (%%r10,%4,1),%%r10 \n"
-    "movhlps %%xmm0,%%xmm1 \n"
-    "punpcklbw %%xmm5,%%xmm0 \n"
-    "punpcklbw %%xmm5,%%xmm1 \n"
-    "paddusw %%xmm0,%%xmm2 \n"
-    "paddusw %%xmm1,%%xmm3 \n"
-    "sub $0x1,%%r11 \n"
-    "ja 2b \n"
-    "movdqa %%xmm2,(%1) \n"
-    "movdqa %%xmm3,0x10(%1) \n"
-    "lea 0x20(%1),%1 \n"
-    "lea 0x10(%0),%0 \n"
-    "sub $0x10,%2 \n"
-    "ja 1b \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(src_width),  // %2
-    "+r"(src_height)  // %3
-  : "r"(static_cast<intptr_t>(src_stride))  // %4
-  : "memory", "cc", "r10", "r11"
-  );
-}
 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
 #define HAS_SCALEFILTERROWS_SSE2
 static void ScaleFilterRows_SSE2(uint8* dst_ptr,