Commit f1b8e2a5 authored by fbarchard@google.com's avatar fbarchard@google.com

4 pixel sse2 point scaler for better use of SSE2 that can hold 4 coordinates.

BUG=none
TEST=*ARGBScaleFrom*None
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/3319004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@839 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 4a4b7374
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 838
Version: 839
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 838
#define LIBYUV_VERSION 839
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -199,61 +199,84 @@ static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
}
}
// Column scaling unfiltered. SSSE3 version.
// TODO(fbarchard): Port to Neon
// Column scaling unfiltered. SSE2 version.
#define HAS_SCALEARGBCOLS_SSE2
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
push esi
push edi
push esi
mov edi, [esp + 8 + 4] // dst_argb
mov esi, [esp + 8 + 8] // src_argb
mov ecx, [esp + 8 + 12] // dst_width
movd xmm2, [esp + 8 + 16] // x
movd xmm3, [esp + 8 + 20] // dx
pextrw eax, xmm2, 1 // get x0 integer. preroll
sub ecx, 2
jl xloop29
movdqa xmm0, xmm2 // x1 = x0 + dx
paddd xmm0, xmm3
punpckldq xmm2, xmm0 // x0 x1
punpckldq xmm3, xmm3 // dx dx
paddd xmm3, xmm3 // dx * 2, dx * 2
pextrw edx, xmm2, 3 // get x1 integer. preroll
pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
paddd xmm2, xmm0
paddd xmm3, xmm3 // 0, 0, 0, dx * 2
pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
paddd xmm2, xmm0 // x3 x2 x1 x0
paddd xmm3, xmm3 // 0, 0, 0, dx * 4
pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
pextrw eax, xmm2, 1 // get x0 integer.
pextrw edx, xmm2, 3 // get x1 integer.
cmp ecx, 0
jle xloop99
sub ecx, 4
jl xloop49
// 4 Pixel loop.
align 4
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
pextrw eax, xmm2, 5 // get x2 integer.
pextrw edx, xmm2, 7 // get x3 integer.
paddd xmm2, xmm3 // x += dx
punpckldq xmm0, xmm1 // x0 x1
movd xmm1, [esi + eax * 4] // 1 source x2 pixels
movd xmm4, [esi + edx * 4] // 1 source x3 pixels
pextrw eax, xmm2, 1 // get x0 integer. next iteration.
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
punpckldq xmm1, xmm4 // x2 x3
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
sub ecx, 4 // 4 pixels
movdqu [edi], xmm0
lea edi, [edi + 16]
jge xloop4
align 4
xloop49:
test ecx, 2
je xloop29
// 2 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
pextrw eax, xmm2, 5 // get x2 integer.
punpckldq xmm0, xmm1 // x0 x1
// 2 Pixel loop.
align 16
xloop2:
paddd xmm2, xmm3 // x += dx
movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels
movd xmm1, qword ptr [esi + edx * 4] // 1 source x1 pixels
punpckldq xmm0, xmm1 // x0 x1
pextrw eax, xmm2, 1 // get x0 integer. next iteration.
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
movq qword ptr [edi], xmm0
lea edi, [edi + 8]
sub ecx, 2 // 2 pixels
jge xloop2
align 16
xloop29:
test ecx, 1
je xloop99
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels
movd [edi], xmm0
align 16
// 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
align 4
xloop99:
pop edi
pop esi
pop edi
ret
}
}
......@@ -551,53 +574,67 @@ static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
}
#define HAS_SCALEARGBCOLS_SSE2
// TODO(fbarchard): p2align 5 is for nacl branch targets. Reduce using
// pseudoop, bundle or macro.
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
intptr_t x0 = 0, x1 = 0;
asm volatile (
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"sub $0x2,%2 \n"
"jl 29f \n"
"movdqa %%xmm2,%%xmm0 \n"
"paddd %%xmm3,%%xmm0 \n"
"punpckldq %%xmm0,%%xmm2 \n"
"punpckldq %%xmm3,%%xmm3 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"pshufd $0x11,%%xmm3,%%xmm0 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm3 \n"
"pextrw $0x3,%%xmm2,%k4 \n"
".p2align 5 \n"
"pshufd $0x5,%%xmm3,%%xmm0 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm3 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n"
"pextrw $0x1,%%xmm2,%k0 \n"
"pextrw $0x3,%%xmm2,%k1 \n"
"cmp $0x0,%4 \n"
"jl 99f \n"
"sub $0x4,%4 \n"
"jl 49f \n"
".p2align 2 \n"
BUNDLEALIGN
"2: \n"
"40: \n"
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
"pextrw $0x5,%%xmm2,%k0 \n"
"pextrw $0x7,%%xmm2,%k1 \n"
"paddd %%xmm3,%%xmm2 \n"
MEMOPREG(movd,0x00,1,3,4,xmm0) // movd (%1,%3,4),%%xmm0
MEMOPREG(movd,0x00,1,4,4,xmm1) // movd (%1,%4,4),%%xmm1
"punpckldq %%xmm1,%%xmm0 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"pextrw $0x3,%%xmm2,%k4 \n"
"movq %%xmm0," MEMACCESS(0) " \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
"sub $0x2,%2 \n"
"jge 2b \n"
MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
"pextrw $0x1,%%xmm2,%k0 \n"
"pextrw $0x3,%%xmm2,%k1 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"punpcklqdq %%xmm1,%%xmm0 \n"
"sub $0x4,%4 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 40b \n"
".p2align 5 \n"
"29: \n"
"add $0x1,%2 \n"
"jl 99f \n"
"49: \n"
"test $0x2,%4 \n"
"je 29f \n"
BUNDLEALIGN
MEMOPREG(movd,0x00,1,3,4,xmm0) // movd (%1,%3,4),%%xmm0
"movd %%xmm0," MEMACCESS(0) " \n"
".p2align 5 \n"
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
"pextrw $0x5,%%xmm2,%k0 \n"
"punpckldq %%xmm1,%%xmm0 \n"
"movq %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x8,2) ",%2 \n"
"29: \n"
"test $0x1,%4 \n"
"je 99f \n"
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
"movd %%xmm0," MEMACCESS(2) " \n"
"99: \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+rm"(dst_width), // %2
"+r"(x0), // %3
"+r"(x1) // %4
: "+a"(x0), // %0
"+d"(x1), // %1
"+r"(dst_argb), // %2
"+r"(src_argb), // %3
"+r"(dst_width) // %4
: "rm"(x), // %5
"rm"(dx) // %6
: "memory", "cc"
......@@ -605,7 +642,7 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
);
}
......@@ -705,6 +742,7 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
#endif
);
}
#endif // defined(__x86_64__) || defined(__i386__)
static void ScaleARGBRowDown2_C(const uint8* src_argb,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment