Commit e3cc7694 authored by fbarchard@google.com

4 pixel version of affine for gcc and aligned version of win.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/714007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@320 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 845e94d1
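For context, ARGBAffineRow_SSE2 steps a source (u, v) coordinate by (du, dv) for each destination pixel and copies one 4-byte ARGB pixel per step, with the source byte offset formed as x * 4 + y * stride (see the pmaddwd comment in the win diff below). A minimal scalar sketch of that behaviour, assuming the uv_dudv layout {u, v, du, dv} implied by the diff; the function name and the use of memcpy are illustrative, not libyuv's actual C reference:

#include <stdint.h>
#include <string.h>

// Illustrative scalar sketch of the work ARGBAffineRow_SSE2 vectorizes.
// uv_dudv = {u, v, du, dv}; each destination pixel reads the source pixel
// at byte offset x * 4 + y * stride, then steps (u, v) by (du, dv).
static void AffineRowSketch(const uint8_t* src_argb, int src_argb_stride,
                            uint8_t* dst_argb, const float* uv_dudv,
                            int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // truncate toward zero, as cvttps2dq does
    int y = (int)v;
    memcpy(dst_argb + i * 4, src_argb + y * src_argb_stride + x * 4, 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}

The change below moves the gcc path from two to four of these pixels per loop iteration and keeps the win path at four, while switching its stores to aligned-size 8-byte writes.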
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 319
+Version: 320
 License: BSD
 License File: LICENSE
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 319
+#define LIBYUV_VERSION 320
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -3220,61 +3220,91 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #endif  // HAS_ARGBSHADE_SSE2
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
+// TODO(fbarchard): Find 64 bit way to avoid masking.
+// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
 // Copy ARGB pixels from source image with slope to a row of destination.
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
+  intptr_t temp = 0;
   asm volatile (
     "movq (%3),%%xmm2 \n"
-    "movq 0x8(%3),%%xmm3 \n"
+    "movq 0x8(%3),%%xmm7 \n"
     "shl $0x10,%1 \n"
     "add $0x4,%1 \n"
-    "movd %1,%%xmm4 \n"
-    "xor %1,%1 \n"  // cleanse upper bits.
-    "sub $0x2,%4 \n"
-    "jl 29f \n"
+    "movd %1,%%xmm5 \n"
+    "sub $0x4,%4 \n"
+    "jl 49f \n"
+    "pshufd $0x44,%%xmm7,%%xmm7 \n"
+    "pshufd $0x0,%%xmm5,%%xmm5 \n"
     "movdqa %%xmm2,%%xmm0 \n"
-    "addps %%xmm3,%%xmm0 \n"
+    "addps %%xmm7,%%xmm0 \n"
     "movlhps %%xmm0,%%xmm2 \n"
-    "pshufd $0x0,%%xmm4,%%xmm4 \n"
-    "movlhps %%xmm3,%%xmm3 \n"
-    "addps %%xmm3,%%xmm3 \n"
-    "pshufd $0x0,%%xmm4,%%xmm4 \n"
+    "movdqa %%xmm7,%%xmm4 \n"
+    "addps %%xmm4,%%xmm4 \n"
+    "movdqa %%xmm2,%%xmm3 \n"
+    "addps %%xmm4,%%xmm3 \n"
+    "addps %%xmm4,%%xmm4 \n"
-  // 2 pixel loop \n"
-    ".p2align 2 \n"
-  "20: \n"
-    "cvttps2dq %%xmm2,%%xmm1 \n"
-    "packssdw %%xmm1,%%xmm1 \n"
-    "pmaddwd %%xmm4,%%xmm1 \n"
-    "addps %%xmm3,%%xmm2 \n"
-    "movd %%xmm1,%1 \n"
+  // 4 pixel loop \n"
+    ".p2align 4 \n"
+  "40: \n"
+    "cvttps2dq %%xmm2,%%xmm0 \n"
+    "cvttps2dq %%xmm3,%%xmm1 \n"
+    "packssdw %%xmm1,%%xmm0 \n"
+    "pmaddwd %%xmm5,%%xmm0 \n"
+#if defined(__x86_64__)
+    "movq %%xmm0,%1 \n"
+    "mov %1,%5 \n"
     "and $0x0fffffff,%1 \n"
-    "movdqa %%xmm1,%%xmm5 \n"
-    "pshufd $0x55,%%xmm5,%%xmm5 \n"
-    "movd (%0,%1,1),%%xmm0 \n"
-    "movd %%xmm5,%1 \n"
+    "shr $32,%5 \n"
+    "pshufd $0xEE,%%xmm0,%%xmm0 \n"
+#else
+    "movd %%xmm0,%1 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+    "movd %%xmm0,%5 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+#endif
+    "movd (%0,%1,1),%%xmm1 \n"
+    "movd (%0,%5,1),%%xmm6 \n"
+    "punpckldq %%xmm6,%%xmm1 \n"
+    "addps %%xmm4,%%xmm2 \n"
+    "movq %%xmm1,(%2) \n"
+#if defined(__x86_64__)
+    "movq %%xmm0,%1 \n"
+    "mov %1,%5 \n"
     "and $0x0fffffff,%1 \n"
-    "movd (%0,%1,1),%%xmm5 \n"
-    "punpckldq %%xmm5,%%xmm0 \n"
-    "sub $0x2,%4 \n"
-    "movq %%xmm0,(%2) \n"
-    "lea 0x8(%2),%2 \n"
-    "jge 20b \n"
-  "29: \n"
-    "add $0x1,%4 \n"
+    "shr $32,%5 \n"
+#else
+    "movd %%xmm0,%1 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+    "movd %%xmm0,%5 \n"
+#endif
+    "movd (%0,%1,1),%%xmm0 \n"
+    "movd (%0,%5,1),%%xmm6 \n"
+    "punpckldq %%xmm6,%%xmm0 \n"
+    "addps %%xmm4,%%xmm3 \n"
+    "sub $0x4,%4 \n"
+    "movq %%xmm0,0x08(%2) \n"
+    "lea 0x10(%2),%2 \n"
+    "jge 40b \n"
+  "49: \n"
+    "add $0x3,%4 \n"
     "jl 19f \n"
   // 1 pixel loop \n"
-    ".p2align 2 \n"
+    ".p2align 4 \n"
   "10: \n"
-    "cvttps2dq %%xmm2,%%xmm1 \n"
-    "packssdw %%xmm1,%%xmm1 \n"
-    "pmaddwd %%xmm4,%%xmm1 \n"
-    "addps %%xmm3,%%xmm2 \n"
-    "movd %%xmm1,%1 \n"
+    "cvttps2dq %%xmm2,%%xmm0 \n"
+    "packssdw %%xmm0,%%xmm0 \n"
+    "pmaddwd %%xmm5,%%xmm0 \n"
+    "addps %%xmm7,%%xmm2 \n"
+    "movd %%xmm0,%1 \n"
+#if defined(__x86_64__)
     "and $0x0fffffff,%1 \n"
+#endif
     "movd (%0,%1,1),%%xmm0 \n"
     "sub $0x1,%4 \n"
     "movd %%xmm0,(%2) \n"
@@ -3285,11 +3315,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     "+r"(src_argb_stride_temp),  // %1
     "+r"(dst_argb),  // %2
     "+r"(uv_dudv),  // %3
-    "+rm"(width)  // %4
+    "+rm"(width),  // %4
+    "+r"(temp)  // %5
   :
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
 #endif
   );
 }
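Both the gcc loops above and the win loop below compute the source offsets the same way: the truncated x and y coordinates are packed into 16-bit lanes (packssdw), and pmaddwd multiplies them against the constant built at the top of the function, (stride << 16) | 4, broadcast to every lane, yielding x * 4 + y * stride per pixel in a single instruction. A scalar sketch of that arithmetic (the function name is illustrative):

#include <stdint.h>

// What one pmaddwd lane pair computes: the packed constant holds the 16-bit
// pair {4, stride} (built as (stride << 16) | 4), the coordinate register
// holds {x, y}, and pmaddwd multiplies lane-wise and sums each pair into a
// 32-bit result.
static int32_t AffineOffsetSketch(int16_t x, int16_t y, int16_t stride) {
  return (int32_t)x * 4 + (int32_t)y * stride;  // byte offset into src_argb
}

This is also why the stride is shifted into the high 16 bits (shl $0x10 / shl esi, 16) before being moved into the xmm constant: the two halves of each 32-bit lane then line up with the packed {x, y} shorts.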
...
@@ -3354,13 +3354,14 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   __asm {
     push esi
-    mov eax, [esp + 8]  // src_argb
-    mov esi, [esp + 12]  // stride
-    mov edx, [esp + 16]  // dst_argb
-    mov ecx, [esp + 20]  // pointer to uv_dudv
+    push edi
+    mov eax, [esp + 12]  // src_argb
+    mov esi, [esp + 16]  // stride
+    mov edx, [esp + 20]  // dst_argb
+    mov ecx, [esp + 24]  // pointer to uv_dudv
     movq xmm2, qword ptr [ecx]  // uv
     movq xmm7, qword ptr [ecx + 8]  // dudv
-    mov ecx, [esp + 24]  // width
+    mov ecx, [esp + 28]  // width
     shl esi, 16  // 4, stride
     add esi, 4
     movd xmm5, esi
@@ -3386,24 +3387,24 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     cvttps2dq xmm1, xmm3  // x, y float to int next 2
     packssdw xmm0, xmm1  // x, y as 8 shorts
     pmaddwd xmm0, xmm5  // offsets = x * 4 + y * stride.
-    addps xmm2, xmm4  // x, y += dx, dy first 2
-    addps xmm3, xmm4  // x, y += dx, dy next 2
     movd esi, xmm0
     pshufd xmm0, xmm0, 0x39  // shift right
-    movd xmm1, [eax + esi]  // read pixel 0
-    movd esi, xmm0
+    movd edi, xmm0
     pshufd xmm0, xmm0, 0x39  // shift right
-    movd xmm6, [eax + esi]  // read pixel 1
+    movd xmm1, [eax + esi]  // read pixel 0
+    movd xmm6, [eax + edi]  // read pixel 1
     punpckldq xmm1, xmm6  // combine pixel 0 and 1
+    addps xmm2, xmm4  // x, y += dx, dy first 2
+    movq qword ptr [edx], xmm1
     movd esi, xmm0
     pshufd xmm0, xmm0, 0x39  // shift right
+    movd edi, xmm0
     movd xmm6, [eax + esi]  // read pixel 2
-    movd esi, xmm0
-    movd xmm0, [eax + esi]  // read pixel 3
+    movd xmm0, [eax + edi]  // read pixel 3
     punpckldq xmm6, xmm0  // combine pixel 2 and 3
-    punpcklqdq xmm1, xmm6  // combine pixel 0, 1, 2 and 3
+    addps xmm3, xmm4  // x, y += dx, dy next 2
     sub ecx, 4
-    movdqu [edx], xmm1
+    movq qword ptr 8[edx], xmm6
     lea edx, [edx + 16]
     jge l4
@@ -3425,6 +3426,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     lea edx, [edx + 4]
     jge l1
   l1b:
+    pop edi
     pop esi
     ret
   }
...