Commit e3cc7694 authored by fbarchard@google.com

4 pixel version of affine for gcc and aligned version of win.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/714007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@320 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 845e94d1
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 319
+Version: 320
 License: BSD
 License File: LICENSE
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 319
+#define LIBYUV_VERSION 320
 #endif  // INCLUDE_LIBYUV_VERSION_H_ NOLINT
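For context: ARGBAffineRow copies one destination row by walking the source image along an affine gradient. uv_dudv[0..1] hold the starting texel coordinate (u, v) for the row and uv_dudv[2..3] the per-pixel step (du, dv); each destination pixel is fetched from src_argb + (int)v * stride + (int)u * 4. A minimal scalar sketch of that contract (an illustration in the spirit of libyuv's C fallback, not code from this commit; uint8_t stands in for libyuv's uint8 typedef):

    #include <stdint.h>

    // Scalar reference for the row the SSE2 code below vectorizes.
    // uv_dudv[0..1] = start (u, v); uv_dudv[2..3] = per-pixel (du, dv).
    static void ARGBAffineRow_Ref(const uint8_t* src_argb, int src_argb_stride,
                                  uint8_t* dst_argb, const float* uv_dudv,
                                  int width) {
      float u = uv_dudv[0];
      float v = uv_dudv[1];
      int i;
      for (i = 0; i < width; ++i) {
        int x = (int)u;  // truncate toward zero, like cvttps2dq
        int y = (int)v;
        const uint8_t* src = src_argb + y * src_argb_stride + x * 4;
        dst_argb[0] = src[0];  // copy one ARGB pixel (4 bytes)
        dst_argb[1] = src[1];
        dst_argb[2] = src[2];
        dst_argb[3] = src[3];
        dst_argb += 4;
        u += uv_dudv[2];  // advance along the slope
        v += uv_dudv[3];
      }
    }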
@@ -3220,61 +3220,91 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #endif  // HAS_ARGBSHADE_SSE2
 #ifdef HAS_ARGBAFFINEROW_SSE2
+// TODO(fbarchard): Find 64 bit way to avoid masking.
+// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
 // Copy ARGB pixels from source image with slope to a row of destination.
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
+  intptr_t temp = 0;
   asm volatile (
     "movq (%3),%%xmm2 \n"
-    "movq 0x8(%3),%%xmm3 \n"
+    "movq 0x8(%3),%%xmm7 \n"
     "shl $0x10,%1 \n"
     "add $0x4,%1 \n"
-    "movd %1,%%xmm4 \n"
-    "xor %1,%1 \n"  // cleanse upper bits.
-    "sub $0x2,%4 \n"
-    "jl 29f \n"
+    "movd %1,%%xmm5 \n"
+    "sub $0x4,%4 \n"
+    "jl 49f \n"
+    "pshufd $0x44,%%xmm7,%%xmm7 \n"
+    "pshufd $0x0,%%xmm5,%%xmm5 \n"
     "movdqa %%xmm2,%%xmm0 \n"
-    "addps %%xmm3,%%xmm0 \n"
+    "addps %%xmm7,%%xmm0 \n"
     "movlhps %%xmm0,%%xmm2 \n"
-    "pshufd $0x0,%%xmm4,%%xmm4 \n"
-    "movlhps %%xmm3,%%xmm3 \n"
-    "addps %%xmm3,%%xmm3 \n"
+    "movdqa %%xmm7,%%xmm4 \n"
+    "addps %%xmm4,%%xmm4 \n"
+    "movdqa %%xmm2,%%xmm3 \n"
+    "addps %%xmm4,%%xmm3 \n"
+    "addps %%xmm4,%%xmm4 \n"
-  // 2 pixel loop \n"
-    ".p2align 2 \n"
-  "20: \n"
-    "cvttps2dq %%xmm2,%%xmm1 \n"
-    "packssdw %%xmm1,%%xmm1 \n"
-    "pmaddwd %%xmm4,%%xmm1 \n"
-    "addps %%xmm3,%%xmm2 \n"
-    "movd %%xmm1,%1 \n"
+  // 4 pixel loop \n"
+    ".p2align 4 \n"
+  "40: \n"
+    "cvttps2dq %%xmm2,%%xmm0 \n"
+    "cvttps2dq %%xmm3,%%xmm1 \n"
+    "packssdw %%xmm1,%%xmm0 \n"
+    "pmaddwd %%xmm5,%%xmm0 \n"
+#if defined(__x86_64__)
+    "movq %%xmm0,%1 \n"
+    "mov %1,%5 \n"
+    "and $0x0fffffff,%1 \n"
-    "movdqa %%xmm1,%%xmm5 \n"
-    "pshufd $0x55,%%xmm5,%%xmm5 \n"
-    "movd (%0,%1,1),%%xmm0 \n"
-    "movd %%xmm5,%1 \n"
+    "shr $32,%5 \n"
+    "pshufd $0xEE,%%xmm0,%%xmm0 \n"
+#else
+    "movd %%xmm0,%1 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+    "movd %%xmm0,%5 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+#endif
+    "movd (%0,%1,1),%%xmm1 \n"
+    "movd (%0,%5,1),%%xmm6 \n"
+    "punpckldq %%xmm6,%%xmm1 \n"
+    "addps %%xmm4,%%xmm2 \n"
+    "movq %%xmm1,(%2) \n"
+#if defined(__x86_64__)
+    "movq %%xmm0,%1 \n"
+    "mov %1,%5 \n"
+    "and $0x0fffffff,%1 \n"
-    "movd (%0,%1,1),%%xmm5 \n"
-    "punpckldq %%xmm5,%%xmm0 \n"
-    "sub $0x2,%4 \n"
-    "movq %%xmm0,(%2) \n"
-    "lea 0x8(%2),%2 \n"
-    "jge 20b \n"
-  "29: \n"
-    "add $0x1,%4 \n"
+    "shr $32,%5 \n"
+#else
+    "movd %%xmm0,%1 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+    "movd %%xmm0,%5 \n"
+#endif
+    "movd (%0,%1,1),%%xmm0 \n"
+    "movd (%0,%5,1),%%xmm6 \n"
+    "punpckldq %%xmm6,%%xmm0 \n"
+    "addps %%xmm4,%%xmm3 \n"
+    "sub $0x4,%4 \n"
+    "movq %%xmm0,0x08(%2) \n"
+    "lea 0x10(%2),%2 \n"
+    "jge 40b \n"
+  "49: \n"
+    "add $0x3,%4 \n"
     "jl 19f \n"
   // 1 pixel loop \n"
-    ".p2align 2 \n"
+    ".p2align 4 \n"
   "10: \n"
-    "cvttps2dq %%xmm2,%%xmm1 \n"
-    "packssdw %%xmm1,%%xmm1 \n"
-    "pmaddwd %%xmm4,%%xmm1 \n"
-    "addps %%xmm3,%%xmm2 \n"
-    "movd %%xmm1,%1 \n"
+    "cvttps2dq %%xmm2,%%xmm0 \n"
+    "packssdw %%xmm0,%%xmm0 \n"
+    "pmaddwd %%xmm5,%%xmm0 \n"
+    "addps %%xmm7,%%xmm2 \n"
+    "movd %%xmm0,%1 \n"
+#if defined(__x86_64__)
+    "and $0x0fffffff,%1 \n"
+#endif
     "movd (%0,%1,1),%%xmm0 \n"
     "sub $0x1,%4 \n"
     "movd %%xmm0,(%2) \n"
@@ -3285,11 +3315,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     "+r"(src_argb_stride_temp),  // %1
     "+r"(dst_argb),  // %2
     "+r"(uv_dudv),  // %3
-    "+rm"(width)  // %4
+    "+rm"(width),  // %4
+    "+r"(temp)  // %5
   :
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
 #endif
   );
 }
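Both the 2 and 4 pixel variants above lean on the same fixed-point trick: a general register is loaded with (stride << 16) + 4 and splatted, giving four (4, stride) pairs of shorts, so once cvttps2dq truncates the float coordinates, one packssdw plus one pmaddwd produces x * 4 + y * stride for several pixels at once. That is also what the masking TODO is about: the 16x16 bit multiply only works while x and y fit in a signed short, and on 64 bit the packed dword offsets must be masked before use as addresses. An SSE2-intrinsics rendering of that offset computation (the helper name is mine, for illustration only):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    // Byte offsets x * 4 + y * stride for 4 (x, y) float pairs, computed the
    // way the asm does it: truncate, pack to 8 shorts, multiply-accumulate
    // against short pairs (4, stride) built from (stride << 16) + 4.
    static inline __m128i AffineOffsets4(__m128 xy01, __m128 xy23, int stride) {
      __m128i mul = _mm_set1_epi32((stride << 16) + 4);  // shorts: 4, stride
      __m128i i01 = _mm_cvttps_epi32(xy01);    // x0 y0 x1 y1 as int32
      __m128i i23 = _mm_cvttps_epi32(xy23);    // x2 y2 x3 y3 as int32
      __m128i xy = _mm_packs_epi32(i01, i23);  // x0 y0 ... x3 y3 as int16
      // Each dword lane: x * 4 + y * stride. Only valid while x and y fit in
      // int16 -- hence the "find 64 bit way to avoid masking" TODO above.
      return _mm_madd_epi16(xy, mul);
    }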
@@ -3354,13 +3354,14 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 8]   // src_argb
-    mov        esi, [esp + 12]  // stride
-    mov        edx, [esp + 16]  // dst_argb
-    mov        ecx, [esp + 20]  // pointer to uv_dudv
+    push       edi
+    mov        eax, [esp + 12]  // src_argb
+    mov        esi, [esp + 16]  // stride
+    mov        edx, [esp + 20]  // dst_argb
+    mov        ecx, [esp + 24]  // pointer to uv_dudv
     movq       xmm2, qword ptr [ecx]      // uv
     movq       xmm7, qword ptr [ecx + 8]  // dudv
-    mov        ecx, [esp + 24]  // width
+    mov        ecx, [esp + 28]  // width
     shl        esi, 16          // 4, stride
     add        esi, 4
     movd       xmm5, esi
@@ -3386,24 +3387,24 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     cvttps2dq  xmm1, xmm3  // x, y float to int next 2
     packssdw   xmm0, xmm1  // x, y as 8 shorts
     pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
-    addps      xmm2, xmm4  // x, y += dx, dy first 2
-    addps      xmm3, xmm4  // x, y += dx, dy next 2
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
-    movd       xmm1, [eax + esi]  // read pixel 0
-    movd       esi, xmm0
+    movd       edi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
-    movd       xmm6, [eax + esi]  // read pixel 1
+    movd       xmm1, [eax + esi]  // read pixel 0
+    movd       xmm6, [eax + edi]  // read pixel 1
     punpckldq  xmm1, xmm6  // combine pixel 0 and 1
-    movq       qword ptr [edx], xmm1
+    addps      xmm2, xmm4  // x, y += dx, dy first 2
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       edi, xmm0
     movd       xmm6, [eax + esi]  // read pixel 2
-    movd       esi, xmm0
-    movd       xmm0, [eax + esi]  // read pixel 3
+    movd       xmm0, [eax + edi]  // read pixel 3
     punpckldq  xmm6, xmm0  // combine pixel 2 and 3
+    punpcklqdq xmm1, xmm6  // combine pixel 0, 1, 2 and 3
+    addps      xmm3, xmm4  // x, y += dx, dy next 2
     sub        ecx, 4
-    movq       qword ptr 8[edx], xmm6
+    movdqu     [edx], xmm1
     lea        edx, [edx + 16]
     jge        l4
@@ -3425,6 +3426,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     lea        edx, [edx + 4]
     jge        l1
  l1b:
+    pop        edi
     pop        esi
     ret
   }
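Note that each call renders exactly one destination row and performs no clipping, so a caller must guarantee every sampled (u, v) lies inside the source image. A hypothetical driver, shown only to illustrate how uv_dudv is fed per row (the function-pointer type, the float[6] layout, and AffineBlit are assumptions of this sketch, not libyuv API):

    #include <stdint.h>

    // Row function signature matching ARGBAffineRow_SSE2 above
    // (uint8_t standing in for libyuv's uint8 typedef).
    typedef void (*ARGBAffineRowFn)(const uint8_t* src_argb,
                                    int src_argb_stride, uint8_t* dst_argb,
                                    const float* uv_dudv, int width);

    // dst(x, y) samples src at u = m[0] + x * m[2] + y * m[4],
    //                          v = m[1] + x * m[3] + y * m[5].
    static void AffineBlit(ARGBAffineRowFn row,
                           const uint8_t* src_argb, int src_stride,
                           uint8_t* dst_argb, int dst_stride,
                           int dst_width, int dst_height, const float m[6]) {
      float uv_dudv[4];
      uv_dudv[2] = m[2];  // du per destination pixel
      uv_dudv[3] = m[3];  // dv per destination pixel
      for (int y = 0; y < dst_height; ++y) {
        uv_dudv[0] = m[0] + y * m[4];  // row start u
        uv_dudv[1] = m[1] + y * m[5];  // row start v
        row(src_argb, src_stride, dst_argb + y * dst_stride, uv_dudv,
            dst_width);
      }
    }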