Commit 845e94d1 authored by fbarchard@google.com

Affine: process 4 pixels at a time.

BUG=none
TEST=affine unittest
Review URL: https://webrtc-codereview.appspot.com/729005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@319 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 749950d7
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 318 Version: 319
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 318 #define LIBYUV_VERSION 319
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -3359,53 +3359,66 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, ...@@ -3359,53 +3359,66 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
mov edx, [esp + 16] // dst_argb mov edx, [esp + 16] // dst_argb
mov ecx, [esp + 20] // pointer to uv_dudv mov ecx, [esp + 20] // pointer to uv_dudv
movq xmm2, qword ptr [ecx] // uv movq xmm2, qword ptr [ecx] // uv
movq xmm3, qword ptr [ecx + 8] // dudv movq xmm7, qword ptr [ecx + 8] // dudv
mov ecx, [esp + 24] // width mov ecx, [esp + 24] // width
shl esi, 16 // 4, stride shl esi, 16 // 4, stride
add esi, 4 add esi, 4
movd xmm4, esi movd xmm5, esi
sub ecx, 2 sub ecx, 4
jl l2b jl l4b
// setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1 movdqa xmm0, xmm2 // x0, y0, x1, y1
addps xmm0, xmm3 addps xmm0, xmm7
movlhps xmm2, xmm0 movlhps xmm2, xmm0
pshufd xmm4, xmm4, 0 // dup 4, stride movdqa xmm4, xmm7
movlhps xmm3, xmm3 // dudv addps xmm4, xmm4 // dudv *= 2
addps xmm3, xmm3 // dudv *= 2 movdqa xmm3, xmm2 // x2, y2, x3, y3
pshufd xmm4, xmm4, 0 addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
// 2 pixel loop // 4 pixel loop
align 4 align 4
l2: l4:
cvttps2dq xmm1, xmm2 // x, y float to int cvttps2dq xmm0, xmm2 // x, y float to int first 2
packssdw xmm1, xmm1 // x, y as shorts cvttps2dq xmm1, xmm3 // x, y float to int next 2
pmaddwd xmm1, xmm4 // offset = x * 4 + y * stride packssdw xmm0, xmm1 // x, y as 8 shorts
addps xmm2, xmm3 // x, y += dx, dy pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
movd esi, xmm1 addps xmm2, xmm4 // x, y += dx, dy first 2
movdqa xmm5, xmm1 addps xmm3, xmm4 // x, y += dx, dy next 2
pshufd xmm5, xmm5, 0x55 movd esi, xmm0
movd xmm0, [eax + esi] // read pixel 0 pshufd xmm0, xmm0, 0x39 // shift right
movd esi, xmm5 movd xmm1, [eax + esi] // read pixel 0
movd xmm5, [eax + esi] // read pixel 1 movd esi, xmm0
punpckldq xmm0, xmm5 pshufd xmm0, xmm0, 0x39 // shift right
sub ecx, 2 movd xmm6, [eax + esi] // read pixel 1
movq qword ptr [edx], xmm0 punpckldq xmm1, xmm6 // combine pixel 0 and 1
lea edx, [edx + 8] movd esi, xmm0
jge l2 pshufd xmm0, xmm0, 0x39 // shift right
movd xmm6, [eax + esi] // read pixel 2
movd esi, xmm0
movd xmm0, [eax + esi] // read pixel 3
punpckldq xmm6, xmm0 // combine pixel 2 and 3
punpcklqdq xmm1, xmm6 // combine pixel 0, 1, 2 and 3
sub ecx, 4
movdqu [edx], xmm1
lea edx, [edx + 16]
jge l4
l2b: l4b:
add ecx, 2 - 1 add ecx, 4 - 1
jl l1b jl l1b
// 1 pixel loop // 1 pixel loop
align 4 align 4
l1: l1:
cvttps2dq xmm1, xmm2 // x, y float to int cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm1, xmm1 // x, y as shorts packssdw xmm0, xmm0 // x, y as shorts
pmaddwd xmm1, xmm4 // offset = x * 4 + y * stride pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
addps xmm2, xmm3 // x, y += dx, dy addps xmm2, xmm7 // x, y += dx, dy
movd esi, xmm1 movd esi, xmm0
movd xmm0, [eax + esi] // copy a pixel movd xmm0, [eax + esi] // copy a pixel
sub ecx, 1 sub ecx, 1
movd [edx], xmm0 movd [edx], xmm0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment