Commit 191ab180 authored by fbarchard@google.com

Use fixed point for small blurs

BUG=none
TEST=libyuvTest.ARGBBlurSmall_Opt
R=ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/3389004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@843 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 5daa25f9
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 842
Version: 843
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 842
#define LIBYUV_VERSION 843
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -5707,7 +5707,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
pcmpeqb xmm5, xmm5 // alpha 255
pcmpeqb xmm5, xmm5 // alpha 255
align 16
convertloop:
......@@ -5772,6 +5772,60 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
sub ecx, 4
jl l4b
cmp area, 128 // 128 pixels will not overflow 15 bits.
ja l4
pcmpeqb xmm5, xmm5 // constant of 65536.0
psrld xmm5, 31
pslld xmm5, 16
cvtdq2ps xmm5, xmm5
mulps xmm5, xmm4 // 65536.0 * 1 / area
cvtps2dq xmm5, xmm5 // 0.16 fixed point
packssdw xmm5, xmm5
// 4 pixel loop small blocks.
align 4
s4:
// top left
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
psubd xmm1, [eax + edx * 4 + 16]
psubd xmm2, [eax + edx * 4 + 32]
psubd xmm3, [eax + edx * 4 + 48]
lea eax, [eax + 64]
// - bottom left
psubd xmm0, [esi]
psubd xmm1, [esi + 16]
psubd xmm2, [esi + 32]
psubd xmm3, [esi + 48]
// + bottom right
paddd xmm0, [esi + edx * 4]
paddd xmm1, [esi + edx * 4 + 16]
paddd xmm2, [esi + edx * 4 + 32]
paddd xmm3, [esi + edx * 4 + 48]
lea esi, [esi + 64]
packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
packssdw xmm2, xmm3
pmulhuw xmm0, xmm5
pmulhuw xmm2, xmm5
packuswb xmm0, xmm2
movdqu [edi], xmm0
lea edi, [edi + 16]
sub ecx, 4
jge s4
jmp l4b
// 4 pixel loop
align 4
l4:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment