Commit c3c06ec3 authored by fbarchard@google.com

ARGBPolynomialRow_SSE2: process 2 pixels at a time.

BUG=265
TEST=*Poly*
R=changjun.yang@intel.com

Review URL: https://webrtc-codereview.appspot.com/2195004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@782 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 5442018d
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 781 Version: 782
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 781 #define LIBYUV_VERSION 782
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -2052,7 +2052,7 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, ...@@ -2052,7 +2052,7 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, const float* poly, uint8* dst_argb, const float* poly,
int width) = ARGBPolynomialRow_C; int width) = ARGBPolynomialRow_C;
#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) #if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
ARGBPolynomialRow = ARGBPolynomialRow_SSE2; ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
} }
#endif #endif
......
...@@ -6774,42 +6774,53 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, ...@@ -6774,42 +6774,53 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
uint8* dst_argb, const float* poly, uint8* dst_argb, const float* poly,
int width) { int width) {
__asm { __asm {
mov eax, [esp + 12] /* poly */ push esi
movdqu xmm4, [eax] mov eax, [esp + 4 + 4] /* src_argb */
movdqu xmm5, [eax + 16] mov edx, [esp + 4 + 8] /* dst_argb */
movdqu xmm6, [eax + 32] mov esi, [esp + 4 + 12] /* poly */
movdqu xmm7, [eax + 48] mov ecx, [esp + 4 + 16] /* width */
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_argb */
mov ecx, [esp + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
align 16 align 16
convertloop: convertloop:
// (slow) vpmovzxbd xmm0, dword ptr [eax] // BGRA pixel // (slow) pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
movd xmm0, [eax] // BGRA movq xmm0, qword ptr [eax] // BGRABGRA
lea eax, [eax + 4] lea eax, [eax + 8]
punpcklbw xmm0, xmm3 punpcklbw xmm0, xmm3
punpcklwd xmm0, xmm3 movdqa xmm4, xmm0
punpcklwd xmm0, xmm3 // pixel 0
punpckhwd xmm4, xmm3 // pixel 1
cvtdq2ps xmm0, xmm0 // 4 floats cvtdq2ps xmm0, xmm0 // 4 floats
cvtdq2ps xmm4, xmm4
movdqa xmm1, xmm0 // X movdqa xmm1, xmm0 // X
mulps xmm0, xmm5 // C1 * X movdqa xmm5, xmm4
addps xmm0, xmm4 // result = C0 + C1 * X mulps xmm0, [esi + 16] // C1 * X
mulps xmm4, [esi + 16]
addps xmm0, [esi] // result = C0 + C1 * X
addps xmm4, [esi]
movdqa xmm2, xmm1 movdqa xmm2, xmm1
movdqa xmm6, xmm5
mulps xmm2, xmm1 // X * X mulps xmm2, xmm1 // X * X
mulps xmm6, xmm5
mulps xmm1, xmm2 // X * X * X mulps xmm1, xmm2 // X * X * X
mulps xmm2, xmm6 // C2 * X * X mulps xmm5, xmm6
mulps xmm1, xmm7 // C3 * X * X * X mulps xmm2, [esi + 32] // C2 * X * X
mulps xmm6, [esi + 32]
mulps xmm1, [esi + 48] // C3 * X * X * X
mulps xmm5, [esi + 48]
addps xmm0, xmm2 // result += C2 * X * X addps xmm0, xmm2 // result += C2 * X * X
addps xmm4, xmm6
addps xmm0, xmm1 // result += C3 * X * X * X addps xmm0, xmm1 // result += C3 * X * X * X
addps xmm4, xmm5
cvttps2dq xmm0, xmm0 cvttps2dq xmm0, xmm0
cvttps2dq xmm4, xmm4
packuswb xmm0, xmm4
packuswb xmm0, xmm0 packuswb xmm0, xmm0
packuswb xmm0, xmm0 sub ecx, 2
sub ecx, 1 movq qword ptr [edx], xmm0
movd [edx], xmm0 lea edx, [edx + 8]
lea edx, [edx + 4]
jg convertloop jg convertloop
pop esi
ret ret
} }
} }
......
...@@ -1660,7 +1660,7 @@ TEST_F(libyuvTest, TestARGBPolynomial) { ...@@ -1660,7 +1660,7 @@ TEST_F(libyuvTest, TestARGBPolynomial) {
SIMD_ALIGNED(uint8 orig_pixels[1280][4]); SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
SIMD_ALIGNED(uint8 dst_pixels[1280][4]); SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
static const float kWarmifyPolynomial[16] = { SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
0.94230f, -3.03300f, -2.92500f, 0.f, // C0 0.94230f, -3.03300f, -2.92500f, 0.f, // C0
0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment