Commit 5442018d authored by fbarchard@google.com

Improved polynomial for AVX2 using vpmovzxbd and removed movdqa.

BUG=265
TESTED=libyuvTest.TestARGBPolynomial
R=jingning@google.com, ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/2184005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@781 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6da76f3b
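For context: ARGBPolynomial pushes each BGRA channel through the same cubic, result = C0 + C1 * X + C2 * X * X + C3 * X * X * X. Below is a minimal scalar sketch of that per-channel evaluation, assuming the stride-4 layout of the poly argument that feeds the C0..C3 registers (ymm4..ymm7) in the asm below; the helper name is hypothetical.

// Scalar sketch of one channel of the cubic the row functions evaluate.
// Assumes C0 = poly[0..3], C1 = poly[4..7], C2 = poly[8..11], C3 = poly[12..15].
#include <stdint.h>

static uint8_t PolynomialChannel(uint8_t v, const float* poly, int ch) {
  float x = (float)v;
  float result = poly[ch] +                  // C0
                 poly[ch + 4] * x +          // C1 * X
                 poly[ch + 8] * x * x +      // C2 * X * X
                 poly[ch + 12] * x * x * x;  // C3 * X * X * X
  // Saturate to a byte, as vcvttps2dq + vpackusdw + vpackuswb do in the asm.
  if (result < 0.f) result = 0.f;
  if (result > 255.f) result = 255.f;
  return (uint8_t)result;
}

In libyuv this path is exercised end to end through the public ARGBPolynomial() entry point, which the TESTED= line's libyuvTest.TestARGBPolynomial covers.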
--- a/README.chromium
+++ b/README.chromium
@@ -1,5 +1,5 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 780
+Version: 781
 License: BSD
 License File: LICENSE
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 780
+#define LIBYUV_VERSION 781
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6787,6 +6787,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
     align      16
  convertloop:
+    // (slow) vpmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
     movd       xmm0, [eax]   // BGRA
     lea        eax, [eax + 4]
     punpcklbw  xmm0, xmm3
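The "(slow)" note added above records that the single-instruction load-and-widen was tried for the SSE2 path and measured slower there than the movd/punpck sequence against the zeroed xmm3. For comparison, the two widening strategies as intrinsics; a sketch, with hypothetical helper names, and note pmovzxbd requires SSE4.1.

#include <stdint.h>
#include <emmintrin.h>  // SSE2: movd, punpcklbw, punpcklwd
#include <smmintrin.h>  // SSE4.1: pmovzxbd

// SSE2 widening of one BGRA pixel: three ops against a zero register.
static __m128i WidenPixel_SSE2(const uint8_t* p) {
  const __m128i zero = _mm_setzero_si128();
  __m128i v = _mm_cvtsi32_si128(*(const int*)p);  // movd: load 4 bytes
  v = _mm_unpacklo_epi8(v, zero);                 // punpcklbw: bytes -> words
  return _mm_unpacklo_epi16(v, zero);             // punpcklwd: words -> dwords
}

// SSE4.1: pmovzxbd performs the same zero-extension in one instruction.
static __m128i WidenPixel_SSE41(const uint8_t* p) {
  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(const int*)p));
}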
@@ -6833,39 +6834,26 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 16]  /* width */
-    vpxor      ymm3, ymm3, ymm3  // 0 constant for zero extending bytes to ints.
     align      16
  convertloop:
-    vmovq      xmm0, qword ptr [eax]  // 2 BGRA pixels
+    vpmovzxbd  ymm0, qword ptr [eax]  // 2 BGRA pixels
     lea        eax, [eax + 8]
-    // vpmovzxbd  ymm0, ymm0
-    // TODO(fbarchard): Consider vex256 to avoid vpermq.
-    vpunpcklbw xmm0, xmm0, xmm3  // b0g0r0a0_b0g0r0a0_00000000_00000000
-    vpermq     ymm0, ymm0, 0xd8  // b0g0r0a0_00000000_b0g0r0a0_00000000
-    vpunpcklwd ymm0, ymm0, ymm3  // b000g000_r000a000_b000g000_r000a000
-    vcvtdq2ps  ymm0, ymm0        // 8 floats
-    vmovdqa    ymm1, ymm0        // X
-    vmulps     ymm0, ymm0, ymm5  // C1 * X
-    vaddps     ymm0, ymm0, ymm4  // result = C0 + C1 * X
-    vmovdqa    ymm2, ymm1
-    vmulps     ymm2, ymm2, ymm1  // X * X
-    vmulps     ymm1, ymm1, ymm2  // X * X * X
-    vmulps     ymm2, ymm2, ymm6  // C2 * X * X
-    vmulps     ymm1, ymm1, ymm7  // C3 * X * X * X
-    vaddps     ymm0, ymm0, ymm2  // result += C2 * X * X
-    vaddps     ymm0, ymm0, ymm1  // result += C3 * X * X * X
-    vcvttps2dq ymm0, ymm0
-    // vpmovzxdb ymm0, ymm0      // b000g000_r000a000_b000g000_r000a000
-    vpackusdw  ymm0, ymm0, ymm3  // b0g0r0a0_00000000_b0g0r0a0_00000000
-    vpermq     ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
-    vpackuswb  xmm0, xmm0, xmm3  // b0g0r0a0_b0g0r0a0_00000000_00000000
+    vcvtdq2ps  ymm0, ymm0        // X 8 floats
+    vmulps     ymm2, ymm0, ymm0  // X * X
+    vmulps     ymm3, ymm0, ymm7  // C3 * X
+    vmulps     ymm1, ymm0, ymm5  // C1 * X
+    vmulps     ymm3, ymm2, ymm3  // C3 * X * X * X
+    vmulps     ymm2, ymm2, ymm6  // C2 * X * X
+    vaddps     ymm1, ymm1, ymm4  // result = C0 + C1 * X
+    vaddps     ymm1, ymm1, ymm3  // result += C3 * X * X * X
+    vaddps     ymm1, ymm1, ymm2  // result += C2 * X * X
+    vcvttps2dq ymm1, ymm1
+    vpackusdw  ymm1, ymm1, ymm1  // b0g0r0a0_00000000_b0g0r0a0_00000000
+    vpermq     ymm1, ymm1, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
+    vpackuswb  xmm1, xmm1, xmm1  // bgrabgra_00000000_00000000_00000000
     sub        ecx, 2
-    vmovq      qword ptr [edx], xmm0
+    vmovq      qword ptr [edx], xmm1
     lea        edx, [edx + 8]
     jg         convertloop
     vzeroupper
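Two things fall out of the rewrite beyond the vpmovzxbd load: the AVX three-operand forms (e.g. vmulps ymm2, ymm0, ymm0) write to fresh destinations, so both vmovdqa register copies disappear, and X * X is computed once and reused by the C2 and C3 terms. A hedged intrinsics rendering of the new loop, assuming the coefficient layout above; the function name and even-width assumption mirror the asm, and this is a sketch, not libyuv's actual C path.

#include <stdint.h>
#include <immintrin.h>

static void ARGBPolynomialRow_AVX2_Sketch(const uint8_t* src_argb,
                                          uint8_t* dst_argb,
                                          const float* poly, int width) {
  // vbroadcastf128: splat each 4-float coefficient row across both lanes.
  const __m256 c0 = _mm256_broadcast_ps((const __m128*)(poly + 0));
  const __m256 c1 = _mm256_broadcast_ps((const __m128*)(poly + 4));
  const __m256 c2 = _mm256_broadcast_ps((const __m128*)(poly + 8));
  const __m256 c3 = _mm256_broadcast_ps((const __m128*)(poly + 12));
  for (int i = 0; i < width; i += 2) {  // assumes even width, like the asm
    // vpmovzxbd: zero-extend 8 bytes (2 BGRA pixels) straight to 8 ints.
    __m256i p = _mm256_cvtepu8_epi32(
        _mm_loadl_epi64((const __m128i*)(src_argb + i * 4)));
    __m256 x = _mm256_cvtepi32_ps(p);                    // X, 8 floats
    __m256 xx = _mm256_mul_ps(x, x);                     // X * X, computed once
    __m256 r = _mm256_add_ps(_mm256_mul_ps(x, c1), c0);  // C0 + C1 * X
    r = _mm256_add_ps(r, _mm256_mul_ps(xx, _mm256_mul_ps(x, c3)));  // + C3*X^3
    r = _mm256_add_ps(r, _mm256_mul_ps(xx, c2));         // + C2 * X * X
    __m256i ri = _mm256_cvttps_epi32(r);                 // truncate to ints
    ri = _mm256_packus_epi32(ri, ri);         // 32 -> 16 bit, unsigned saturate
    ri = _mm256_permute4x64_epi64(ri, 0xd8);  // both pixels into the low lane
    __m128i lo = _mm256_castsi256_si128(ri);
    _mm_storel_epi64((__m128i*)(dst_argb + i * 4),
                     _mm_packus_epi16(lo, lo));  // 16 -> 8 bit, store 2 pixels
  }
}

The hunk header's counts (39 lines down to 26) show the shrink: no zeroed ymm3 constant, no punpck widening, no register copies, and one shared X * X.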