Commit 2bbb64df authored by fbarchard@google.com

FMA3 version of Polynomial

BUG=265
TEST=cpuid and Polynomial unittest
R=changjun.yang@intel.com

Review URL: https://webrtc-codereview.appspot.com/2217004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@790 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 65d1ba6a
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 789
Version: 790
License: BSD
License File: LICENSE
......
......@@ -18,6 +18,7 @@ namespace libyuv {
extern "C" {
#endif
// TODO(fbarchard): Consider overlapping bits for different architectures.
// Internal flag to indicate cpuid requires initialization.
static const int kCpuInit = 0x1;
......@@ -35,11 +36,13 @@ static const int kCpuHasSSE42 = 0x100;
static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x1000;
static const int kCpuHasMIPS_DSP = 0x2000;
static const int kCpuHasMIPS_DSPR2 = 0x4000;
static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasMIPS_DSP = 0x20000;
static const int kCpuHasMIPS_DSPR2 = 0x40000;
// Internal function used to auto-init.
LIBYUV_API
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 789
#define LIBYUV_VERSION 790
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -179,6 +179,7 @@ int InitCpuFlags(void) {
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
#ifdef HAS_XGETBV
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
......@@ -212,6 +213,9 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
cpu_info_ &= ~kCpuHasERMS;
}
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info_ &= ~kCpuHasFMA3;
}
#elif defined(__mips__) && defined(__linux__)
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
......
......@@ -2057,7 +2057,8 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 2)) {
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
IS_ALIGNED(width, 2)) {
ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
}
#endif
......
......@@ -6834,10 +6834,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
int width) {
__asm {
mov eax, [esp + 12] /* poly */
vmovdqu xmm4, [eax]
vmovdqu xmm5, [eax + 16]
vmovdqu xmm6, [eax + 32]
vmovdqu xmm7, [eax + 48]
vmovdqu xmm4, [eax] // C0
vmovdqu xmm5, [eax + 16] // C1
vmovdqu xmm6, [eax + 32] // C2
vmovdqu xmm7, [eax + 48] // C3
vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords
vpermq ymm5, ymm5, 0x44
vpermq ymm6, ymm6, 0x44
......@@ -6855,18 +6855,15 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
vcvtdq2ps ymm0, ymm0 // X 8 floats
vmulps ymm2, ymm0, ymm0 // X * X
vmulps ymm3, ymm0, ymm7 // C3 * X
vmulps ymm1, ymm0, ymm5 // C1 * X
vmulps ymm3, ymm2, ymm3 // C3 * X * X * X
vmulps ymm2, ymm2, ymm6 // C2 * X * X
vaddps ymm1, ymm1, ymm4 // result = C0 + C1 * X
vaddps ymm1, ymm1, ymm3 // result += C3 * X * X * X
vaddps ymm1, ymm1, ymm2 // result += C2 * X * X
vcvttps2dq ymm1, ymm1
vpackusdw ymm1, ymm1, ymm1 // b0g0r0a0_00000000_b0g0r0a0_00000000
vpermq ymm1, ymm1, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
vpackuswb xmm1, xmm1, xmm1 // bgrabgra_00000000_00000000_00000000
vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
vcvttps2dq ymm0, ymm0
vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
sub ecx, 2
vmovq qword ptr [edx], xmm1
vmovq qword ptr [edx], xmm0
lea edx, [edx + 8]
jg convertloop
vzeroupper
......
......@@ -41,6 +41,8 @@ TEST_F(libyuvTest, TestCpuHas) {
printf("Has AVX2 %x\n", has_avx2);
int has_erms = TestCpuFlag(kCpuHasERMS);
printf("Has ERMS %x\n", has_erms);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
printf("Has FMA3 %x\n", has_fma3);
int has_mips = TestCpuFlag(kCpuHasMIPS);
printf("Has MIPS %x\n", has_mips);
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
......@@ -93,10 +95,8 @@ TEST_F(libyuvTest, TestCpuId) {
TEST_F(libyuvTest, TestLinuxNeon) {
int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
if (testdata) {
EXPECT_EQ(0,
ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
EXPECT_EQ(kCpuHasNEON,
ArmCpuCaps("unit_test/testdata/tegra3.txt"));
EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt"));
} else {
printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
}
......
......@@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) {
int has_avx = TestCpuFlag(kCpuHasAVX);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
printf("Has SSE2 %x\n", has_sse2);
printf("Has SSSE3 %x\n", has_ssse3);
printf("Has SSE4.1 %x\n", has_sse41);
......@@ -86,6 +87,7 @@ int main(int argc, const char* argv[]) {
printf("Has AVX %x\n", has_avx);
printf("Has AVX2 %x\n", has_avx2);
printf("Has ERMS %x\n", has_erms);
printf("Has FMA3 %x\n", has_fma3);
}
return 0;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment