Commit 3cb6071c authored by fbarchard@google.com's avatar fbarchard@google.com

Port Polynomial AVX2 code to GCC/NaCL

BUG=269
TESTED=untested
R=johannkoenig@google.com

Review URL: https://webrtc-codereview.appspot.com/2262004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@795 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent afd1d6b4
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 794
Version: 795
License: BSD
License File: LICENSE
......
......@@ -90,7 +90,6 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2
#endif
// The following are available on all x86 platforms except NaCL x64:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
......@@ -142,6 +141,16 @@ extern "C" {
#define HAS_RGBCOLORTABLEROW_X86
#endif
// The following are available on all x86 platforms, including NaCL, but
// require VS2012, llvm or NaCL.
// Caveat: llvm 3.1 required, but does not provide a version.
// The llvm and NaCL branches are additionally restricted to x86 targets:
// both toolchains also build for other architectures, where the AVX2 row
// functions must not be advertised.
#if !defined(LIBYUV_DISABLE_X86) && \
((defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700) || \
((defined(__x86_64__) || defined(__i386__)) && \
(defined(__native_client__) || defined(__llvm__))))
// Effects:
#define HAS_ARGBPOLYNOMIALROW_AVX2
#endif
// The following are Windows only:
// TODO(fbarchard): Port to gcc.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
......@@ -173,7 +182,6 @@ extern "C" {
#define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBMIRRORROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBPOLYNOMIALROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#endif // _MSC_VER >= 1700
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 794
#define LIBYUV_VERSION 795
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -5875,6 +5875,52 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// Evaluates a cubic polynomial on every channel of a row of ARGB pixels:
//   result = C0 + C1 * X + C2 * X * X + C3 * X * X * X
// where X is the channel byte converted to float.
//
// src_argb: source pixels, read 8 bytes (2 pixels) per iteration.
// dst_argb: destination pixels, written 8 bytes (2 pixels) per iteration.
// poly:     16 floats read as four 16-byte groups at byte offsets 0x00, 0x10,
//           0x20 and 0x30, used as C0, C1, C2 and C3 respectively. Each group
//           holds one coefficient per channel of a pixel.
// width:    pixel count. The loop body runs before the count is tested and
//           steps by 2, so at least 2 pixels are always processed and an odd
//           width touches one pixel past it.
//           NOTE(review): presumably callers pass a positive even width -
//           confirm against the dispatch code.
//
// MEMACCESS, MEMACCESS2 and MEMLEA are addressing macros defined elsewhere in
// this file. They are separated from the adjacent string literals by spaces
// so that C++11 does not parse them as user-defined literal suffixes.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    // Load the four coefficient groups, then copy the low 128 bits of each
    // into the high 128 bits so both pixels of an iteration share them.
    "vmovdqu " MEMACCESS(3) ",%%xmm4 \n"         // C0
    "vmovdqu " MEMACCESS2(0x10,3) ",%%xmm5 \n"   // C1
    "vmovdqu " MEMACCESS2(0x20,3) ",%%xmm6 \n"   // C2
    "vmovdqu " MEMACCESS2(0x30,3) ",%%xmm7 \n"   // C3
    "vpermq $0x44,%%ymm4,%%ymm4 \n"
    "vpermq $0x44,%%ymm5,%%ymm5 \n"
    "vpermq $0x44,%%ymm6,%%ymm6 \n"
    "vpermq $0x44,%%ymm7,%%ymm7 \n"

    // 2 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n"  // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"  // floats to int32, truncating
    // Narrow int32 -> 16 bit -> 8 bit with the unsigned-saturating packs.
    // vpackusdw works per 128-bit lane, so vpermq gathers both pixels into
    // the low 128 bits before the final pack.
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "sub $0x2,%2 \n"
    // VEX-encoded store: a legacy-SSE movq here would execute while the upper
    // ymm halves are in use and incur an AVX/SSE transition stall on every
    // iteration. Neither the store nor the lea changes the flags set by sub.
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
#endif
  );
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment