Commit baafc97d authored by fbarchard@google.com's avatar fbarchard@google.com

port YToARGB AVX2 to GCC

BUG=393
TESTED=untested
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/39819004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1262 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f7e5b5e3
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1261
Version: 1262
License: BSD
License File: LICENSE
......
......@@ -203,6 +203,7 @@ extern "C" {
#define HAS_UYVYTOUV422ROW_AVX2
#define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2
#define HAS_YTOARGBROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
......@@ -217,7 +218,6 @@ extern "C" {
// The following require VS2012 and still need to be ported to GCC.
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_YTOARGBROW_AVX2
// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393
#define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGBROW_AVX2
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1261
#define LIBYUV_VERSION 1262
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -2292,9 +2292,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
#endif // HAS_I422TORGBAROW_AVX2
#ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* dst_argb,
int width) {
void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile (
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
"movd %%eax,%%xmm2 \n"
......@@ -2340,6 +2338,55 @@ void YToARGBRow_SSE2(const uint8* y_buf,
}
#endif // HAS_YTOARGBROW_SSE2
#ifdef HAS_YTOARGBROW_AVX2
// Converts 16 pixels of Y to 16 pixels of ARGB (64 bytes) per loop iteration.
// The scaled Y value is replicated into the three low bytes of each output
// pixel and the top byte is forced to 0xff.
// note: vpunpcklbw mutates and vpackuswb unmutates.
//
// y_buf:    source Y bytes; 16 bytes are read and the pointer advances by 16
//           on every iteration.
// dst_argb: destination; 64 bytes are written and the pointer advances by 64
//           on every iteration.
// width:    pixel count; reduced by 16 per iteration. The loop body runs
//           before the count is tested, so it always executes at least once
//           and always handles whole groups of 16 pixels.
void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    // ymm2 = multiplier, broadcast to every 16-bit lane.
    "mov        $0x4a354a35,%%eax              \n"  // 4a35 = 18997 = round(1.164 * 64 * 256)
    "vmovd      %%eax,%%xmm2                   \n"
    "vbroadcastss %%xmm2,%%ymm2                \n"
    // ymm3 = bias to subtract, broadcast to every 16-bit lane.
    "mov        $0x4880488,%%eax               \n"  // 0488 = 1160 = round(1.164 * 64 * 16)
    "vmovd      %%eax,%%xmm3                   \n"
    "vbroadcastss %%xmm3,%%ymm3                \n"
    // ymm4 = 0xff000000 in every dword: all ones shifted left by 24.
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpslld     $0x18,%%ymm4,%%ymm4            \n"

    LABELALIGN
  "1:                                          \n"
    // Step 1: Scale Y contribution to 16 G values.  G = (y - 16) * 1.164
    "vmovdqu    (%0),%%xmm0                    \n"  // load 16 Y bytes
    "lea        0x10(%0),%0                    \n"  // y_buf += 16
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // reorder qwords ahead of the in-lane unpack
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"  // duplicate each byte into a 16-bit word
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // high 16 bits of word * multiplier
    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"  // subtract bias, saturating at 0
    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"  // drop the 6 fraction bits
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"  // words -> bytes, saturating at 255

    // Step 2: Replicate each G byte four times and set the top byte to 0xff.
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"  // G -> GG
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"  // reorder qwords ahead of the in-lane unpacks
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"  // GG -> GGGG, first 8 pixels
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"  // GG -> GGGG, next 8 pixels
    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
    "vmovdqu    %%ymm0,(%1)                    \n"  // store pixels 0..7
    "vmovdqu    %%ymm1,0x20(%1)                \n"  // store pixels 8..15
    "lea        0x40(%1),%1                    \n"  // dst_argb += 64
    "sub        $0x10,%2                       \n"  // width -= 16
    "jg         1b                             \n"  // loop while width > 0
    "vzeroupper                                \n"
  : "+r"(y_buf),    // %0
    "+r"(dst_argb), // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif // HAS_YTOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {
......
......@@ -2354,14 +2354,14 @@ void YToARGBRow_AVX2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
vpslld ymm4, ymm4, 24
mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
vmovd xmm3, eax
vbroadcastss ymm3, xmm3
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
vmovd xmm2, eax
vbroadcastss ymm2, xmm2
mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
vmovd xmm3, eax
vbroadcastss ymm3, xmm3
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
vpslld ymm4, ymm4, 24
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment