Commit 3982998c authored by fbarchard@google.com's avatar fbarchard@google.com

YToARGB AVX2 port from SSE2

BUG=393
TESTED=YToARGB unittest
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/41679004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1258 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 0494ffee
......@@ -220,6 +220,11 @@ extern "C" {
#define HAS_ARGBUNATTENUATEROW_AVX2
#endif
// The following require VS2012 (Visual C with AVX2 support):
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_YTOARGBROW_AVX2
#endif
// The following are Yasm x86 only:
// TODO(fbarchard): Port AVX2 to inline.
#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
......@@ -980,9 +985,6 @@ void I422ToRGB565Row_C(const uint8* src_y,
const uint8* src_v,
uint8* dst_rgb565,
int width);
void YToARGBRow_C(const uint8* src_y,
uint8* dst_argb,
int width);
void I422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -1182,15 +1184,25 @@ void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
// Row functions that convert `width` pixels of Y (src_y) to ARGB (dst_argb).
// One prototype per implementation: portable C, SSE2, AVX2 and NEON.
// The _Any_ variants are the ones the dispatcher picks first; it switches to
// the plain SIMD variant only when width is a multiple of the SIMD step
// (16 pixels for AVX2).
// NOTE(review): the _Any_ wrappers are generated by the RGBANY macro, whose
// definition is not visible here - presumably they use the C row for the
// leftover pixels; confirm against the macro.
void YToARGBRow_C(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_AVX2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_AVX2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
......
......@@ -276,6 +276,14 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_YTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
YToARGBRow = YToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
YToARGBRow = YToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_YTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
YToARGBRow = YToARGBRow_Any_NEON;
......
......@@ -169,6 +169,10 @@ RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C,
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
7, 1, 4)
#endif
#if defined(HAS_YTOARGBROW_AVX2)
RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C,
15, 1, 4)
#endif
#if defined(HAS_YUY2TOARGBROW_SSSE3)
RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C,
15, 2, 4)
......
......@@ -2299,8 +2299,8 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_SSSE3
// TODO(fbarchard): Remove shift by 6.
#ifdef HAS_YTOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
......@@ -2341,12 +2341,62 @@ void YToARGBRow_SSE2(const uint8* y_buf,
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
ret
}
}
#endif // HAS_YTOARGBROW_SSE2
#ifdef HAS_YTOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes) per loop iteration.
//
// y_buf:   source Y bytes, read 16 at a time with unaligned loads.
// rgb_buf: destination, written 64 bytes at a time with unaligned stores.
//          Each output pixel is a 32-bit value whose three low bytes hold the
//          same scaled luma value and whose top byte is 0xff.
// width:   pixel count. The loop counts down in steps of 16 and tests after
//          the body, so it always runs at least once and processes whole
//          groups of 16; callers should pass a positive multiple of 16
//          (the dispatcher only selects this function when width is 16-aligned).
//
// Naked function: there is no prologue/epilogue, so the arguments are read
// directly from the stack at [esp + 4], [esp + 8] and [esp + 12].
// Uses eax, ecx, edx and ymm0-ymm4.
__declspec(naked) __declspec(align(16))
void YToARGBRow_AVX2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
// ymm4 = 0xff000000 in every dword: all ones, then shifted left by 24.
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
vpslld ymm4, ymm4, 24
// ymm3 = 0x04ad in every 16-bit word: the bias subtracted after scaling.
// NOTE(review): 0x04ad = 1197, while 1.164 * 64 * 16 is ~1192, so the
// "round(...)" description is not exact - confirm the intended bias.
mov eax, 0x04ad04ad // 04ad = 1197; nominally 1.164 * 64 * 16
vmovd xmm3, eax
vbroadcastss ymm3, xmm3
// ymm2 = 0x4a7f in every 16-bit word: the luma scale factor.
mov eax, 0x4a7f4a7f // 4a7f = 19071 = round(1.164 * 64 * 256)
vmovd xmm2, eax
vbroadcastss ymm2, xmm2
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
convertloop:
// Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
vmovdqu xmm0, [eax] // load 16 Y bytes; upper 128 bits of ymm0 are zeroed
lea eax, [eax + 16]
// Swap the two middle qwords so Y0..7 sit in the low half of the low lane
// and Y8..15 in the low half of the high lane (unpack/pack work per lane).
vpermq ymm0, ymm0, 0xd8
vpunpcklbw ymm0, ymm0, ymm0 // Y.Y: each byte duplicated into a word (y * 0x0101)
vpmulhuw ymm0, ymm0, ymm2 // keep high 16 bits of unsigned word * 0x4a7f
vpsubusw ymm0, ymm0, ymm3 // subtract bias, saturating at 0
vpsrlw ymm0, ymm0, 6 // drop the 6 fraction bits
vpackuswb ymm0, ymm0, ymm0 // G bytes (saturated); each lane holds its 8 values twice
// TODO(fbarchard): Weave alpha with unpack.
// Step 2: Weave into ARGB
vpunpcklbw ymm1, ymm0, ymm0 // GG words: G0..7 in low lane, G8..15 in high lane
// Swap the middle qwords again so the in-lane word unpacks below emit
// pixels in order: low halves hold G0..3 | G4..7, high halves G8..11 | G12..15.
vpermq ymm1, ymm1, 0xd8
vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels (0..7)
vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels (8..15)
vpor ymm0, ymm0, ymm4 // set top byte of every pixel to 0xff
vpor ymm1, ymm1, ymm4
vmovdqu [edx], ymm0 // store pixels 0..7 (32 bytes)
vmovdqu [edx + 32], ymm1 // store pixels 8..15 (32 bytes)
lea edx, [edx + 64]
sub ecx, 16 // 16 pixels consumed
jg convertloop // continue while pixels remain
// Clear the upper halves of the ymm registers before returning to avoid
// AVX/SSE transition penalties in the caller.
vzeroupper
ret
}
}
#endif // HAS_YTOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
......
......@@ -1255,11 +1255,13 @@ TEST_F(libyuvTest, TestYToARGB) {
YToARGB(y, 0, argb, 0, 32, 1);
for (int i = 0; i < 32; ++i) {
printf("%d: %d <-> %d,%d,%d,%d\n", y[i], expectedg[i],
printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
argb[i * 4 + 0],
argb[i * 4 + 1],
argb[i * 4 + 2],
argb[i * 4 + 3]);
}
for (int i = 0; i < 32; ++i) {
EXPECT_NEAR(expectedg[i], argb[i * 4 + 0], 1);
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment