Commit 3982998c authored by fbarchard@google.com

YToARGB AVX2 port from SSE2

BUG=393
TESTED=YToARGB unittest
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/41679004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1258 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 0494ffee
...@@ -220,6 +220,11 @@ extern "C" { ...@@ -220,6 +220,11 @@ extern "C" {
#define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2
#endif #endif
// The following are available only with VS2012 or later:
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_YTOARGBROW_AVX2
#endif
// The following are Yasm x86 only: // The following are Yasm x86 only:
// TODO(fbarchard): Port AVX2 to inline. // TODO(fbarchard): Port AVX2 to inline.
#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) #if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
...@@ -980,9 +985,6 @@ void I422ToRGB565Row_C(const uint8* src_y, ...@@ -980,9 +985,6 @@ void I422ToRGB565Row_C(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_rgb565, uint8* dst_rgb565,
int width); int width);
void YToARGBRow_C(const uint8* src_y,
uint8* dst_argb,
int width);
void I422ToARGBRow_AVX2(const uint8* src_y, void I422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1182,15 +1184,25 @@ void I422ToRAWRow_Any_SSSE3(const uint8* src_y, ...@@ -1182,15 +1184,25 @@ void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void YToARGBRow_C(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_SSE2(const uint8* src_y, void YToARGBRow_SSE2(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void YToARGBRow_AVX2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_NEON(const uint8* src_y, void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void YToARGBRow_Any_SSE2(const uint8* src_y, void YToARGBRow_Any_SSE2(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void YToARGBRow_Any_AVX2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_NEON(const uint8* src_y, void YToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
......
...@@ -276,6 +276,14 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, ...@@ -276,6 +276,14 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_YTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
YToARGBRow = YToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
YToARGBRow = YToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_YTOARGBROW_NEON) #if defined(HAS_YTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
YToARGBRow = YToARGBRow_Any_NEON; YToARGBRow = YToARGBRow_Any_NEON;
......
...@@ -169,6 +169,10 @@ RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, ...@@ -169,6 +169,10 @@ RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C,
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
7, 1, 4) 7, 1, 4)
#endif #endif
#if defined(HAS_YTOARGBROW_AVX2)
// Any-width wrapper for the AVX2 Y-to-ARGB row kernel, with YToARGBRow_C named
// as the C counterpart.
// NOTE(review): the RGBANY definition is not visible here. Presumably 15 is
// the width mask matching the kernel's 16-pixel step (the SSE2 variant, an
// 8-pixel kernel, passes 7), and 1 / 4 are source / destination bytes per
// pixel - confirm against the macro.
RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C,
15, 1, 4)
#endif
#if defined(HAS_YUY2TOARGBROW_SSSE3) #if defined(HAS_YUY2TOARGBROW_SSSE3)
RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C, RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C,
15, 2, 4) 15, 2, 4)
......
...@@ -2299,8 +2299,8 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -2299,8 +2299,8 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_SSSE3 #endif // HAS_I422TOARGBROW_SSSE3
// TODO(fbarchard): Remove shift by 6.
#ifdef HAS_YTOARGBROW_SSE2 #ifdef HAS_YTOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf, void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -2341,12 +2341,62 @@ void YToARGBRow_SSE2(const uint8* y_buf, ...@@ -2341,12 +2341,62 @@ void YToARGBRow_SSE2(const uint8* y_buf,
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
jg convertloop jg convertloop
ret ret
} }
} }
#endif // HAS_YTOARGBROW_SSE2 #endif // HAS_YTOARGBROW_SSE2
#ifdef HAS_YTOARGBROW_AVX2
// Converts 16 pixels of Y to 16 pixels of ARGB (64 bytes) per loop iteration.
// Each output pixel carries the scaled Y value in its three low bytes and 0xff
// in its top (alpha) byte.
// The loop body runs before width is tested and consumes 16 pixels per pass,
// so at least 16 pixels are always read and written, and a width that is not a
// multiple of 16 is effectively rounded up to the next multiple.
// Naked function: there is no compiler-generated prologue, so the arguments
// are read directly from the stack at [esp + 4], [esp + 8] and [esp + 12].
__declspec(naked) __declspec(align(16))
void YToARGBRow_AVX2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
vpcmpeqb ymm4, ymm4, ymm4 // all ones, then shift to get alpha mask 0xff000000
vpslld ymm4, ymm4, 24
// NOTE(review): 1.164 * 64 * 16 is about 1192, not 1197, so the derivation
// quoted on the next line does not exactly reproduce the constant - confirm.
mov eax, 0x04ad04ad // 04ad = 1197 = round(1.164 * 64 * 16)
vmovd xmm3, eax
vbroadcastss ymm3, xmm3 // ymm3 = 0x04ad in every 16-bit word (bias)
mov eax, 0x4a7f4a7f // 4a7f = 19071 = round(1.164 * 64 * 256)
vmovd xmm2, eax
vbroadcastss ymm2, xmm2 // ymm2 = 0x4a7f in every 16-bit word (scale)
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
convertloop:
// Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
vmovdqu xmm0, [eax] // load 16 Y bytes into the low 128 bits
lea eax, [eax + 16]
vpermq ymm0, ymm0, 0xd8 // qword order 0,2,1,3: Y0-7 in low lane, Y8-15 in high lane
vpunpcklbw ymm0, ymm0, ymm0 // Y.Y - each Y byte duplicated into a 16-bit word
vpmulhuw ymm0, ymm0, ymm2 // high 16 bits of Y.Y * scale
vpsubusw ymm0, ymm0, ymm3 // subtract bias, saturating at 0
vpsrlw ymm0, ymm0, 6 // drop the 6 fraction bits
vpackuswb ymm0, ymm0, ymm0 // G bytes; each 128-bit lane holds its 8 values twice
// TODO(fbarchard): Weave alpha with unpack.
// Step 2: Weave into ARGB
vpunpcklbw ymm1, ymm0, ymm0 // GG - per lane, so qwords hold pixels 0-3, 4-7, 8-11, 12-15
vpermq ymm1, ymm1, 0xd8 // qword order 0,2,1,3 so the per-lane unpacks below emit pixels in order
vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
vpor ymm0, ymm0, ymm4 // set alpha byte to 0xff
vpor ymm1, ymm1, ymm4
vmovdqu [edx], ymm0 // store pixels 0-7 (32 bytes)
vmovdqu [edx + 32], ymm1 // store pixels 8-15 (32 bytes)
lea edx, [edx + 64]
sub ecx, 16
jg convertloop // repeat while pixels remain
vzeroupper // clear upper ymm halves before returning
ret
}
}
#endif // HAS_YTOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes. // Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = { static const uvec8 kShuffleMirror = {
......
...@@ -1255,11 +1255,13 @@ TEST_F(libyuvTest, TestYToARGB) { ...@@ -1255,11 +1255,13 @@ TEST_F(libyuvTest, TestYToARGB) {
YToARGB(y, 0, argb, 0, 32, 1); YToARGB(y, 0, argb, 0, 32, 1);
for (int i = 0; i < 32; ++i) { for (int i = 0; i < 32; ++i) {
printf("%d: %d <-> %d,%d,%d,%d\n", y[i], expectedg[i], printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
argb[i * 4 + 0], argb[i * 4 + 0],
argb[i * 4 + 1], argb[i * 4 + 1],
argb[i * 4 + 2], argb[i * 4 + 2],
argb[i * 4 + 3]); argb[i * 4 + 3]);
}
for (int i = 0; i < 32; ++i) {
EXPECT_NEAR(expectedg[i], argb[i * 4 + 0], 1); EXPECT_NEAR(expectedg[i], argb[i * 4 + 0], 1);
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment