Commit 685b92b0 authored by fbarchard@google.com

I400ToARGB_AVX2 port from SSE2 to AVX2.

BUG=403
TESTED=libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*I400ToARGB*
R=brucedawson@google.com

Review URL: https://webrtc-codereview.appspot.com/46569004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1322 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f5a7b2b4
...@@ -200,6 +200,7 @@ extern "C" { ...@@ -200,6 +200,7 @@ extern "C" {
#define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I444TOARGBROW_AVX2 #define HAS_I444TOARGBROW_AVX2
#define HAS_I411TOARGBROW_AVX2 #define HAS_I411TOARGBROW_AVX2
#define HAS_I400TOARGBROW_AVX2
// TODO(fbarchard): Port to Neon // TODO(fbarchard): Port to Neon
#define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2
...@@ -935,9 +936,11 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); ...@@ -935,9 +936,11 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I444ToARGBRow_C(const uint8* src_y, void I444ToARGBRow_C(const uint8* src_y,
......
...@@ -349,6 +349,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, ...@@ -349,6 +349,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I400ToARGBRow = I400ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I400ToARGBRow = I400ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I400TOARGBROW_NEON) #if defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I400ToARGBRow = I400ToARGBRow_Any_NEON; I400ToARGBRow = I400ToARGBRow_Any_NEON;
......
...@@ -192,6 +192,9 @@ RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C, ...@@ -192,6 +192,9 @@ RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
#if defined(HAS_I400TOARGBROW_SSE2) #if defined(HAS_I400TOARGBROW_SSE2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7) RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7)
#endif #endif
#if defined(HAS_I400TOARGBROW_AVX2)
// Instantiates I400ToARGBRow_Any_AVX2 from I400ToARGBRow_AVX2, naming
// I400ToARGBRow_C as its C counterpart. The trailing mask of 15 lines up with
// the 16-pixel step of I400ToARGBRow_AVX2.
// NOTE(review): 1 and 4 are presumably the source and destination bytes per
// pixel - confirm against the RGBANY definition, which is not visible here.
RGBANY(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, I400ToARGBRow_C, 1, 4, 15)
#endif
#if defined(HAS_YTOARGBROW_SSE2) #if defined(HAS_YTOARGBROW_SSE2)
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7) RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7)
#endif #endif
......
...@@ -284,6 +284,38 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -284,6 +284,38 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
} }
} }
#ifdef HAS_I400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
//
// Expands a row of 8-bit gray samples into 4-byte pixels: every input byte is
// replicated into all four bytes of a dword and the top byte is then forced
// to 0xff by OR-ing with a 0xff000000 mask.
//
//   src_y    - source gray bytes; 16 are read per loop iteration.
//   dst_argb - destination; 64 bytes (16 pixels) are written per iteration.
//   pix      - number of pixels to convert.
//
// The loop body runs at least once and always handles a full group of 16
// pixels, so a pix that is not a positive multiple of 16 causes reads and
// writes beyond pix pixels. Loads and stores use vmovdqu, so neither pointer
// needs to be aligned.
//
// Naked function: there is no compiler-generated prologue or epilogue, so the
// arguments are fetched directly from the stack relative to esp and the asm
// block issues its own ret. Only eax, ecx, edx, ymm0, ymm1 and ymm5 are used.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
// All-ones shifted left by 24 within each dword leaves 0xff in the top byte.
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
convertloop:
// Load 16 gray bytes into the low 128 bits; the VEX-encoded xmm load clears
// the upper 128 bits of ymm0.
vmovdqu xmm0, [eax]
lea eax, [eax + 16]
// 0xd8 selects qwords in the order 0,2,1,3: bytes 0-7 stay in the low qword of
// lane 0 and bytes 8-15 move to the low qword of lane 1, because the unpack
// instructions below operate within each 128-bit lane.
vpermq ymm0, ymm0, 0xd8
// Interleave each byte with itself: lane 0 = y0y0..y7y7, lane 1 = y8y8..y15y15.
vpunpcklbw ymm0, ymm0, ymm0
// Reorder qwords again so lane 0 = {y0-y3, y8-y11} and lane 1 = {y4-y7, y12-y15}
// (each sample still doubled).
vpermq ymm0, ymm0, 0xd8
// Interleave each word with itself to get 4 copies of every sample.
// High qwords -> ymm1 = pixels 8-15, low qwords -> ymm0 = pixels 0-7.
vpunpckhwd ymm1, ymm0, ymm0
vpunpcklwd ymm0, ymm0, ymm0
// Force the top byte of every pixel to 0xff.
vpor ymm0, ymm0, ymm5
vpor ymm1, ymm1, ymm5
vmovdqu [edx], ymm0
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
// lea leaves the flags alone; jg tests the flags set by this sub and loops
// while the remaining pixel count is still positive.
sub ecx, 16
jg convertloop
// Clear the upper halves of the ymm registers before returning to the caller.
vzeroupper
ret
}
}
#endif // HAS_I400TOARGBROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
__asm { __asm {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment