Commit bffd326f authored by fbarchard@google.com

AVX2 version of ARGBToARGB4444

BUG=403
TESTED=local build on windows
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/43429004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1297 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 94e3d5a3
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1295 Version: 1296
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -216,16 +216,17 @@ extern "C" { ...@@ -216,16 +216,17 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2 #define HAS_YUY2TOYROW_AVX2
#define HAS_ARGBTOARGB4444ROW_AVX2
// The following require HAS_I422TOARGBROW_AVX2 // The following require HAS_I422TOARGBROW_AVX2
#if defined(HAS_I422TOARGBROW_AVX2) #if defined(HAS_I422TOARGBROW_AVX2)
#define HAS_YUY2TOARGBROW_AVX2 #define HAS_YUY2TOARGBROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2
// TODO(fbarchard): Enable once low levels are ported to AVX2 // TODO(fbarchard): Enable once low levels are ported to AVX2
#define HAS_NV12TORGB565ROW_AVX2 // #define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TORGB565ROW_AVX2 // #define HAS_NV21TORGB565ROW_AVX2
#define HAS_I422TORGB565ROW_AVX2 // #define HAS_I422TORGB565ROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2 // #define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2
#endif #endif
...@@ -902,6 +903,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); ...@@ -902,6 +903,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
...@@ -1365,6 +1368,8 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); ...@@ -1365,6 +1368,8 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1295 #define LIBYUV_VERSION 1296
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -931,6 +931,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, ...@@ -931,6 +931,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOARGB4444ROW_NEON) #if defined(HAS_ARGBTOARGB4444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
......
...@@ -34,7 +34,7 @@ extern "C" { ...@@ -34,7 +34,7 @@ extern "C" {
#ifdef HAS_I422TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C, YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C,
1, 4, 7) 1, 4, 7)
#endif // HAS_I422TOARGBROW_SSSE3 #endif
#ifdef HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I444TOARGBROW_SSSE3
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C, YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C,
0, 4, 7) 0, 4, 7)
...@@ -60,24 +60,28 @@ YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) ...@@ -60,24 +60,28 @@ YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
#ifdef HAS_J422TOARGBROW_SSSE3 #ifdef HAS_J422TOARGBROW_SSSE3
YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C, YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C,
1, 4, 7) 1, 4, 7)
#endif // HAS_J422TOARGBROW_SSSE3 #endif
#ifdef HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
#endif // HAS_I422TOARGBROW_AVX2 #endif
#ifdef HAS_I422TOBGRAROW_AVX2 #ifdef HAS_I422TOBGRAROW_AVX2
YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15) YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15)
#endif // HAS_I422TOBGRAROW_AVX2 #endif
#ifdef HAS_I422TORGBAROW_AVX2 #ifdef HAS_I422TORGBAROW_AVX2
YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15) YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15)
#endif // HAS_I422TORGBAROW_AVX2 #endif
#ifdef HAS_I422TOABGRROW_AVX2 #ifdef HAS_I422TOABGRROW_AVX2
YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15) YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15)
#endif // HAS_I422TOABGRROW_AVX2 #endif
#ifdef HAS_I422TOARGB4444ROW_AVX2 #ifdef HAS_I422TOARGB4444ROW_AVX2
YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C, YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C,
1, 2, 7) 1, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C, YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C,
1, 2, 7) 1, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C, YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C,
1, 2, 7) 1, 2, 7)
#endif #endif
...@@ -95,13 +99,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, ...@@ -95,13 +99,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7) 1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
#endif // HAS_I422TOARGBROW_NEON #endif
#ifdef HAS_I422TOYUY2ROW_NEON #ifdef HAS_I422TOYUY2ROW_NEON
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
#endif // HAS_I422TOYUY2ROW_NEON #endif
#ifdef HAS_I422TOUYVYROW_NEON #ifdef HAS_I422TOUYVYROW_NEON
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOUYVYROW_NEON #endif
#undef YANY #undef YANY
// Wrappers to handle odd width // Wrappers to handle odd width
...@@ -120,33 +124,33 @@ YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) ...@@ -120,33 +124,33 @@ YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#ifdef HAS_NV12TOARGBROW_SSSE3 #ifdef HAS_NV12TOARGBROW_SSSE3
NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4, 7) NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4, 7)
NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4, 7) NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4, 7)
#endif // HAS_NV12TOARGBROW_SSSE3 #endif
#ifdef HAS_NV12TOARGBROW_AVX2 #ifdef HAS_NV12TOARGBROW_AVX2
NV2NY(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, NV12ToARGBRow_C, 0, 4, 15) NV2NY(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, NV12ToARGBRow_C, 0, 4, 15)
NV2NY(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, NV21ToARGBRow_C, 0, 4, 15) NV2NY(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, NV21ToARGBRow_C, 0, 4, 15)
#endif // HAS_NV12TOARGBROW_AVX2 #endif
#ifdef HAS_NV12TOARGBROW_NEON #ifdef HAS_NV12TOARGBROW_NEON
NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4, 7) NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4, 7)
NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4, 7) NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4, 7)
#endif // HAS_NV12TOARGBROW_NEON #endif
#ifdef HAS_NV12TORGB565ROW_SSSE3 #ifdef HAS_NV12TORGB565ROW_SSSE3
NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C, NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
0, 2, 7) 0, 2, 7)
NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C, NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
0, 2, 7) 0, 2, 7)
#endif // HAS_NV12TORGB565ROW_SSSE3 #endif
#ifdef HAS_NV12TORGB565ROW_AVX2 #ifdef HAS_NV12TORGB565ROW_AVX2
NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C, NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C,
0, 2, 15) 0, 2, 15)
NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C, NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C,
0, 2, 15) 0, 2, 15)
#endif // HAS_NV12TORGB565ROW_AVX2 #endif
#ifdef HAS_NV12TORGB565ROW_NEON #ifdef HAS_NV12TORGB565ROW_NEON
NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C,
0, 2, 7) 0, 2, 7)
NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C,
0, 2, 7) 0, 2, 7)
#endif // HAS_NV12TORGB565ROW_NEON #endif
#undef NVANY #undef NVANY
#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \ #define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
...@@ -170,6 +174,11 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, ...@@ -170,6 +174,11 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
4, 2, 3) 4, 2, 3)
#endif #endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
// Instantiate ARGBToARGB4444Row_Any_AVX2 through RGBANY, pairing the AVX2 row
// function with the C row function. Per the RGBANY parameter list the trailing
// arguments are SBPP = 4, BPP = 2 and MASK = 7.
// NOTE(review): the RGBANY body is not visible in this view; MASK = 7
// presumably corresponds to the 8-pixel step of ARGBToARGB4444Row_AVX2 and
// SBPP/BPP to source/destination bytes per pixel - confirm against the macro.
RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
4, 2, 7)
#endif
#if defined(HAS_I400TOARGBROW_SSE2) #if defined(HAS_I400TOARGBROW_SSE2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7) RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7)
#endif #endif
......
...@@ -680,8 +680,8 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -680,8 +680,8 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm1, xmm0 movdqa xmm1, xmm0
pand xmm0, xmm3 // low nibble pand xmm0, xmm3 // low nibble
pand xmm1, xmm4 // high nibble pand xmm1, xmm4 // high nibble
psrl xmm0, 4 psrld xmm0, 4
psrl xmm1, 8 psrld xmm1, 8
por xmm0, xmm1 por xmm0, xmm1
packuswb xmm0, xmm0 packuswb xmm0, xmm0
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -693,6 +693,37 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -693,6 +693,37 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
} }
} }
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
// Convert a row of 32-bit ARGB pixels to 16-bit ARGB4444, 8 pixels per
// iteration, using AVX2.
//
// For every 4-byte source pixel the upper 4 bits of each byte are kept and
// packed into 2 destination bytes:
//   dst byte 0 = (src byte 1 & 0xf0) | (src byte 0 >> 4)
//   dst byte 1 = (src byte 3 & 0xf0) | (src byte 2 >> 4)
//
// Arguments (read from the stack, naked function, no prologue):
//   src_argb  source pointer; 32 bytes are read per iteration (unaligned ok).
//   dst_rgb   destination pointer; 16 bytes are written per iteration
//             (unaligned ok).
//   pix       pixel count. The loop tests the count only after the body, so
//             one group of 8 pixels is always processed, and a count that is
//             not a multiple of 8 is effectively rounded up to the next
//             multiple of 8.
//
// Registers used: eax, ecx, edx, ymm0, ymm1, ymm3, ymm4.
__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix
vpcmpeqb ymm4, ymm4, ymm4 // all ones; becomes mask 0xf000f000 after the shift below
vpsllw ymm4, ymm4, 12 // ymm4 = 0xf000 in every 16-bit word (top nibble of bytes 1 and 3)
vpsrlw ymm3, ymm4, 8 // ymm3 = mask 0x00f000f0 (top nibble of bytes 0 and 2)
convertloop:
vmovdqu ymm0, [eax] // fetch 8 pixels of argb (32 bytes, unaligned load)
vpand ymm1, ymm0, ymm4 // nibbles that become the high nibble of each output byte
vpand ymm0, ymm0, ymm3 // nibbles that become the low nibble of each output byte
vpsrld ymm1, ymm1, 8 // 0xX000Y000 -> 0x00X000Y0 per 32-bit pixel
vpsrld ymm0, ymm0, 4 // 0x00X000Y0 -> 0x000X000Y per 32-bit pixel
vpor ymm0, ymm0, ymm1 // each 16-bit word now holds one output byte in its low 8 bits
vpackuswb ymm0, ymm0, ymm0 // narrow words to bytes within each 128-bit lane (values <= 0xff, so no saturation)
vpermq ymm0, ymm0, 0xd8 // reorder qwords to 0,2,1,3 so both lanes' results sit in the low 128 bits
lea eax, [eax + 32] // advance source by 8 pixels * 4 bytes
vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 (16 bytes, unaligned store)
lea edx, [edx + 16] // advance destination by 8 pixels * 2 bytes
sub ecx, 8 // 8 pixels consumed
jg convertloop // continue while the remaining count is > 0
vzeroupper // clear upper ymm halves before returning to the caller
ret
}
}
#endif // HAS_ARGBTOARGB4444ROW_AVX2
// Convert 16 ARGB pixels (64 bytes) to 16 Y values. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment