Commit bffd326f authored by fbarchard@google.com's avatar fbarchard@google.com

AVX2 version of ARGBToARGB4444

BUG=403
TESTED=local build on windows
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/43429004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1297 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 94e3d5a3
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1295
Version: 1296
License: BSD
License File: LICENSE
......
......@@ -216,16 +216,17 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
#define HAS_ARGBTOARGB4444ROW_AVX2
// The following require HAS_I422TOARGBROW_AVX2
#if defined(HAS_I422TOARGBROW_AVX2)
#define HAS_YUY2TOARGBROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
// TODO(fbarchard): Enable once low levels are ported to AVX2
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TORGB565ROW_AVX2
#define HAS_I422TORGB565ROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
// #define HAS_NV12TORGB565ROW_AVX2
// #define HAS_NV21TORGB565ROW_AVX2
// #define HAS_I422TORGB565ROW_AVX2
// #define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#endif
......@@ -902,6 +903,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
......@@ -1365,6 +1368,8 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1295
#define LIBYUV_VERSION 1296
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -931,6 +931,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOARGB4444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
......
......@@ -34,7 +34,7 @@ extern "C" {
#ifdef HAS_I422TOARGBROW_SSSE3
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C,
1, 4, 7)
#endif // HAS_I422TOARGBROW_SSSE3
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C,
0, 4, 7)
......@@ -60,24 +60,28 @@ YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
#ifdef HAS_J422TOARGBROW_SSSE3
YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C,
1, 4, 7)
#endif // HAS_J422TOARGBROW_SSSE3
#endif
#ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
#endif // HAS_I422TOARGBROW_AVX2
#endif
#ifdef HAS_I422TOBGRAROW_AVX2
YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15)
#endif // HAS_I422TOBGRAROW_AVX2
#endif
#ifdef HAS_I422TORGBAROW_AVX2
YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15)
#endif // HAS_I422TORGBAROW_AVX2
#endif
#ifdef HAS_I422TOABGRROW_AVX2
YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15)
#endif // HAS_I422TOABGRROW_AVX2
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C,
1, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C,
1, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C,
1, 2, 7)
#endif
......@@ -95,13 +99,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
#endif // HAS_I422TOARGBROW_NEON
#endif
#ifdef HAS_I422TOYUY2ROW_NEON
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
#endif // HAS_I422TOYUY2ROW_NEON
#endif
#ifdef HAS_I422TOUYVYROW_NEON
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOUYVYROW_NEON
#endif
#undef YANY
// Wrappers to handle odd width
......@@ -120,33 +124,33 @@ YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#ifdef HAS_NV12TOARGBROW_SSSE3
NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4, 7)
NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4, 7)
#endif // HAS_NV12TOARGBROW_SSSE3
#endif
#ifdef HAS_NV12TOARGBROW_AVX2
NV2NY(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, NV12ToARGBRow_C, 0, 4, 15)
NV2NY(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, NV21ToARGBRow_C, 0, 4, 15)
#endif // HAS_NV12TOARGBROW_AVX2
#endif
#ifdef HAS_NV12TOARGBROW_NEON
NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4, 7)
NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4, 7)
#endif // HAS_NV12TOARGBROW_NEON
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
0, 2, 7)
NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
0, 2, 7)
#endif // HAS_NV12TORGB565ROW_SSSE3
#endif
#ifdef HAS_NV12TORGB565ROW_AVX2
NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C,
0, 2, 15)
NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C,
0, 2, 15)
#endif // HAS_NV12TORGB565ROW_AVX2
#endif
#ifdef HAS_NV12TORGB565ROW_NEON
NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C,
0, 2, 7)
NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C,
0, 2, 7)
#endif // HAS_NV12TORGB565ROW_NEON
#endif
#undef NVANY
#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
......@@ -170,6 +174,11 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
4, 2, 3)
#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
4, 2, 7)
#endif
#if defined(HAS_I400TOARGBROW_SSE2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7)
#endif
......
......@@ -680,8 +680,8 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm1, xmm0
pand xmm0, xmm3 // low nibble
pand xmm1, xmm4 // high nibble
psrl xmm0, 4
psrl xmm1, 8
psrld xmm0, 4
psrld xmm1, 8
por xmm0, xmm1
packuswb xmm0, xmm0
lea eax, [eax + 16]
......@@ -693,6 +693,37 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
vpsllw ymm4, ymm4, 12
vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
convertloop:
vmovdqu ymm0, [eax] // fetch 8 pixels of argb
vpand ymm1, ymm0, ymm4 // high nibble
vpand ymm0, ymm0, ymm3 // low nibble
vpsrld ymm1, ymm1, 8
vpsrld ymm0, ymm0, 4
vpor ymm0, ymm0, ymm1
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8
lea eax, [eax + 32]
vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
lea edx, [edx + 16]
sub ecx, 8
jg convertloop
vzeroupper
ret
}
}
#endif // HAS_ARGBTOARGB4444ROW_AVX2
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment