Commit bb5a009d authored by fbarchard@google.com's avatar fbarchard@google.com

ARGB4444ToARGB and ARGB1555ToARGB ported to AVX2.

BUG=421
TESTED=out\release\libyuv_unittest --gtest_filter=*ARGB4444ToARGB*
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/48009004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1363 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8b9f9081
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1362 Version: 1363
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -65,6 +65,7 @@ extern "C" { ...@@ -65,6 +65,7 @@ extern "C" {
#define HAS_ABGRTOYROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2
#define HAS_ARGBSETROW_X86
#define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSE2
#define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBSHUFFLEROW_SSSE3
#define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB1555ROW_SSE2
...@@ -83,7 +84,7 @@ extern "C" { ...@@ -83,7 +84,7 @@ extern "C" {
#define HAS_BGRATOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS #define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2 #define HAS_COPYROW_SSE2
#define HAS_J400TOARGBROW_SSE2 #define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3 #define HAS_I411TOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3
...@@ -97,6 +98,8 @@ extern "C" { ...@@ -97,6 +98,8 @@ extern "C" {
#define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOARGBROW_SSSE3 #define HAS_I444TOARGBROW_SSSE3
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2 #define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_SSSE3
...@@ -113,20 +116,17 @@ extern "C" { ...@@ -113,20 +116,17 @@ extern "C" {
#define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGB565TOARGBROW_SSE2
#define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3 #define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_X86
#define HAS_SETROW_ERMS #define HAS_SETROW_ERMS
#define HAS_ARGBSETROW_X86 #define HAS_SETROW_X86
#define HAS_SPLITUVROW_SSE2 #define HAS_SPLITUVROW_SSE2
#define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOARGBROW_SSSE3
#define HAS_UYVYTOUV422ROW_SSE2 #define HAS_UYVYTOUV422ROW_SSE2
#define HAS_UYVYTOUVROW_SSE2 #define HAS_UYVYTOUVROW_SSE2
#define HAS_UYVYTOYROW_SSE2 #define HAS_UYVYTOYROW_SSE2
#define HAS_I400TOARGBROW_SSE2
#define HAS_YUY2TOARGBROW_SSSE3 #define HAS_YUY2TOARGBROW_SSSE3
#define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2 #define HAS_YUY2TOYROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
// Effects: // Effects:
#define HAS_ARGBADDROW_SSE2 #define HAS_ARGBADDROW_SSE2
...@@ -186,10 +186,12 @@ extern "C" { ...@@ -186,10 +186,12 @@ extern "C" {
// The following are available require VS2012. Port to GCC. // The following are available require VS2012. Port to GCC.
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393 // TODO(fbarchard): Fix GCC AVX2 versions of YUV conversion. bug=393
#define HAS_ARGBTOARGB1555ROW_AVX2 #define HAS_ARGBTOARGB1555ROW_AVX2
#define HAS_ARGBTOARGB4444ROW_AVX2 #define HAS_ARGBTOARGB4444ROW_AVX2
#define HAS_ARGBTORGB565ROW_AVX2 #define HAS_ARGBTORGB565ROW_AVX2
#define HAS_ARGB1555TOARGBROW_AVX2
#define HAS_ARGB4444TOARGBROW_AVX2
#define HAS_I411TOARGBROW_AVX2 #define HAS_I411TOARGBROW_AVX2
#define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2
...@@ -208,7 +210,6 @@ extern "C" { ...@@ -208,7 +210,6 @@ extern "C" {
#define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TOARGBROW_AVX2
#define HAS_NV21TORGB565ROW_AVX2 #define HAS_NV21TORGB565ROW_AVX2
#define HAS_RGB565TOARGBROW_AVX2 #define HAS_RGB565TOARGBROW_AVX2
// TODO(fbarchard): Port to Neon
#define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2
#endif #endif
...@@ -882,6 +883,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, ...@@ -882,6 +883,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
int pix); int pix);
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix); void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
int pix);
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
int pix);
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
...@@ -897,15 +902,20 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); ...@@ -897,15 +902,20 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
int pix); int pix);
void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
int pix);
void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
int pix); int pix);
void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
int pix); int pix);
void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
int pix);
void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
int pix);
void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
int pix);
void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix); void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1362 #define LIBYUV_VERSION 1363
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -1193,6 +1193,14 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -1193,6 +1193,14 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
} }
} }
#endif #endif
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
...@@ -1308,6 +1316,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, ...@@ -1308,6 +1316,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
} }
} }
#endif #endif
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
......
...@@ -634,6 +634,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -634,6 +634,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
} }
} }
#endif #endif
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGB1555TOARGBROW_NEON) #if defined(HAS_ARGB1555TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
...@@ -684,6 +692,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, ...@@ -684,6 +692,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
} }
} }
#endif #endif
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGB4444TOARGBROW_NEON) #if defined(HAS_ARGB4444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
......
...@@ -226,6 +226,14 @@ RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C, ...@@ -226,6 +226,14 @@ RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
RGBANY(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, RGB565ToARGBRow_C, RGBANY(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, RGB565ToARGBRow_C,
2, 4, 15) 2, 4, 15)
#endif #endif
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
RGBANY(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, ARGB1555ToARGBRow_C,
2, 4, 15)
#endif
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
RGBANY(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, ARGB4444ToARGBRow_C,
2, 4, 15)
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2) #if defined(HAS_YUY2TOARGBROW_AVX2)
RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31) RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31)
RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31) RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31)
......
...@@ -573,6 +573,92 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, ...@@ -573,6 +573,92 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
} }
#endif // HAS_RGB565TOARGBROW_AVX2 #endif // HAS_RGB565TOARGBROW_AVX2
#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
int pix) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
vmovd xmm5, eax
vbroadcastss ymm5, xmm5
mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
movd xmm6, eax
vbroadcastss ymm6, xmm6
vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
vpsllw ymm7, ymm7, 8
mov eax, [esp + 4] // src_argb1555
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
sub edx, eax
sub edx, eax
convertloop:
vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
vpsllw ymm1, ymm0, 1 // R in upper 5 bits
vpsllw ymm2, ymm0, 11 // B in upper 5 bits
vpand ymm1, ymm1, ymm3
vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
vpsllw ymm1, ymm1, 8
vpor ymm1, ymm1, ymm2 // RB
vpsraw ymm2, ymm0, 8 // A
vpand ymm0, ymm0, ymm4 // G in middle 5 bits
vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
vpand ymm2, ymm2, ymm7
vpor ymm0, ymm0, ymm2 // AG
vpunpckhbw ymm2, ymm1, ymm0
vpunpcklbw ymm1, ymm1, ymm0
vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
lea eax, [eax + 32]
sub ecx, 16
jg convertloop
vzeroupper
ret
}
}
#endif // HAS_ARGB1555TOARGBROW_AVX2
#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
int pix) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
vmovd xmm4, eax
vbroadcastss ymm4, xmm4
vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
mov eax, [esp + 4] // src_argb4444
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
sub edx, eax
sub edx, eax
convertloop:
vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
vpand ymm2, ymm0, ymm5 // mask high nibbles
vpand ymm0, ymm0, ymm4 // mask low nibbles
vpsrlw ymm3, ymm2, 4
vpsllw ymm1, ymm0, 4
vpor ymm2, ymm2, ymm3
vpor ymm0, ymm0, ymm1
vpunpckhbw ymm1, ymm0, ymm2
vpunpcklbw ymm0, ymm0, ymm2
vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
lea eax, [eax + 32]
sub ecx, 16
jg convertloop
vzeroupper
ret
}
}
#endif // HAS_ARGB4444TOARGBROW_AVX2
// 24 instructions // 24 instructions
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment