Commit c3677514 authored by Frank Barchard, committed by Commit Bot

ARGBToAR30 SSSE3 use pmulhuw to replicate fields

AR30 is optimized with 3 techniques
1. pmulhuw is used to replicate 8 bits to 10 bits.
2. Two channels are processed at a time.  R and B, and A and G.
3. pshufb is used to shift and mask 2 channels of R and B

Bug: libyuv:751
Test: ARGBToAR30_Opt
Change-Id: I4e62d6caa4df7d0ae80395fa911d3c922b6b897b
Reviewed-on: https://chromium-review.googlesource.com/822520
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent d94a4867
...@@ -268,7 +268,7 @@ extern "C" { ...@@ -268,7 +268,7 @@ extern "C" {
// TODO(fbarchard): Port to Visual C // TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_ARGBTOAR30ROW_SSE2 #define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3 #define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3
...@@ -1796,7 +1796,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); ...@@ -1796,7 +1796,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToAR30Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToAR30Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB565DitherRow_C(const uint8* src_argb, void ARGBToRGB565DitherRow_C(const uint8* src_argb,
uint8* dst_rgb, uint8* dst_rgb,
...@@ -2424,7 +2424,7 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, ...@@ -2424,7 +2424,7 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb,
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb,
uint8* dst_rgb, uint8* dst_rgb,
int width); int width);
void ARGBToAR30Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToAR30Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb,
uint8* dst_rgb, uint8* dst_rgb,
......
...@@ -478,11 +478,11 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -478,11 +478,11 @@ static int H010ToAR30Matrix(const uint16* src_y,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOAR30ROW_SSE2) #if defined(HAS_ARGBTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToAR30Row = ARGBToAR30Row_Any_SSE2; ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
if (IS_ALIGNED(width, 4)) { if (IS_ALIGNED(width, 4)) {
ARGBToAR30Row = ARGBToAR30Row_SSE2; ARGBToAR30Row = ARGBToAR30Row_SSSE3;
} }
} }
#endif #endif
......
...@@ -1333,11 +1333,11 @@ int ARGBToAR30(const uint8* src_argb, ...@@ -1333,11 +1333,11 @@ int ARGBToAR30(const uint8* src_argb,
height = 1; height = 1;
src_stride_argb = dst_stride_ar30 = 0; src_stride_argb = dst_stride_ar30 = 0;
} }
#if defined(HAS_ARGBTOAR30ROW_SSE2) #if defined(HAS_ARGBTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToAR30Row = ARGBToAR30Row_Any_SSE2; ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
if (IS_ALIGNED(width, 4)) { if (IS_ALIGNED(width, 4)) {
ARGBToAR30Row = ARGBToAR30Row_SSE2; ARGBToAR30Row = ARGBToAR30Row_SSSE3;
} }
} }
#endif #endif
......
...@@ -396,8 +396,8 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ...@@ -396,8 +396,8 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#endif #endif
#if defined(HAS_ARGBTOAR30ROW_SSE2) #if defined(HAS_ARGBTOAR30ROW_SSSE3)
ANY11(ARGBToAR30Row_Any_SSE2, ARGBToAR30Row_SSE2, 0, 4, 4, 3) ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
#endif #endif
#if defined(HAS_ARGBTOAR30ROW_AVX2) #if defined(HAS_ARGBTOAR30ROW_AVX2)
ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
......
...@@ -699,12 +699,16 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { ...@@ -699,12 +699,16 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
); );
} }
#endif // HAS_RGB24TOARGBROW_SSSE3 #endif // HAS_RGB24TOARGBROW_SSSE3
/* /*
ARGBToAR30Row:
Red Blue Red Blue
With the 8 bit value in the upper bits, vpmulhuw by (1024+4) will produce a 10 With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
bit value in the low 10 bits of each 16 bit value. This is whats wanted for the produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats
blue channel. The red needs to be shifted 4 left, so multiply by (1024+4)*16 for wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
red. (1024+4)*16 for red.
Alpha Green Alpha Green
Alpha and Green are already in the high bits so vpand can zero out the other Alpha and Green are already in the high bits so vpand can zero out the other
...@@ -717,61 +721,6 @@ and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the ...@@ -717,61 +721,6 @@ and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the
result left 10 to position the A and G channels. result left 10 to position the A and G channels.
*/ */
// Converts a row of 32-bit ARGB pixels at src into 32-bit AR30 pixels at dst
// using SSE2 instructions, 4 pixels (16 bytes) per loop iteration.
//
// For each 32-bit input pixel the code treats bits 0-7 as blue, 8-15 as green,
// 16-23 as red and 24-31 as alpha. The output pixel is laid out as:
//   bits 30-31: top 2 bits of alpha
//   bits 20-29: 10-bit red
//   bits 10-19: 10-bit green
//   bits  0-9 : 10-bit blue
// Each 8-bit colour value v is widened to 10 bits as (v << 2) | (v >> 6),
// i.e. the top 2 bits are replicated into the low 2 bits.
//
// src:   input pixels; read with unaligned loads.
// dst:   output pixels; written with unaligned stores.
// width: pixel count. The loop subtracts 4 per iteration and repeats while the
//        remainder is positive, so a width that is not a multiple of 4 still
//        reads and writes a full group of 4 pixels on the last iteration.
void ARGBToAR30Row_SSE2(const uint8* src, uint8* dst, int width) {
asm volatile(
// Build the two constant masks once, outside the loop.
"pcmpeqb %%xmm4,%%xmm4 \n" // 0x000000ff mask
"psrld $0x18,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n" // 0xc0000000 mask
"pslld $30,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // load 4 ARGB pixels
// alpha: keep only the top 2 bits, already in their final position.
"movdqa %%xmm0,%%xmm3 \n"
"pand %%xmm5,%%xmm3 \n"
// red: isolate the 8-bit value, then OR in (r << 2) at bits 22-29 and
// (r >> 6) at bits 20-21.
"movdqa %%xmm0,%%xmm1 \n"
"psrld $0x10,%%xmm1 \n"
"pand %%xmm4,%%xmm1 \n"
"movdqa %%xmm1,%%xmm2 \n"
"psrld $0x6,%%xmm2 \n"
"pslld $22,%%xmm1 \n"
"pslld $20,%%xmm2 \n"
"por %%xmm1,%%xmm3 \n"
"por %%xmm2,%%xmm3 \n"
// green: same widening, placed at bits 10-19.
"movdqa %%xmm0,%%xmm1 \n"
"psrld $0x08,%%xmm1 \n"
"pand %%xmm4,%%xmm1 \n"
"movdqa %%xmm1,%%xmm2 \n"
"psrld $0x6,%%xmm2 \n"
"pslld $12,%%xmm1 \n"
"pslld $10,%%xmm2 \n"
"por %%xmm1,%%xmm3 \n"
"por %%xmm2,%%xmm3 \n"
// blue: same widening, placed at bits 0-9. xmm0 is consumed here because
// the source pixels are no longer needed.
"pand %%xmm4,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"psrld $0x6,%%xmm1 \n"
"pslld $2,%%xmm0 \n"
"por %%xmm0,%%xmm3 \n"
"por %%xmm1,%%xmm3 \n"
"movdqu %%xmm3,(%1) \n" // store 4 AR30 pixels
"add $0x10,%0 \n" // advance src by 16 bytes
"add $0x10,%1 \n" // advance dst by 16 bytes
"sub $0x4,%2 \n" // 4 pixels done
"jg 1b \n" // loop while pixels remain
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#ifdef HAS_ARGBTOAR30ROW_AVX2
// Shuffle table for converting RAW to RGB24. Last 8. // Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
...@@ -780,6 +729,47 @@ static const uint32 kMaskRB10 = 0x3ff003ff; ...@@ -780,6 +729,47 @@ static const uint32 kMaskRB10 = 0x3ff003ff;
static const uint32 kMaskAG10 = 0xc000ff00; static const uint32 kMaskAG10 = 0xc000ff00;
static const uint32 kMulAG10 = 64 * 65536 + 1028; static const uint32 kMulAG10 = 64 * 65536 + 1028;
// Converts a row of 32-bit ARGB pixels at src into 32-bit AR30 pixels at dst
// using SSSE3 instructions, 4 pixels (16 bytes) per loop iteration.
//
// Two channels are handled at a time:
//  - Red and blue: pshufb with kShuffleRB30 moves source bytes 0 and 2 of each
//    pixel into the high byte of each 16-bit lane (index 128 zeroes the low
//    byte). pmulhuw by kMulRB10 then keeps the high 16 bits of each product,
//    and kMaskRB10 keeps the two 10-bit fields.
//    NOTE(review): kMulRB10 is defined outside this view; per the file comment
//    it is presumably (1024+4) for blue and (1024+4)*16 for red -- confirm.
//  - Alpha and green: kMaskAG10 keeps the top 2 bits of alpha and the green
//    byte, which are already in the high bits of their 16-bit lanes. pmulhuw
//    by kMulAG10 scales them, and a 32-bit left shift by 10 positions them.
// The two halves are ORed together to form each output pixel.
//
// src:   input pixels; read with unaligned loads.
// dst:   output pixels; written with unaligned stores.
// width: pixel count. The loop subtracts 4 per iteration and repeats while the
//        remainder is positive, so a width that is not a multiple of 4 still
//        reads and writes a full group of 4 pixels on the last iteration.
void ARGBToAR30Row_SSSE3(const uint8* src, uint8* dst, int width) {
asm volatile(
// movdqa is an aligned load, so kShuffleRB30 must be 16-byte aligned.
"movdqa %3,%%xmm2 \n" // shuffler for RB
"movd %4,%%xmm3 \n" // multiplier for RB
"movd %5,%%xmm4 \n" // mask for R10 B10
"movd %6,%%xmm5 \n" // mask for AG
"movd %7,%%xmm6 \n" // multiplier for AG
// Broadcast each 32-bit constant to all four dword lanes.
"pshufd $0x0,%%xmm3,%%xmm3 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm6,%%xmm6 \n"
// Turn dst into an offset from src so that one add advances both pointers;
// the store below addresses (%1,%0) = dst offset + src.
"sub %0,%1 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
"movdqa %%xmm0,%%xmm1 \n"
"pshufb %%xmm2,%%xmm1 \n" // R0B0
"pand %%xmm5,%%xmm0 \n" // A0G0
"pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
"pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
"pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
"pslld $10,%%xmm0 \n" // A2 x10 G10 x10
"por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
"movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
"add $0x10,%0 \n" // advance src (and, via the offset, dst) by 16 bytes
"sub $0x4,%2 \n" // 4 pixels done
"jg 1b \n" // loop while pixels remain
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleRB30), // %3
"m"(kMulRB10), // %4
"m"(kMaskRB10), // %5
"m"(kMaskAG10), // %6
"m"(kMulAG10) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) { void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
asm volatile( asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
...@@ -804,15 +794,16 @@ void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -804,15 +794,16 @@ void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kShuffleRB30), // %3 : "m"(kShuffleRB30), // %3
"m"(kMulRB10), // %4 "m"(kMulRB10), // %4
"m"(kMaskRB10), // %5 "m"(kMaskRB10), // %5
"m"(kMaskAG10), // %6 "m"(kMaskAG10), // %6
"m"(kMulAG10) // %7 "m"(kMulAG10) // %7
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6");
} }
#endif #endif
......
...@@ -1947,12 +1947,12 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { ...@@ -1947,12 +1947,12 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
ARGBToAR30Row_C(src, dst_c, kPixels); ARGBToAR30Row_C(src, dst_c, kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_sse2 = TestCpuFlag(kCpuHasSSE2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
for (int i = 0; i < benchmark_iterations_; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
if (has_avx2) { if (has_avx2) {
ARGBToAR30Row_AVX2(src, dst_opt, kPixels); ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
} else if (has_sse2) { } else if (has_ssse3) {
ARGBToAR30Row_SSE2(src, dst_opt, kPixels); ARGBToAR30Row_SSSE3(src, dst_opt, kPixels);
} else { } else {
ARGBToAR30Row_C(src, dst_opt, kPixels); ARGBToAR30Row_C(src, dst_opt, kPixels);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment