Commit ffec313d authored by Frank Barchard, committed by Frank Barchard

Add ABGRToAR30 using AVX2 with a reversed shuffler

vpshufb is used to reverse the R and B channels; the code is
otherwise the same as ARGBToAR30.

Bug: libyuv:751
Test: ABGRToAR30 unittest
Change-Id: I30e02925f5c729e4496c5963ba4ba4af16633b3b
Reviewed-on: https://chromium-review.googlesource.com/891807
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Richard Winterton <rrwinterton@gmail.com>
parent ff8ab9ba
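
For reference, AR30 packs each pixel into 32 bits as 2-bit alpha plus 10-bit R, G and B. Both row functions produce identical output; only the source byte order differs, which is why the SIMD path needs nothing more than a reversed shuffle table. A minimal scalar sketch of the packing (the `expand10`/`ar30_pack` helpers are illustrative, not part of libyuv):

```c
#include <stdint.h>

// Expand an 8-bit channel to 10 bits by replicating the top two
// bits into the low bits: 0x00 -> 0x000, 0xFF -> 0x3FF.
static uint32_t expand10(uint8_t v) {
  return ((uint32_t)v << 2) | (v >> 6);
}

// AR30 layout: A in bits 30..31, R in 20..29, G in 10..19, B in 0..9.
// libyuv ARGB stores bytes as B,G,R,A and ABGR as R,G,B,A, so the ABGR
// path only needs the B and R fetch positions swapped -- in the SIMD
// versions that swap is the reversed vpshufb table.
static uint32_t ar30_pack(const uint8_t* px, int swap_rb) {
  uint32_t b = expand10(px[swap_rb ? 2 : 0]);
  uint32_t g = expand10(px[1]);
  uint32_t r = expand10(px[swap_rb ? 0 : 2]);
  uint32_t a = px[3] >> 6;
  return b | (g << 10) | (r << 20) | (a << 30);
}
```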
@@ -55,6 +55,15 @@ int ARGBToRGBA(const uint8_t* src_argb,
int width,
int height);
// Convert ABGR To AR30.
LIBYUV_API
int ABGRToAR30(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_ar30,
int dst_stride_ar30,
int width,
int height);
// Convert ARGB To AR30.
LIBYUV_API
int ARGBToAR30(const uint8_t* src_argb,
......
@@ -252,6 +252,7 @@ extern "C" {
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_ABGRTOAR30ROW_SSSE3
#define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
@@ -268,6 +269,7 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ABGRTOAR30ROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
@@ -1688,7 +1690,8 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
void ARGBToAR30Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ABGRToAR30Row_SSSE3(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
@@ -1710,7 +1713,8 @@ void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
void ARGBToAR30Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ABGRToAR30Row_AVX2(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_AVX2(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -1745,7 +1749,8 @@ void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
@@ -2407,9 +2412,8 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_argb,
void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_argb,
uint8_t* dst_rgb,
@@ -2429,9 +2433,8 @@ void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_argb,
void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
void ABGRToAR30Row_Any_AVX2(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
void ARGBToRGB24Row_Any_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
......
@@ -1217,6 +1217,55 @@ int ARGBToARGB4444(const uint8_t* src_argb,
return 0;
}
// Convert ABGR To AR30.
LIBYUV_API
int ABGRToAR30(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_ar30,
int dst_stride_ar30,
int width,
int height) {
int y;
void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) =
ABGRToAR30Row_C;
if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
// Coalesce rows.
if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) {
width *= height;
height = 1;
src_stride_abgr = dst_stride_ar30 = 0;
}
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ABGRToAR30Row = ABGRToAR30Row_SSSE3;
}
}
#endif
#if defined(HAS_ABGRTOAR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToAR30Row = ABGRToAR30Row_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ABGRToAR30Row = ABGRToAR30Row_AVX2;
}
}
#endif
for (y = 0; y < height; ++y) {
ABGRToAR30Row(src_abgr, dst_ar30, width);
src_abgr += src_stride_abgr;
dst_ar30 += dst_stride_ar30;
}
return 0;
}
// Convert ARGB To AR30.
LIBYUV_API
int ARGBToAR30(const uint8_t* src_argb,
......
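
The new entry point mirrors ARGBToAR30: a negative height flips the image vertically, and contiguous rows are coalesced into one wide row. A hedged usage sketch (buffer contents and sizes are illustrative; libyuv.h is the umbrella header that pulls in the declaration above):

```c
#include <stdint.h>
#include <stdlib.h>
#include "libyuv.h"

int main(void) {
  const int width = 64;
  const int height = 2;
  // Both ABGR and AR30 are 4 bytes per pixel.
  uint8_t* src_abgr = (uint8_t*)calloc((size_t)width * height, 4);
  uint8_t* dst_ar30 = (uint8_t*)calloc((size_t)width * height, 4);
  // ... fill src_abgr with pixel data ...
  // Returns 0 on success; pass -height to convert bottom-up.
  int rc = ABGRToAR30(src_abgr, width * 4, dst_ar30, width * 4, width, height);
  free(src_abgr);
  free(dst_ar30);
  return rc;
}
```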
@@ -428,9 +428,15 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#endif
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
#endif
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
#endif
#if defined(HAS_ABGRTOAR30ROW_AVX2)
ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
#endif
#if defined(HAS_ARGBTOAR30ROW_AVX2)
ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
#endif
......
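
The ANY11 wrappers handle widths that are not a multiple of the SIMD step: the last macro argument (3 for SSSE3's 4 pixels per iteration, 7 for AVX2's 8) is the width mask. A rough sketch of what the macro expands to, with illustrative names standing in for the macro parameters:

```c
#include <stdint.h>
#include <string.h>

void AbgrToAr30Row_Simd(const uint8_t* src, uint8_t* dst, int width);

// Sketch of the ANY11 pattern: run the SIMD body on the aligned
// portion, then route the ragged tail through a scratch buffer so the
// SIMD body never reads or writes past the caller's buffers.
void AbgrToAr30Row_Any(const uint8_t* src, uint8_t* dst, int width) {
  enum { MASK = 7, SBPP = 4, BPP = 4 };  // AVX2: 8 pixels per iteration
  uint8_t temp[128 * 2];
  int r = width & MASK;   // leftover pixels
  int n = width & ~MASK;  // multiple-of-step portion
  if (n > 0) {
    AbgrToAr30Row_Simd(src, dst, n);
  }
  memset(temp, 0, 128);  // avoid reading uninitialized tail bytes
  memcpy(temp, src + n * SBPP, (size_t)r * SBPP);
  AbgrToAr30Row_Simd(temp, temp + 128, MASK + 1);
  memcpy(dst + n * BPP, temp + 128, (size_t)r * BPP);
}
```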
@@ -348,15 +348,28 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
}
}
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
int x;
for (x = 0; x < width; ++x) {
uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
uint32_t a0 = (src_abgr[3] >> 6);
*(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30);
dst_ar30 += 4;
src_abgr += 4;
}
}
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
int x;
for (x = 0; x < width; ++x) {
uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2);
uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
uint32_t a0 = (src_argb[3] >> 6);
*(uint32_t*)(dst_rgb) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
dst_rgb += 4;
*(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
dst_ar30 += 4;
src_argb += 4;
}
}
......
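
The `(v >> 6) | (v << 2)` expression in these rows expands an 8-bit channel to 10 bits by replicating the top two bits into the two new low bits, so 0x00 maps to 0x000, 0xFF maps to 0x3FF, and truncating back down recovers the original byte. A quick standalone self-check (not part of the test suite):

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
  for (uint32_t v = 0; v < 256; ++v) {
    uint32_t v10 = (v >> 6) | (v << 2);
    assert(v10 <= 1023);      // stays within 10 bits
    assert((v10 >> 2) == v);  // dropping the low bits round-trips
  }
  return 0;
}
```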
@@ -730,6 +730,10 @@ result left 10 to position the A and G channels.
// Shuffle table for isolating the B and R channels into 16-bit lanes.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
// Reversed shuffler for ABGR input: swaps the B and R byte positions.
static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
@@ -774,8 +778,46 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#ifdef HAS_ARGBTOAR30ROW_AVX2
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"movdqa %3,%%xmm2 \n" // shuffler for RB
"movd %4,%%xmm3 \n" // multipler for RB
"movd %5,%%xmm4 \n" // mask for R10 B10
"movd %6,%%xmm5 \n" // mask for AG
"movd %7,%%xmm6 \n" // multipler for AG
"pshufd $0x0,%%xmm3,%%xmm3 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm6,%%xmm6 \n"
"sub %0,%1 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
"movdqa %%xmm0,%%xmm1 \n"
"pshufb %%xmm2,%%xmm1 \n" // R0B0
"pand %%xmm5,%%xmm0 \n" // A0G0
"pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
"pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
"pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
"pslld $10,%%xmm0 \n" // A2 x10 G10 x10
"por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
"movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
"add $0x10,%0 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleBR30), // %3 reversed shuffler
"m"(kMulRB10), // %4
"m"(kMaskRB10), // %5
"m"(kMaskAG10), // %6
"m"(kMulAG10) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
@@ -812,6 +854,43 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif
#ifdef HAS_ABGRTOAR30ROW_AVX2
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
"vbroadcastss %4,%%ymm3 \n" // multipler for RB
"vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
"vbroadcastss %6,%%ymm5 \n" // mask for AG
"vbroadcastss %7,%%ymm6 \n" // multipler for AG
"sub %0,%1 \n"
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
"vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
"vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
"vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
"vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
"vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
"vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
"vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
"add $0x20,%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleBR30), // %3 reversed shuffler
"m"(kMulRB10), // %4
"m"(kMaskRB10), // %5
"m"(kMaskAG10), // %6
"m"(kMulAG10) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
......
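
The assembly reaches the same 10-bit values without per-channel shifts: pshufb places each B and R byte in the high byte of a 16-bit lane (v << 8), and pmulhuw (high 16 bits of the unsigned 16x16 product) by 1028 yields ((v << 8) * 1028) >> 16 = v*4 + (v >> 6), exactly the replicated 10-bit channel; the R lane's multiplier carries an extra factor of 16 so R10 lands at dword bits 20..29 after the mask. A scalar sketch of the per-pixel math using the constants above (the helper name is illustrative, and kMulAG10 = 64*65536 + 1028 is assumed from the elided context):

```c
#include <assert.h>
#include <stdint.h>

// Scalar equivalent of the SSSE3/AVX2 AR30 math for one ARGB pixel
// (bytes B,G,R,A); the ABGR variant only swaps which bytes feed the
// B and R lanes.
static uint32_t ar30_simd_equiv(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
  // pshufb (kShuffleRB30): B and R into the high byte of 16-bit lanes.
  // pmulhuw with kMulRB10 = 1028*16*65536 + 1028:
  //   ((v << 8) * 1028) >> 16 == v*4 + (v >> 6), the 10-bit channel;
  //   the extra *16 on the R lane pre-shifts R10 left by 4 bits.
  uint32_t b10 = (((uint32_t)b << 8) * 1028) >> 16;          // bits 0..9
  uint32_t r10x16 = (((uint32_t)r << 8) * 1028 * 16) >> 16;  // R10 << 4
  // pand with kMaskRB10 = 0x3ff003ff: B10 at bits 0..9, R10 at 20..29.
  uint32_t rb = (b10 & 0x3ff) | ((r10x16 & 0x3ff0) << 16);
  // pand with kMaskAG10 = 0xc000ff00 keeps G and the top 2 alpha bits;
  // pmulhuw with kMulAG10 builds G10 and (a >> 6) << 4 in their lanes.
  uint32_t g10 = (((uint32_t)g << 8) * 1028) >> 16;              // bits 0..9
  uint32_t a2x16 = ((((uint32_t)a << 8) & 0xc000) * 64) >> 16;   // (a>>6) << 4
  uint32_t ag = g10 | (a2x16 << 16);
  // pslld $10 then por: G10 to bits 10..19, alpha to bits 30..31.
  return rb | (ag << 10);
}

int main(void) {
  for (uint32_t v = 0; v < 256; ++v) {
    uint8_t c = (uint8_t)v;
    uint32_t c10 = ((uint32_t)c << 2) | (c >> 6);
    uint32_t expect =
        c10 | (c10 << 10) | (c10 << 20) | ((uint32_t)(c >> 6) << 30);
    assert(ar30_simd_equiv(c, c, c, c) == expect);
  }
  return 0;
}
```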
@@ -41,6 +41,7 @@ namespace libyuv {
// Alias to copy pixels as is
#define AR30ToAR30 ARGBCopy
#define ABGRToABGR ARGBCopy
#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
@@ -1065,6 +1066,7 @@ TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
@@ -1945,9 +1947,9 @@ TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
// Caveat: Destination needs to be 4 bytes
TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
// TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ABGR, 4)
TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
// 2x2 frames
@@ -2018,6 +2020,40 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
}
#endif // HAS_ARGBTOAR30ROW_AVX2
#ifdef HAS_ABGRTOAR30ROW_AVX2
TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
// ABGRToAR30Row_AVX2 expects a multiple of 8 pixels.
const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
align_buffer_page_end(src, kPixels * 4);
align_buffer_page_end(dst_opt, kPixels * 4);
align_buffer_page_end(dst_c, kPixels * 4);
MemRandomize(src, kPixels * 4);
memset(dst_opt, 0, kPixels * 4);
memset(dst_c, 1, kPixels * 4);
ABGRToAR30Row_C(src, dst_c, kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
for (int i = 0; i < benchmark_iterations_; ++i) {
if (has_avx2) {
ABGRToAR30Row_AVX2(src, dst_opt, kPixels);
} else if (has_ssse3) {
ABGRToAR30Row_SSSE3(src, dst_opt, kPixels);
} else {
ABGRToAR30Row_C(src, dst_opt, kPixels);
}
}
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_opt[i], dst_c[i]);
}
free_aligned_buffer_page_end(src);
free_aligned_buffer_page_end(dst_opt);
free_aligned_buffer_page_end(dst_c);
}
#endif // HAS_ABGRTOAR30ROW_AVX2
// TODO(fbarchard): Fix clamping issue affected by U channel.
#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \
......