Commit 1d509f21 authored by Frank Barchard, committed by Commit Bot

ARGBToRGB24_AVX2 version

AVX2 port of SSSE3 conversion to output 24 bit RGB

Bug: libyuv:778
Test: LibYUVConvertTest.NV21ToRGB24_Opt
Change-Id: I14f7815522d1b790ecd2bb39d9a3441e803b694a
Reviewed-on: https://chromium-review.googlesource.com/953303
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent 3009890c
......@@ -275,10 +275,12 @@ extern "C" {
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ABGRTOAR30ROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
......@@ -1701,6 +1703,9 @@ void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
const uint32_t dither4,
......@@ -2492,7 +2497,12 @@ void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint32_t param,
......
......@@ -879,6 +879,14 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTORGB24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToRGB24Row = ARGBToRGB24Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
......@@ -937,6 +945,14 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTORAWROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRAWRow = ARGBToRAWRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToRAWRow = ARGBToRAWRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
......
......@@ -123,27 +123,28 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
// Note that odd width replication includes 444 due to implementation
// on arm that subsamples 444 to 422 internally.
// Any 3 planes to 1 with yuvconstants
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
const uint8_t* v_buf, uint8_t* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8_t temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
if (width & 1) { \
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
} \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
const uint8_t* v_buf, uint8_t* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8_t temp[128 * 4]); \
memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
if (width & 1) { \
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \
} \
ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOARGBROW_SSSE3
......@@ -161,10 +162,10 @@ ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
......@@ -443,6 +444,12 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
#endif
#if defined(HAS_ARGBTORGB24ROW_AVX2)
ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
#endif
#if defined(HAS_ARGBTORAWROW_AVX2)
ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
#endif
#if defined(HAS_ARGBTORGB565ROW_AVX2)
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
#endif
......
......@@ -3004,8 +3004,11 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
#if defined(HAS_ARGBTORGB24ROW_AVX2)
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
src_y += twidth;
src_uv += twidth;
dst_rgb24 += twidth * 3;
......@@ -3025,8 +3028,11 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
#if defined(HAS_ARGBTORGB24ROW_AVX2)
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
src_y += twidth;
src_vu += twidth;
dst_rgb24 += twidth * 3;
......@@ -3124,8 +3130,11 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
#if defined(HAS_ARGBTORGB24ROW_AVX2)
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
......
......@@ -505,6 +505,97 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
// vpermd dword indices for packing 12+12 bytes into 24 contiguous bytes.
// Intended for a register whose two 128-bit lanes each hold three data dwords
// followed by one padding dword: dwords 0-2 and 4-6 are gathered into the low
// 24 bytes, and the two padding dwords (3 and 7) are moved to the top 8 bytes.
// NOTE(review): the AVX2 row functions load this with vmovdqa, which needs a
// 32-byte aligned operand; presumably lvec32 (declared elsewhere) guarantees
// that - confirm.
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
// Converts a row of 4-byte ARGB pixels to 3-byte RGB24 pixels using AVX2.
//   src:   source pixels, 4 bytes each (128 bytes are read per iteration).
//   dst:   destination pixels, 3 bytes each (96 bytes are written per
//          iteration).
//   width: pixel count. Each iteration handles 32 pixels and the count is
//          tested only after the body has run, so at least one full group is
//          always processed and a width that is not a multiple of 32 is
//          effectively rounded up to the next group.
// The per-lane byte selection comes from kShuffleMaskARGBToRGB24 (defined
// elsewhere). The combine steps below rely on that mask zeroing the last
// 4 bytes of every 16-byte lane - TODO confirm against the mask definition.
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// ymm6 = 16-byte shuffle mask replicated into both lanes.
"vbroadcastf128 %3,%%ymm6 \n"
// ymm7 = dword permutation (aligned load).
"vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
// Load 4 x 32 bytes = 32 source pixels and advance src by 128 bytes.
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"lea 0x80(%0),%0 \n"
// Per 128-bit lane: 12 packed bytes followed by 4 padding bytes.
"vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
"vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
"vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
// Across lanes: 24 data bytes in the low part, 8 padding bytes on top.
"vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
"vpermd %%ymm1,%%ymm7,%%ymm1 \n"
"vpermd %%ymm2,%%ymm7,%%ymm2 \n"
"vpermd %%ymm3,%%ymm7,%%ymm3 \n"
// Output 0: ymm0's 24 bytes, with ymm1's first qword moved into the top
// qword (0x3f selects qwords 3,3,3,0; qword 3 is the padding).
"vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
"vpor %%ymm4,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
// Output 1: ymm1's remaining 16 bytes in the low half (0xf9 selects qwords
// 1,2,3,3) and ymm2's first 16 bytes in the high half (0x4f selects qwords
// 3,3,0,1).
"vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
"vpermq $0x4f,%%ymm2,%%ymm4 \n"
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm1,0x20(%1) \n"
// Output 2: ymm2's last 8 data bytes in the low qword (0xfe selects qwords
// 2,3,3,3) and ymm3's 24 bytes shifted up by one qword (0x93 selects qwords
// 3,0,1,2).
"vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
"vpermq $0x93,%%ymm3,%%ymm3 \n"
"vpor %%ymm3,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm2,0x40(%1) \n"
// Advance dst by 96 bytes, count down 32 pixels, loop while count > 0.
"lea 0x60(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
// Clear the upper halves of the ymm registers before returning.
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleMaskARGBToRGB24), // %3
"m"(kPermdRGB24_AVX) // %4
// xmm5 is listed as clobbered although the code above does not touch it.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
// Converts a row of 4-byte ARGB pixels to 3-byte RAW pixels using AVX2.
// The instruction sequence is the same as ARGBToRGB24Row_AVX2; only the
// vpshufb mask differs (kShuffleMaskARGBToRAW, defined elsewhere), which is
// what determines the output byte order.
//   src:   source pixels, 4 bytes each (128 bytes are read per iteration).
//   dst:   destination pixels, 3 bytes each (96 bytes are written per
//          iteration).
//   width: pixel count. Each iteration handles 32 pixels and the count is
//          tested only after the body has run, so at least one full group is
//          always processed and a width that is not a multiple of 32 is
//          effectively rounded up to the next group.
// The combine steps below rely on the mask zeroing the last 4 bytes of every
// 16-byte lane - TODO confirm against the mask definition.
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// ymm6 = 16-byte shuffle mask replicated into both lanes.
"vbroadcastf128 %3,%%ymm6 \n"
// ymm7 = dword permutation (aligned load).
"vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
// Load 4 x 32 bytes = 32 source pixels and advance src by 128 bytes.
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"lea 0x80(%0),%0 \n"
// Per 128-bit lane: 12 packed bytes followed by 4 padding bytes.
"vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
"vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
"vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
// Across lanes: 24 data bytes in the low part, 8 padding bytes on top.
"vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
"vpermd %%ymm1,%%ymm7,%%ymm1 \n"
"vpermd %%ymm2,%%ymm7,%%ymm2 \n"
"vpermd %%ymm3,%%ymm7,%%ymm3 \n"
// Output 0: ymm0's 24 bytes, with ymm1's first qword moved into the top
// qword (0x3f selects qwords 3,3,3,0; qword 3 is the padding).
"vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
"vpor %%ymm4,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
// Output 1: ymm1's remaining 16 bytes in the low half (0xf9 selects qwords
// 1,2,3,3) and ymm2's first 16 bytes in the high half (0x4f selects qwords
// 3,3,0,1).
"vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
"vpermq $0x4f,%%ymm2,%%ymm4 \n"
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm1,0x20(%1) \n"
// Output 2: ymm2's last 8 data bytes in the low qword (0xfe selects qwords
// 2,3,3,3) and ymm3's 24 bytes shifted up by one qword (0x93 selects qwords
// 3,0,1,2).
"vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
"vpermq $0x93,%%ymm3,%%ymm3 \n"
"vpor %%ymm3,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm2,0x40(%1) \n"
// Advance dst by 96 bytes, count down 32 pixels, loop while count > 0.
"lea 0x60(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
// Clear the upper halves of the ymm registers before returning.
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleMaskARGBToRAW), // %3
"m"(kPermdRGB24_AVX) // %4
// xmm5 is listed as clobbered although the code above does not touch it.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"pcmpeqb %%xmm3,%%xmm3 \n"
......
......@@ -1022,15 +1022,9 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
MaskCpuFlags(benchmark_cpu_info_); \
FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
kHeight); \
int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
static_cast<int>(dst_argb_opt[i])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \
} \
EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
......@@ -1050,6 +1044,7 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
HEIGHT_B, DIFF)
// TODO(fbarchard): make ARM version of C code that matches NEON.
TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
......@@ -2458,4 +2453,28 @@ TEST_F(LibYUVConvertTest, TestH420ToAR30) {
free_aligned_buffer_page_end(ar30_pixels);
}
// Round trip: RGB24 -> ARGB -> RGB24 must hand back the original bytes.
TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
  const int kSize = 256;
  align_buffer_page_end(orig_rgb24, kSize * 3);
  align_buffer_page_end(argb_pixels, kSize * 4);
  align_buffer_page_end(dest_rgb24, kSize * 3);

  // Source data is a byte ramp: each byte holds its own index, truncated to
  // 8 bits when stored.
  const int kRgb24Bytes = kSize * 3;
  for (int pos = 0; pos < kRgb24Bytes; ++pos) {
    orig_rgb24[pos] = pos;
  }

  // One row of kSize pixels; strides are passed as 0 (presumably unused for a
  // single row - confirm against the converters).
  RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1);
  ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);

  // Every output byte must equal the corresponding input byte.
  for (int pos = 0; pos < kRgb24Bytes; ++pos) {
    EXPECT_EQ(orig_rgb24[pos], dest_rgb24[pos]);
  }

  free_aligned_buffer_page_end(orig_rgb24);
  free_aligned_buffer_page_end(argb_pixels);
  free_aligned_buffer_page_end(dest_rgb24);
}
} // namespace libyuv
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment