Commit 2f58d126 authored by Frank Barchard, committed by Commit Bot

MergeUV10Row_AVX2 use multiply to handle different bit depths

Instead of a hardcoded shift, use a multiply by a scale parameter:
128 = 9 bits
64 = 10 bits
16 = 12 bits
1 = 16 bits
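In other words, scale = 1 << (16 - bits): the multiply promotes LSB-aligned
samples to MSB-aligned 16-bit values. A minimal sketch of the mapping
(ScaleForBits is an illustrative helper, not part of this commit):

  // Hypothetical helper: scale factor that promotes N-bit LSB-aligned
  // samples to MSB-aligned 16-bit words.
  static int ScaleForBits(int bits) {
    return 1 << (16 - bits);  // 9 -> 128, 10 -> 64, 12 -> 16, 16 -> 1
  }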

Bug: libyuv:751
Test: LibYUVPlanarTest.MergeUV10Row_Opt
Change-Id: Id925edfdbf91243370c90641b50eb8e7625ec329
Reviewed-on: https://chromium-review.googlesource.com/762523
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent e26b0a7e
@@ -277,7 +277,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-#define HAS_MERGEUV10ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
 #endif
 // The following are available on Neon platforms:
@@ -1521,14 +1521,16 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
                           uint8* dst_rgb,
                           int width);
-void MergeUV10Row_C(const uint16* src_u,
-                    const uint16* src_v,
-                    uint16* dst_uv,
-                    int width);
-void MergeUV10Row_AVX2(const uint16* src_u,
-                       const uint16* src_v,
-                       uint16* dst_uv,
-                       int width);
+void MergeUVRow_16_C(const uint16* src_u,
+                     const uint16* src_v,
+                     uint16* dst_uv,
+                     int scale, /* 64 for 10 bit */
+                     int width);
+void MergeUVRow_16_AVX2(const uint16* src_u,
+                        const uint16* src_v,
+                        uint16* dst_uv,
+                        int scale,
+                        int width);
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
@@ -1798,21 +1798,22 @@ void MergeRGBRow_C(const uint8* src_r,
   }
 }

-void MergeUV10Row_C(const uint16* src_u,
-                    const uint16* src_v,
-                    uint16* dst_uv,
-                    int width) {
+void MergeUVRow_16_C(const uint16* src_u,
+                     const uint16* src_v,
+                     uint16* dst_uv,
+                     int scale,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    dst_uv[0] = src_u[x] << 6;
-    dst_uv[1] = src_v[x] << 6;
-    dst_uv[2] = src_u[x + 1] << 6;
-    dst_uv[3] = src_v[x + 1] << 6;
+    dst_uv[0] = src_u[x] * scale;
+    dst_uv[1] = src_v[x] * scale;
+    dst_uv[2] = src_u[x + 1] * scale;
+    dst_uv[3] = src_v[x + 1] * scale;
     dst_uv += 4;
   }
   if (width & 1) {
-    dst_uv[0] = src_u[width - 1] << 6;
-    dst_uv[1] = src_v[width - 1] << 6;
+    dst_uv[0] = src_u[width - 1] * scale;
+    dst_uv[1] = src_v[width - 1] * scale;
   }
 }
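For example, calling the C version with scale = 64 reproduces the old
shift-by-6 behavior for 10-bit data; a minimal usage sketch (buffer
contents are illustrative):

  uint16 u_plane[2] = {512, 513};  // 10-bit samples, LSB-aligned
  uint16 v_plane[2] = {100, 101};
  uint16 uv[4];
  MergeUVRow_16_C(u_plane, v_plane, uv, 64 /* 10 bit */, 2);
  // uv[] is now {U0, V0, U1, V1} with each value scaled by 64,
  // e.g. uv[0] = 512 * 64 = 32768, MSB-aligned in the 16-bit word.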
@@ -2753,13 +2753,23 @@ void MergeUVRow_SSE2(const uint8* src_u,
 }
 #endif  // HAS_MERGEUVROW_SSE2

-#ifdef HAS_MERGEUV10ROW_AVX2
-void MergeUV10Row_AVX2(const uint16* src_u,
-                       const uint16* src_v,
-                       uint16* dst_uv,
-                       int width) {
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16* src_u,
+                        const uint16* src_v,
+                        uint16* dst_uv,
+                        int scale,
+                        int width) {
+  // clang-format off
   asm volatile (
+    "vmovd      %4,%%xmm3                      \n"
+    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
     "sub        %0,%1                          \n"

     // 16 pixels per loop.
@@ -2768,8 +2778,9 @@ void MergeUV10Row_AVX2(const uint16* src_u,
     "vmovdqu    (%0),%%ymm0                    \n"
     "vmovdqu    (%0,%1,1),%%ymm1               \n"
     "add        $0x20,%0                       \n"
-    "vpsllw     $0x6,%%ymm0,%%ymm0             \n"
-    "vpsllw     $0x6,%%ymm1,%%ymm1             \n"
+    "vpmullw    %%ymm3,%%ymm0,%%ymm0           \n"
+    "vpmullw    %%ymm3,%%ymm1,%%ymm1           \n"
     "vpunpcklwd %%ymm1,%%ymm0,%%ymm2           \n"  // mutates
     "vpunpckhwd %%ymm1,%%ymm0,%%ymm0           \n"
     "vextractf128 $0x0,%%ymm2,(%2)             \n"
@@ -2784,8 +2795,8 @@ void MergeUV10Row_AVX2(const uint16* src_u,
       "+r"(src_v),   // %1
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3
-    :
-    : "memory", "cc", "xmm0", "xmm1", "xmm2");
+    : "r"(scale)     // %4
+    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+  // clang-format on
 }
 #endif  // HAS_MERGEUVROW_AVX2
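The three new instructions splat the 32-bit scale operand into all sixteen
16-bit lanes of ymm3 (vmovd loads it, vpunpcklwd pairs the word with itself,
vbroadcastss copies that dword across the register), and vpmullw keeps the
low 16 bits of each word-wise product, matching the C multiply. A rough
intrinsics rendering of the changed steps (a sketch, not the committed code):

  #include <immintrin.h>

  // Broadcast scale into all 16 word lanes and multiply, mirroring
  // vmovd + vpunpcklwd + vbroadcastss followed by vpmullw.
  static __m256i ScaleWords(__m256i pixels, int scale) {
    __m256i s = _mm256_set1_epi16((short)scale);
    return _mm256_mullo_epi16(pixels, s);  // low 16 bits of each product
  }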
@@ -2618,8 +2618,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
 }

 // TODO(fbarchard): improve test for platforms and cpu detect
-#ifdef HAS_MERGEUV10ROW_AVX2
-TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
+#ifdef HAS_MERGEUVROW_16_AVX2
+TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
   const int kPixels = benchmark_width_ * benchmark_height_;
   align_buffer_page_end(src_pixels_u, kPixels * 2);
   align_buffer_page_end(src_pixels_v, kPixels * 2);
@@ -2631,20 +2631,22 @@ TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
   memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
   memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);

-  MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
-                 reinterpret_cast<const uint16*>(src_pixels_v),
-                 reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
+  MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                  reinterpret_cast<const uint16*>(src_pixels_v),
+                  reinterpret_cast<uint16*>(dst_pixels_uv_c), 64, kPixels);

   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
-      MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
-                        reinterpret_cast<const uint16*>(src_pixels_v),
-                        reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+      MergeUVRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
+                         reinterpret_cast<const uint16*>(src_pixels_v),
+                         reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
+                         kPixels);
     } else {
-      MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
-                     reinterpret_cast<const uint16*>(src_pixels_v),
-                     reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+      MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                      reinterpret_cast<const uint16*>(src_pixels_v),
+                      reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
+                      kPixels);
     }
   }