Commit 2f58d126 authored by Frank Barchard, committed by Commit Bot

MergeUV10Row_AVX2 use multiply to handle different bit depths

Instead of a hardcoded shift, multiply by a scale parameter (see the sketch below):
128 = 9 bits
64 = 10 bits
16 = 12 bits
1 = 16 bits
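
The factors follow one rule: scale = 2^(16 - bits), so the multiply lands the sample in the most-significant bits exactly as the old hardcoded shift did for 10-bit data. A standalone sketch of that relation (the ScaleForBits helper is illustrative, not part of this change):

    #include <assert.h>
    #include <stdint.h>

    /* Scale factor that promotes an lsb-aligned sample with `bits`
     * significant bits to a 16-bit msb-aligned value: 2^(16 - bits). */
    static uint16_t ScaleForBits(int bits) {
      return (uint16_t)(1 << (16 - bits));
    }

    int main(void) {
      assert(ScaleForBits(9) == 128);
      assert(ScaleForBits(10) == 64); /* equivalent to the old << 6 */
      assert(ScaleForBits(12) == 16);
      assert(ScaleForBits(16) == 1);
      return 0;
    }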

Bug: libyuv:751
Test: LibYUVPlanarTest.MergeUV10Row_Opt
Change-Id: Id925edfdbf91243370c90641b50eb8e7625ec329
Reviewed-on: https://chromium-review.googlesource.com/762523
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent e26b0a7e
@@ -277,7 +277,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-#define HAS_MERGEUV10ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
 #endif
 
 // The following are available on Neon platforms:
@@ -1521,14 +1521,16 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
                           uint8* dst_rgb,
                           int width);
-void MergeUV10Row_C(const uint16* src_u,
-                    const uint16* src_v,
-                    uint16* dst_uv,
-                    int width);
-void MergeUV10Row_AVX2(const uint16* src_u,
-                       const uint16* src_v,
-                       uint16* dst_uv,
-                       int width);
+void MergeUVRow_16_C(const uint16* src_u,
+                     const uint16* src_v,
+                     uint16* dst_uv,
+                     int scale, /* 64 for 10 bit */
+                     int width);
+void MergeUVRow_16_AVX2(const uint16* src_u,
+                        const uint16* src_v,
+                        uint16* dst_uv,
+                        int scale,
+                        int width);
 
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
...
@@ -1798,21 +1798,22 @@ void MergeRGBRow_C(const uint8* src_r,
   }
 }
 
-void MergeUV10Row_C(const uint16* src_u,
-                    const uint16* src_v,
-                    uint16* dst_uv,
-                    int width) {
+void MergeUVRow_16_C(const uint16* src_u,
+                     const uint16* src_v,
+                     uint16* dst_uv,
+                     int scale,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    dst_uv[0] = src_u[x] << 6;
-    dst_uv[1] = src_v[x] << 6;
-    dst_uv[2] = src_u[x + 1] << 6;
-    dst_uv[3] = src_v[x + 1] << 6;
+    dst_uv[0] = src_u[x] * scale;
+    dst_uv[1] = src_v[x] * scale;
+    dst_uv[2] = src_u[x + 1] * scale;
+    dst_uv[3] = src_v[x + 1] * scale;
     dst_uv += 4;
   }
   if (width & 1) {
-    dst_uv[0] = src_u[width - 1] << 6;
-    dst_uv[1] = src_v[width - 1] << 6;
+    dst_uv[0] = src_u[width - 1] * scale;
+    dst_uv[1] = src_v[width - 1] * scale;
   }
 }
...
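
Since this C row is the reference the SIMD paths are checked against, it is easy to verify, independently of libyuv, that the multiply is bit-identical to the old shift for 10-bit data. A standalone sketch (not part of the commit):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      /* Every 10-bit sample promoted by * 64 matches the old << 6,
       * including uint16 truncation semantics. */
      for (uint32_t x = 0; x < 1024; ++x) {
        assert((uint16_t)(x * 64) == (uint16_t)(x << 6));
      }
      return 0;
    }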
@@ -2753,13 +2753,23 @@ void MergeUVRow_SSE2(const uint8* src_u,
 }
 #endif  // HAS_MERGEUVROW_SSE2
 
-#ifdef HAS_MERGEUV10ROW_AVX2
-void MergeUV10Row_AVX2(const uint16* src_u,
-                       const uint16* src_v,
-                       uint16* dst_uv,
-                       int width) {
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16* src_u,
+                        const uint16* src_v,
+                        uint16* dst_uv,
+                        int scale,
+                        int width) {
   // clang-format off
   asm volatile (
+    "vmovd      %4,%%xmm3                      \n"
+    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
     "sub        %0,%1                          \n"
 
     // 16 pixels per loop.
@@ -2768,8 +2778,9 @@ void MergeUV10Row_AVX2(const uint16* src_u,
     "vmovdqu    (%0),%%ymm0                    \n"
     "vmovdqu    (%0,%1,1),%%ymm1               \n"
     "add        $0x20,%0                       \n"
-    "vpsllw     $0x6,%%ymm0,%%ymm0             \n"
-    "vpsllw     $0x6,%%ymm1,%%ymm1             \n"
+    "vpmullw    %%ymm3,%%ymm0,%%ymm0           \n"
+    "vpmullw    %%ymm3,%%ymm1,%%ymm1           \n"
     "vpunpcklwd %%ymm1,%%ymm0,%%ymm2           \n"  // mutates
     "vpunpckhwd %%ymm1,%%ymm0,%%ymm0           \n"
     "vextractf128 $0x0,%%ymm2,(%2)             \n"
@@ -2784,8 +2795,8 @@ void MergeUV10Row_AVX2(const uint16* src_u,
     "+r"(src_v),   // %1
     "+r"(dst_uv),  // %2
     "+r"(width)    // %3
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  : "r"(scale)    // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
   // clang-format on
 }
 #endif  // HAS_MERGEUVROW_AVX2
...
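
The three instructions added at the top of the asm body splat the 16-bit scale across ymm3: vmovd loads it into the low dword, vpunpcklwd duplicates the word within that dword, and vbroadcastss copies the dword to all eight lanes, so vpmullw can apply the scale to every pixel where vpsllw previously shifted. A rough intrinsics equivalent of the loop body, for illustration only (not the code this commit adds; like the asm, it handles 16 pixels per iteration and assumes width is a multiple of 16):

    #include <immintrin.h>
    #include <stdint.h>

    void MergeUVRow16Sketch(const uint16_t* src_u, const uint16_t* src_v,
                            uint16_t* dst_uv, int scale, int width) {
      /* Broadcast scale to all 16 word lanes (the vmovd/vpunpcklwd/
       * vbroadcastss sequence in the asm). */
      const __m256i vscale = _mm256_set1_epi16((short)scale);
      for (int x = 0; x < width; x += 16) {
        __m256i u = _mm256_loadu_si256((const __m256i*)(src_u + x));
        __m256i v = _mm256_loadu_si256((const __m256i*)(src_v + x));
        /* Low-16-bit multiply per lane, replacing the old vpsllw. */
        u = _mm256_mullo_epi16(u, vscale);
        v = _mm256_mullo_epi16(v, vscale);
        /* vpunpck{l,h}wd interleave within each 128-bit half, which is
         * why the asm stores 128-bit pieces to restore linear order. */
        __m256i lo = _mm256_unpacklo_epi16(u, v);
        __m256i hi = _mm256_unpackhi_epi16(u, v);
        _mm_storeu_si128((__m128i*)(dst_uv + 2 * x),
                         _mm256_castsi256_si128(lo));
        _mm_storeu_si128((__m128i*)(dst_uv + 2 * x + 8),
                         _mm256_castsi256_si128(hi));
        _mm_storeu_si128((__m128i*)(dst_uv + 2 * x + 16),
                         _mm256_extracti128_si256(lo, 1));
        _mm_storeu_si128((__m128i*)(dst_uv + 2 * x + 24),
                         _mm256_extracti128_si256(hi, 1));
      }
    }

Because every supported scale is a power of two, the low 16 bits that vpmullw keeps are exactly the shifted value; the multiply loses nothing relative to the shift.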
@@ -2618,8 +2618,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
 }
 
 // TODO(fbarchard): improve test for platforms and cpu detect
-#ifdef HAS_MERGEUV10ROW_AVX2
-TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
+#ifdef HAS_MERGEUVROW_16_AVX2
+TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
   const int kPixels = benchmark_width_ * benchmark_height_;
   align_buffer_page_end(src_pixels_u, kPixels * 2);
   align_buffer_page_end(src_pixels_v, kPixels * 2);
@@ -2631,20 +2631,22 @@ TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
   memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
   memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
 
-  MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
-                 reinterpret_cast<const uint16*>(src_pixels_v),
-                 reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
+  MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                  reinterpret_cast<const uint16*>(src_pixels_v),
+                  reinterpret_cast<uint16*>(dst_pixels_uv_c), 64, kPixels);
 
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
-      MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
-                        reinterpret_cast<const uint16*>(src_pixels_v),
-                        reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+      MergeUVRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
+                         reinterpret_cast<const uint16*>(src_pixels_v),
+                         reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
+                         kPixels);
    } else {
-      MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
-                     reinterpret_cast<const uint16*>(src_pixels_v),
-                     reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+      MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                      reinterpret_cast<const uint16*>(src_pixels_v),
+                      reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
+                      kPixels);
    }
  }
...
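
The test pins scale to 64, i.e. the 10-bit layout that the old MergeUV10Row covered; other depths change only that one argument. A hypothetical 12-bit call (buffer names are placeholders, not from this change):

    MergeUVRow_16_C(src_u12, src_v12, dst_uv, 16 /* 12 bit */, width);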