Commit 49d1e3b0 authored by Frank Barchard, committed by Commit Bot

MultiplyRow_16_AVX2 for converting 10 bit YUV

When converting from LSB-aligned 10 bit formats to MSB-aligned ones, the
values need to be shifted up into the top 10 bits of each 16 bit word.
Doing the shift as a multiply by a power-of-two scale lets the same row
function handle different source bit depths:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
Bug: libyuv:751
Test: LibYUVPlanarTest.MultiplyRow_16_Opt
Change-Id: I9cf226053a164baa14155215cb175065b1c4f169
Reviewed-on: https://chromium-review.googlesource.com/762951
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 2f58d126
......@@ -278,6 +278,7 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2
#endif
// The following are available on Neon platforms:
......@@ -1532,6 +1533,15 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
int scale,
int width);
// AVX2 version of MultiplyRow_16: multiplies each 16 bit sample of src_y by
// scale and stores the low 16 bits of each product in dst_y.
// The assembly loop consumes 32 samples per iteration and runs until the
// remaining count is <= 0, so width is effectively rounded up to a multiple
// of 32.
void MultiplyRow_16_AVX2(const uint16* src_y,
uint16* dst_y,
int scale,
int width);
// Portable C version of MultiplyRow_16: for each x in [0, width),
// dst_y[x] = src_y[x] * scale, truncated to 16 bits when stored.
void MultiplyRow_16_C(const uint16* src_y,
uint16* dst_y,
int scale,
int width);
// Instruction-set specific row copy variants. Their definitions are not
// visible here; presumably each copies count bytes from src to dst like
// CopyRow_C (a plain memcpy) - TODO confirm against the implementations.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
......
......@@ -1817,6 +1817,16 @@ void MergeUVRow_16_C(const uint16* src_u,
}
}
// Multiplies each 16 bit sample of src_y by scale and stores the low 16 bits
// of the product in dst_y.
//   src_y: input row of width 16 bit samples.
//   dst_y: output row of width 16 bit samples.
//   scale: multiplier; a power of two acts as a left shift (e.g. 64 moves
//          10 bit lsb-aligned samples into the top 10 bits).
//   width: number of samples to process.
void MultiplyRow_16_C(const uint16* src_y,
                      uint16* dst_y,
                      int scale,
                      int width) {
  // Multiply in unsigned arithmetic. src_y[x] * scale would be evaluated as a
  // signed int, which overflows (undefined behavior) for large scale values.
  // Unsigned wrap-around is well defined and only the low 16 bits are stored,
  // which is also what the vpmullw-based AVX2 version produces.
  const unsigned int multiplier = static_cast<unsigned int>(scale);
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = static_cast<uint16>(src_y[x] * multiplier);
  }
}
// Copies count bytes from src to dst with memcpy; the two ranges must not
// overlap (memcpy requirement).
void CopyRow_C(const uint8* src, uint8* dst, int count) {
  const size_t byte_count = static_cast<size_t>(count);
  memcpy(dst, src, byte_count);
}
......
......@@ -2758,7 +2758,6 @@ void MergeUVRow_SSE2(const uint8* src_u,
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16* src_u,
const uint16* src_v,
......@@ -2801,6 +2800,41 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
}
#endif // HAS_MERGEUVROW_AVX2
#ifdef HAS_MULTIPLYROW_16_AVX2
// Multiplies each 16 bit sample of src_y by scale and writes the low 16 bits
// of each product to dst_y, using AVX2.
//   src_y: input row of 16 bit samples.
//   dst_y: output row of 16 bit samples.
//   scale: multiplier; only its low 16 bits are used (see the broadcast below).
//   width: number of samples. The loop processes 32 samples per iteration and
//          continues while the remaining count is > 0, so a width that is not
//          a multiple of 32 reads and writes up to the next multiple of 32.
//          NOTE(review): callers presumably guarantee suitable widths or
//          padded buffers - confirm.
void MultiplyRow_16_AVX2(const uint16* src_y,
uint16* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
// Broadcast the low 16 bits of scale into every 16 bit lane of ymm3:
// vmovd loads scale, vpunpcklwd duplicates its low word into the low dword,
// vbroadcastss replicates that dword across the whole register.
"vmovd %3,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
// Turn dst_y into an offset relative to src_y so that (%0,%1) addresses the
// destination and a single pointer increment advances both rows.
"sub %0,%1 \n"
// 32 pixels (64 bytes, two ymm registers) per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
// vpmullw keeps the low 16 bits of each 16x16 bit product.
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%0,%1) \n"
"vmovdqu %%ymm1,0x20(%0,%1) \n"
// Advance 64 bytes and count down 32 samples; loop while samples remain.
"add $0x40,%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
// Zero the upper halves of the ymm registers before leaving AVX code.
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm3");
// clang-format on
}
#endif // HAS_MULTIPLYROW_16_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
......
......@@ -2661,6 +2661,44 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
}
#endif
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MULTIPLYROW_16_AVX2
// Verifies that MultiplyRow_16_AVX2 produces the same output as
// MultiplyRow_16_C for scale 64 (10 bit lsb to msb), and doubles as a
// benchmark by repeating the optimized call benchmark_iterations_ times.
// Falls back to the C version when the CPU does not report AVX2.
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
  // Round the sample count up to a multiple of 32. MultiplyRow_16_AVX2
  // processes 32 samples per loop iteration, so any other count would make it
  // read and write past the end of the buffers (which, judging by the
  // allocator name, presumably end at a page boundary).
  const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
  align_buffer_page_end(src_pixels_y, kPixels * 2);
  align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
  align_buffer_page_end(dst_pixels_y_c, kPixels * 2);

  MemRandomize(src_pixels_y, kPixels * 2);
  // Fill the two outputs with different values so that a function which
  // writes nothing cannot pass the comparison.
  memset(dst_pixels_y_opt, 0, kPixels * 2);
  memset(dst_pixels_y_c, 1, kPixels * 2);

  // Reference result from the C implementation.
  MultiplyRow_16_C(reinterpret_cast<const uint16*>(src_pixels_y),
                   reinterpret_cast<uint16*>(dst_pixels_y_c), 64, kPixels);

  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    if (has_avx2) {
      MultiplyRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
                          reinterpret_cast<uint16*>(dst_pixels_y_opt), 64,
                          kPixels);
    } else {
      MultiplyRow_16_C(reinterpret_cast<const uint16*>(src_pixels_y),
                       reinterpret_cast<uint16*>(dst_pixels_y_opt), 64,
                       kPixels);
    }
  }

  // Compare byte by byte; each sample occupies 2 bytes.
  for (int i = 0; i < kPixels * 2; ++i) {
    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
  }

  free_aligned_buffer_page_end(src_pixels_y);
  free_aligned_buffer_page_end(dst_pixels_y_opt);
  free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment