Commit a0c32b9e authored by Frank Barchard's avatar Frank Barchard Committed by Frank Barchard

MergeUV10Row_AVX2 for converting H010 to P010

H010 is 10 bit planar format with 10 bits in lower bits.
P010 is 10 bit biplanar format with 10 bits in upper bits.
This function weaves the U and V channels and shifts the bits
into the upper bits.

Bug: libyuv:751
Test: LibYUVPlanarTest.MergeUV10Row_Opt
Change-Id: I4a0bac0ef1ff95aa1b8d68261ec8e8e86f2d1fbf
Reviewed-on: https://chromium-review.googlesource.com/752692
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 75ec56b5
......@@ -271,7 +271,7 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available forr gcc/clang x86 platforms:
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
......@@ -279,6 +279,14 @@ extern "C" {
#define HAS_SPLITRGBROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_MERGEUV10ROW_AVX2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
......@@ -1523,6 +1531,15 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
uint8* dst_rgb,
int width);
// Weave planar 10-bit U and V samples into one interleaved UV plane,
// shifting each sample left by 6 so the 10 significant bits land in the
// upper bits of each 16-bit output (H010 -> P010 layout). C reference.
void MergeUV10Row_C(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int width);
// AVX2 version of MergeUV10Row_C. Processes 16 pixels per iteration;
// appears to require width to be a multiple of 16 -- confirm with caller
// before using on arbitrary widths.
void MergeUV10Row_AVX2(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int width);
// Plain byte-copy rows: SSE2/AVX vector copies and ERMS (rep movsb).
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
......
......@@ -1798,6 +1798,24 @@ void MergeRGBRow_C(const uint8* src_r,
}
}
// Weave planar 10-bit U and V samples into an interleaved UV plane,
// shifting each value left by 6 so the 10 significant bits occupy the
// upper bits of each 16-bit output (H010 -> P010).
void MergeUV10Row_C(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int width) {
  int i = 0;
  // Two pixels per pass: emit U,V,U,V for pixels i and i+1.
  while (i + 1 < width) {
    *dst_uv++ = src_u[i] << 6;
    *dst_uv++ = src_v[i] << 6;
    *dst_uv++ = src_u[i + 1] << 6;
    *dst_uv++ = src_v[i + 1] << 6;
    i += 2;
  }
  // Odd width: one trailing U/V pair remains.
  if (width & 1) {
    dst_uv[0] = src_u[width - 1] << 6;
    dst_uv[1] = src_v[width - 1] << 6;
  }
}
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
......
......@@ -2753,6 +2753,48 @@ void MergeUVRow_SSE2(const uint8* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUV10ROW_AVX2
// AVX2 weave of planar 10-bit U and V into interleaved UV, shifting each
// sample into the upper bits of a 16-bit lane (H010 -> P010). Processes 16
// pixels (32 bytes of U and of V, 64 bytes of UV) per loop iteration.
// NOTE(review): the loop counts down by 16 with no remainder handling, so
// width presumably must be a multiple of 16 -- confirm at call sites.
void MergeUV10Row_AVX2(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int width) {
asm volatile (
// %1 becomes (src_v - src_u) so a single advancing pointer (%0) addresses
// both planes: (%0) is U, (%0,%1,1) is V.
"sub %0,%1 \n"
LABELALIGN
"1: \n"
// Load 16 U and 16 V 16-bit samples.
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu (%0,%1,1),%%ymm1 \n"
"add $0x20,%0 \n"
// Shift 10-bit samples into the top of each 16-bit lane.
"vpsllw $0x6,%%ymm0,%%ymm0 \n"
"vpsllw $0x6,%%ymm1,%%ymm1 \n"
// Alternative considered: pre-permute with vpermq, then plain 32-byte
// stores (kept here for reference; the vextractf128 path below is used
// instead).
// "vpermq $0xd8,%%ymm0,%%ymm0 \n"
// "vpermq $0xd8,%%ymm1,%%ymm1 \n"
// vpunpck{l,h}wd interleave U and V words within each 128-bit lane, so
// ymm2/ymm0 hold the output in lane-swizzled order.
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
// "vmovdqu %%ymm2, (%2) \n"
// "vmovdqu %%ymm0, 0x20(%2) \n"
// Store the four 128-bit lanes in corrected order (low2, low0, high2,
// high0), undoing the in-lane interleave without a vpermq.
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
"add $0x40,%2 \n"
// 16 pixels consumed per iteration.
"sub $0x10,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_MERGEUV10ROW_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
......
......@@ -2617,6 +2617,48 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUV10ROW_AVX2
// Check MergeUV10Row_AVX2 against the C reference (and benchmark it);
// falls back to timing the C path when AVX2 is unavailable.
TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_u, kPixels * 2);
align_buffer_page_end(src_pixels_v, kPixels * 2);
align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2);
MemRandomize(src_pixels_u, kPixels * 2);
MemRandomize(src_pixels_v, kPixels * 2);
// Different fills so a row function that writes nothing cannot pass.
memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
const uint16* src_u = reinterpret_cast<const uint16*>(src_pixels_u);
const uint16* src_v = reinterpret_cast<const uint16*>(src_pixels_v);
MergeUV10Row_C(src_u, src_v, reinterpret_cast<uint16*>(dst_pixels_uv_c),
               kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
uint16* dst_opt = reinterpret_cast<uint16*>(dst_pixels_uv_opt);
for (int iter = 0; iter < benchmark_iterations_; ++iter) {
  if (has_avx2) {
    MergeUV10Row_AVX2(src_u, src_v, dst_opt, kPixels);
  } else {
    MergeUV10Row_C(src_u, src_v, dst_opt, kPixels);
  }
}
// Byte-wise compare of the interleaved output.
for (int i = 0; i < kPixels * 2 * 2; ++i) {
  EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
}
free_aligned_buffer_page_end(src_pixels_u);
free_aligned_buffer_page_end(src_pixels_v);
free_aligned_buffer_page_end(dst_pixels_uv_opt);
free_aligned_buffer_page_end(dst_pixels_uv_c);
}
#endif
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment