Commit 768f103b authored by Frank Barchard, committed by Frank Barchard

Convert8To16 for better H010 support

Convert planar 8 bit formats to planar 16 bit formats.
Accepts a scale parameter that determines the number of output bits
(e.g. 1024 for 10 bit).

Bug: libyuv:751
Test: Convert8To16 unittest
Change-Id: I8f6ffe64428ddf5769b87e0c069093a50a2541e9
Reviewed-on: https://chromium-review.googlesource.com/835410
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent c67db605
@@ -270,6 +270,7 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
#define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#endif
@@ -281,6 +282,7 @@ extern "C" {
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2
#endif
@@ -1428,6 +1430,24 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
int width);
void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
void Convert8To16Row_C(const uint8* src_y, uint16* dst_y, int scale, int width);
void Convert8To16Row_SSE2(const uint8* src_y,
uint16* dst_y,
int scale,
int width);
void Convert8To16Row_AVX2(const uint8* src_y,
uint16* dst_y,
int scale,
int width);
void Convert8To16Row_Any_SSE2(const uint8* src_y,
uint16* dst_y,
int scale,
int width);
void Convert8To16Row_Any_AVX2(const uint8* src_y,
uint16* dst_y,
int scale,
int width);
void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
void Convert16To8Row_SSSE3(const uint16* src_y,
uint8* dst_y,
......
@@ -699,26 +699,38 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
#undef ANY11P
// Any 1 to 1 with parameter and differing source/destination types.
// SBPP and BPP measure bytes per pixel of source and destination.
#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
void NAMEANY(const uint16* src_ptr, uint8* dst_ptr, int scale, int width) { \
SIMD_ALIGNED(uint16 temp[32]); \
SIMD_ALIGNED(uint8 out[32]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, scale, n); \
} \
memcpy(temp, src_ptr + n, r * SBPP); \
ANY_SIMD(temp, out, scale, MASK + 1); \
memcpy(dst_ptr + n, out, r * BPP); \
#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
SIMD_ALIGNED(STYPE temp[32]); \
SIMD_ALIGNED(DTYPE out[32]); \
memset(temp, 0, 32 * SBPP); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, scale, n); \
} \
memcpy(temp, src_ptr + n, r * SBPP); \
ANY_SIMD(temp, out, scale, MASK + 1); \
memcpy(dst_ptr + n, out, r * BPP); \
}
#ifdef HAS_CONVERT16TO8ROW_SSSE3
ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, 15)
ANY11C(Convert16To8Row_Any_SSSE3,
Convert16To8Row_SSSE3,
2,
1,
uint16,
uint8,
15)
#endif
#ifdef HAS_CONVERT16TO8ROW_AVX2
ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, 31)
ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, uint16, uint8, 31)
#endif
#ifdef HAS_CONVERT8TO16ROW_SSE2
ANY11C(Convert8To16Row_Any_SSE2, Convert8To16Row_SSE2, 1, 2, uint8, uint16, 15)
#endif
#ifdef HAS_CONVERT8TO16ROW_AVX2
ANY11C(Convert8To16Row_Any_AVX2, Convert8To16Row_AVX2, 1, 2, uint8, uint16, 31)
#endif
#undef ANY11C
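For illustration, the generalized ANY11C expands for the SSE2 8-to-16 wrapper roughly as below (a hand expansion, not code from this patch). The SIMD kernel handles the largest multiple-of-16 prefix of the row; the remainder is staged through a zeroed temp buffer so the kernel never reads or writes past the end of the row.

void Convert8To16Row_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr,
                              int scale, int width) {
  SIMD_ALIGNED(uint8 temp[32]);
  SIMD_ALIGNED(uint16 out[32]);
  memset(temp, 0, 32 * 1); /* for msan */
  int r = width & 15;      /* remainder pixels */
  int n = width & ~15;     /* multiple-of-16 prefix */
  if (n > 0) {
    Convert8To16Row_SSE2(src_ptr, dst_ptr, scale, n);
  }
  memcpy(temp, src_ptr + n, r * 1);           /* stage the remainder */
  Convert8To16Row_SSE2(temp, out, scale, 16); /* one full SIMD pass */
  memcpy(dst_ptr + n, out, r * 2);            /* copy back r pixels */
}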
......
@@ -1877,6 +1877,19 @@ void Convert16To8Row_C(const uint16* src_y,
}
}
// Use scale to convert to lsb formats, depending on how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
void Convert8To16Row_C(const uint8* src_y,
uint16* dst_y,
int scale,
int width) {
int x;
scale *= 0x0101; // replicates the byte.
for (x = 0; x < width; ++x) {
dst_y[x] = (src_y[x] * scale) >> 16;
}
}
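A quick sanity check of the scale math (illustrative only, not part of the patch): scale = 1 << N produces N-bit output, so with scale = 1024 the full 8-bit range [0, 255] maps onto the 10-bit range [0, 1023].

#include <assert.h>
#include <stdint.h>

/* Scalar model of one Convert8To16Row_C pixel. */
static uint16_t Convert8To16Pixel(uint8_t v, int scale) {
  scale *= 0x0101;                      /* replicate the byte to 16 bits */
  return (uint16_t)((v * scale) >> 16); /* scale down to N bits */
}

int main(void) {
  assert(Convert8To16Pixel(0, 1024) == 0);      /* black stays black */
  assert(Convert8To16Pixel(255, 1024) == 1023); /* full scale -> 10-bit max */
  assert(Convert8To16Pixel(128, 1024) == 514);  /* 128 * 257 * 1024 >> 16 */
  return 0;
}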
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
......
@@ -2981,7 +2981,7 @@ void Convert16To8Row_SSSE3(const uint16* src_y,
// clang-format on
}
#ifdef HAS_MULTIPLYROW_16_AVX2
#ifdef HAS_CONVERT16TO8ROW_AVX2
void Convert16To8Row_AVX2(const uint16* src_y,
uint8* dst_y,
int scale,
@@ -3014,7 +3014,81 @@ void Convert16To8Row_AVX2(const uint16* src_y,
: "memory", "cc", "xmm0", "xmm1", "xmm2");
// clang-format on
}
#endif // HAS_MULTIPLYROW_16_AVX2
#endif // HAS_CONVERT16TO8ROW_AVX2
// Use scale to convert to lsb formats, depending on how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
void Convert8To16Row_SSE2(const uint8* src_y,
uint16* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"movd %3,%%xmm2 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
// 16 pixels per loop.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"add $0x10,%0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"add $0x20,%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2");
// clang-format on
}
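Two tricks carry this loop: punpck{l,h}bw of a register with itself duplicates each byte, the SIMD counterpart of the C code's multiply by 0x0101, and pmulhuw keeps the high 16 bits of the unsigned product, i.e. (x * scale) >> 16. A rough intrinsics equivalent is sketched below; it is for readability only, not the shipped code, and assumes width is a multiple of 16 (which the Any wrapper guarantees).

#include <emmintrin.h> /* SSE2 */
#include <stdint.h>

static void Convert8To16Row_SSE2_Sketch(const uint8_t* src_y, uint16_t* dst_y,
                                        int scale, int width) {
  const __m128i kScale = _mm_set1_epi16((short)scale);
  for (int x = 0; x < width; x += 16) {
    __m128i v = _mm_loadu_si128((const __m128i*)(src_y + x));
    __m128i lo = _mm_unpacklo_epi8(v, v); /* duplicate bytes: v * 0x0101 */
    __m128i hi = _mm_unpackhi_epi8(v, v);
    lo = _mm_mulhi_epu16(lo, kScale);     /* (v * 0x0101 * scale) >> 16 */
    hi = _mm_mulhi_epu16(hi, kScale);
    _mm_storeu_si128((__m128i*)(dst_y + x), lo);
    _mm_storeu_si128((__m128i*)(dst_y + x + 8), hi);
  }
}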
#ifdef HAS_CONVERT8TO16ROW_AVX2
void Convert8To16Row_AVX2(const uint8* src_y,
uint16* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"vmovd %3,%%xmm2 \n"
"vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
"vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"add $0x20,%0 \n"
"vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"add $0x40,%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2");
// clang-format on
}
#endif // HAS_CONVERT8TO16ROW_AVX2
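The AVX2 version has one subtlety: vpunpcklbw/vpunpckhbw interleave within each 128-bit lane, so vpermq $0xd8 first reorders the 64-bit quadwords (0,2,1,3) to make the unpacked pixels come out in memory order. An intrinsics sketch of the same loop (illustrative only; assumes width is a multiple of 32):

#include <immintrin.h> /* AVX2 */
#include <stdint.h>

static void Convert8To16Row_AVX2_Sketch(const uint8_t* src_y, uint16_t* dst_y,
                                        int scale, int width) {
  const __m256i kScale = _mm256_set1_epi16((short)scale);
  for (int x = 0; x < width; x += 32) {
    __m256i v = _mm256_loadu_si256((const __m256i*)(src_y + x));
    v = _mm256_permute4x64_epi64(v, 0xd8);   /* lanes 0,2,1,3 */
    __m256i lo = _mm256_unpacklo_epi8(v, v); /* pixels 0-15, duplicated */
    __m256i hi = _mm256_unpackhi_epi8(v, v); /* pixels 16-31, duplicated */
    lo = _mm256_mulhi_epu16(lo, kScale);     /* (v * 0x0101 * scale) >> 16 */
    hi = _mm256_mulhi_epu16(hi, kScale);
    _mm256_storeu_si256((__m256i*)(dst_y + x), lo);
    _mm256_storeu_si256((__m256i*)(dst_y + x + 16), hi);
  }
}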
#ifdef HAS_SPLITRGBROW_SSSE3
......
@@ -2733,13 +2733,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT16TO8ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
// AVX2 does multiples of 32 pixels, so round the count up.
const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels);
align_buffer_page_end(dst_pixels_y_c, kPixels);
MemRandomize(src_pixels_y, kPixels * 2);
// C code does not clamp so limit source range to 10 bits.
// Clamp source range to 10 bits.
for (int i = 0; i < kPixels; ++i) {
reinterpret_cast<uint16*>(src_pixels_y)[i] &= 1023;
}
@@ -2775,6 +2776,50 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
}
#endif // HAS_CONVERT16TO8ROW_AVX2
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT8TO16ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
align_buffer_page_end(src_pixels_y, kPixels);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
MemRandomize(src_pixels_y, kPixels);
memset(dst_pixels_y_opt, 0, kPixels * 2);
memset(dst_pixels_y_c, 1, kPixels * 2);
Convert8To16Row_C(src_pixels_y, reinterpret_cast<uint16*>(dst_pixels_y_c),
1024, kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
for (int i = 0; i < benchmark_iterations_; ++i) {
if (has_avx2) {
Convert8To16Row_AVX2(src_pixels_y,
reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
kPixels);
} else if (has_sse2) {
Convert8To16Row_SSE2(src_pixels_y,
reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
kPixels);
} else {
Convert8To16Row_C(src_pixels_y,
reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
kPixels);
}
}
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(reinterpret_cast<uint16*>(dst_pixels_y_opt)[i],
reinterpret_cast<uint16*>(dst_pixels_y_c)[i]);
}
free_aligned_buffer_page_end(src_pixels_y);
free_aligned_buffer_page_end(dst_pixels_y_opt);
free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif // HAS_CONVERT8TO16ROW_AVX2
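The raw kernels require widths that are multiples of 16 (SSE2) or 32 (AVX2), which is why the test rounds kPixels up; arbitrary widths go through the Any wrappers instead. A hypothetical caller (the helper name is made up for illustration):

/* Hypothetical helper, assuming SSE2 was detected at startup. */
static void ConvertRow8To10(const uint8* src, uint16* dst, int width) {
  Convert8To16Row_Any_SSE2(src, dst, 1024 /* 10 bit */, width);
}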
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
......