Commit 324fa327 authored by Frank Barchard, committed by Commit Bot

Convert16To8Row_SSSE3 port from AVX2

H010ToAR30 uses Convert16To8Row_SSSE3 to convert 10 bit YUV to 8 bit.
Then standard YUV conversion can be used.  This improves performance
on low end CPUs.
A future CL will bypass this conversion, allowing for a 10 bit YUV source,
but the function will be useful as a utility for YUV conversions.

Bug: libyuv:559, libyuv:751
Test: out/Release/libyuv_unittest --gtest_filter=*H010ToAR30* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Change-Id: I9b3ef22d88a5fd861de4cf1900b4c6e8fd24d0af
Reviewed-on: https://chromium-review.googlesource.com/792334
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
parent 84456171
...@@ -37,7 +37,7 @@ extern "C" { ...@@ -37,7 +37,7 @@ extern "C" {
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature) #if defined(__has_feature)
#if __has_feature(memory_sanitizer) #if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86 // define LIBYUV_DISABLE_X86
#endif #endif
#endif #endif
// True if compiling for SSSE3 as a requirement. // True if compiling for SSSE3 as a requirement.
...@@ -268,6 +268,7 @@ extern "C" { ...@@ -268,6 +268,7 @@ extern "C" {
// TODO(fbarchard): Port to Visual C // TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3 #define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3
#endif #endif
...@@ -1541,11 +1542,23 @@ void MultiplyRow_16_AVX2(const uint16* src_y, ...@@ -1541,11 +1542,23 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
int width); int width);
void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width); void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
// Row functions that convert 16 bit samples to 8 bit samples.
// scale is a fixed point multiplier chosen for the source bit depth, e.g.
// 16384 for 10 bit input. width is the number of samples in the row.
void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
void Convert16To8Row_SSSE3(const uint16* src_y,
uint8* dst_y,
int scale,
int width);
void Convert16To8Row_AVX2(const uint16* src_y, void Convert16To8Row_AVX2(const uint16* src_y,
uint8* dst_y, uint8* dst_y,
int scale, int scale,
int width); int width);
void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width); void Convert16To8Row_Any_SSSE3(const uint16* src_y,
uint8* dst_y,
int scale,
int width);
void Convert16To8Row_Any_AVX2(const uint16* src_y,
uint8* dst_y,
int scale,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count); void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count); void CopyRow_AVX(const uint8* src, uint8* dst, int count);
......
...@@ -462,15 +462,22 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -462,15 +462,22 @@ static int H010ToAR30Matrix(const uint16* src_y,
dst_stride_ar30 = -dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30;
} }
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
Convert16To8Row = Convert16To8Row_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT16TO8ROW_AVX2) #if defined(HAS_CONVERT16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
Convert16To8Row = Convert16To8Row_C; // TODO(fbarchard): Any AVX2 Convert16To8Row = Convert16To8Row_Any_AVX2;
if (IS_ALIGNED(width, 64)) { if (IS_ALIGNED(width, 32)) {
Convert16To8Row = Convert16To8Row_AVX2; Convert16To8Row = Convert16To8Row_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOAR30ROW_AVX2) #if defined(HAS_ARGBTOAR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
...@@ -479,7 +486,6 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -479,7 +486,6 @@ static int H010ToAR30Matrix(const uint16* src_y,
} }
} }
#endif #endif
#if defined(HAS_I422TOARGBROW_SSSE3) #if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
......
...@@ -732,10 +732,34 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7) ...@@ -732,10 +732,34 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
#undef ANY11P #undef ANY11P
// Any 1 to 1 with parameter and shorts. BPP measures in shorts. // Any 1 to 1 with parameter and shorts. BPP measures in shorts.
// Any 1 to 1 with a scale parameter, 16 bit source to 8 bit destination.
// The SIMD row converts the leading multiple of (MASK + 1) pixels in place.
// The leftover pixels are copied into a zeroed scratch buffer, converted with
// one full (MASK + 1) pixel SIMD pass, and only the valid results are copied
// back, so the SIMD row never reads or writes past the end of the caller's row.
// SBPP and BPP are the source and destination bytes per pixel used to size the
// tail copies.
#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
  void NAMEANY(const uint16* src_ptr, uint8* dst_ptr, int scale, int width) { \
    SIMD_ALIGNED(uint16 tail_in[32]);                                         \
    SIMD_ALIGNED(uint8 tail_out[32]);                                         \
    const int tail = width & MASK;  /* pixels left for the scratch pass */    \
    const int bulk = width & ~MASK; /* pixels the SIMD row does directly */   \
    memset(tail_in, 0, 64);         /* for msan */                            \
    if (bulk > 0) {                                                           \
      ANY_SIMD(src_ptr, dst_ptr, scale, bulk);                                \
    }                                                                         \
    memcpy(tail_in, src_ptr + bulk, tail * SBPP);                             \
    ANY_SIMD(tail_in, tail_out, scale, MASK + 1);                             \
    memcpy(dst_ptr + bulk, tail_out, tail * BPP);                             \
  }
// SSSE3 row converts 16 pixels per pass, hence MASK 15.
#ifdef HAS_CONVERT16TO8ROW_SSSE3
ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, 15)
#endif
// AVX2 variant is instantiated with MASK 31 (32 pixel granularity).
#ifdef HAS_CONVERT16TO8ROW_AVX2
ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, 31)
#endif
#undef ANY11C
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ #define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \ void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \
SIMD_ALIGNED(uint16 temp[16 * 2]); \ SIMD_ALIGNED(uint16 temp[32 * 2]); \
memset(temp, 0, 32); /* for msan */ \ memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \ int r = width & MASK; \
int n = width & ~MASK; \ int n = width & ~MASK; \
if (n > 0) { \ if (n > 0) { \
......
...@@ -2894,6 +2894,37 @@ void MultiplyRow_16_AVX2(const uint16* src_y, ...@@ -2894,6 +2894,37 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
// 16384 = 10 bits // 16384 = 10 bits
// 4096 = 12 bits // 4096 = 12 bits
// 256 = 16 bits // 256 = 16 bits
// Converts a row of 16 bit samples to 8 bit samples.
// Each source word is multiplied by the low 16 bits of scale with pmulhuw,
// which keeps the upper 16 bits of the unsigned product, and the results are
// packed to bytes with unsigned saturation (packuswb).
//   src_y: source row of 16 bit samples (unaligned loads are used).
//   dst_y: destination row of 8 bit samples (unaligned stores are used).
//   scale: fixed point multiplier; only its low 16 bits are used.
//   width: number of samples. The loop always runs at least once and handles
//          16 samples per pass, so callers are expected to pass a positive
//          multiple of 16 (the Any wrapper handles other widths).
void Convert16To8Row_SSSE3(const uint16* src_y,
uint8* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
// Broadcast the low 16 bits of scale to all 8 word lanes of xmm3.
"movd %3,%%xmm3 \n"
"punpcklwd %%xmm3,%%xmm3 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n"
// 16 pixels per loop: 32 bytes of source in, 16 bytes of destination out.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"pmulhuw %%xmm3,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"add $0x20,%0 \n"
"add $0x10,%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm3");
// clang-format on
}
#ifdef HAS_MULTIPLYROW_16_AVX2 #ifdef HAS_MULTIPLYROW_16_AVX2
void Convert16To8Row_AVX2(const uint16* src_y, void Convert16To8Row_AVX2(const uint16* src_y,
uint8* dst_y, uint8* dst_y,
......
...@@ -338,7 +338,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848 ...@@ -338,7 +338,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
uint32 h1 = 0; uint32 h1 = 0;
const int kMaxWidth = benchmark_width_ * benchmark_height_; const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth);
memset(src_a, 255u, kMaxWidth); memset(src_a, 255u, kMaxWidth);
......
...@@ -1966,63 +1966,73 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { ...@@ -1966,63 +1966,73 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
// Alias to copy pixels as is // Alias to copy pixels as is
#define AR30ToAR30 ARGBToARGB #define AR30ToAR30 ARGBToARGB
#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ #define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
ALIGN, YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, \ ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF, \
BPP_C) \ FMT_C, BPP_C) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
const int kBpc = 2; \ const int kBpc = 2; \
align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \ align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \ align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \ for (int i = 0; i < kWidth * kHeight; ++i) { \
reinterpret_cast<uint16*>(src_y)[i + OFF] = (fastrand() & 0x3ff); \ reinterpret_cast<uint16*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \
} \ } \
for (int i = 0; i < kSizeUV; ++i) { \ for (int i = 0; i < kSizeUV; ++i) { \
reinterpret_cast<uint16*>(src_u)[i + OFF] = (fastrand() & 0x3ff); \ reinterpret_cast<uint16*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \
reinterpret_cast<uint16*>(src_v)[i + OFF] = (fastrand() & 0x3ff); \ reinterpret_cast<uint16*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \
} \ } \
memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
MaskCpuFlags(disable_cpu_flags_); \ MaskCpuFlags(disable_cpu_flags_); \
FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth, \ FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y + SOFF), kWidth, \
reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \ reinterpret_cast<uint16*>(src_u + SOFF), kStrideUV, \
reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \ reinterpret_cast<uint16*>(src_v + SOFF), kStrideUV, \
dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight); \ dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \ MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \ for (int i = 0; i < benchmark_iterations_; ++i) { \
FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth, \ FMT_PLANAR##To##FMT_B( \
reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \ reinterpret_cast<uint16*>(src_y + SOFF), kWidth, \
reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \ reinterpret_cast<uint16*>(src_u + SOFF), kStrideUV, \
dst_argb_opt + OFF, kStrideB, kWidth, \ reinterpret_cast<uint16*>(src_v + SOFF), kStrideUV, \
NEG kHeight); \ dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
} \ } \
int max_diff = 0; \ int max_diff = 0; \
for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
static_cast<int>(dst_argb_opt[i])); \ static_cast<int>(dst_argb_opt[i + DOFF])); \
if (abs_diff > max_diff) { \ if (abs_diff > max_diff) { \
max_diff = abs_diff; \ max_diff = abs_diff; \
} \ } \
} \ } \
EXPECT_LE(max_diff, DIFF); \ EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \ free_aligned_buffer_page_end(src_v); \
free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \ free_aligned_buffer_page_end(dst_argb_opt); \
} }
#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, DIFF, FMT_C, BPP_C) \ YALIGN, DIFF, FMT_C, BPP_C) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0, FMT_C, \
BPP_C) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1, FMT_C, \
BPP_C) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0, FMT_C, \
BPP_C) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0, FMT_C, \
BPP_C)
TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4) TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4)
......
...@@ -2720,10 +2720,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { ...@@ -2720,10 +2720,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
dst_pixels_y_c, 16384, kPixels); dst_pixels_y_c, 16384, kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
for (int i = 0; i < benchmark_iterations_; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
if (has_avx2) { if (has_avx2) {
Convert16To8Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_y), Convert16To8Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
dst_pixels_y_opt, 16384, kPixels); dst_pixels_y_opt, 16384, kPixels);
} else if (has_ssse3) {
Convert16To8Row_SSSE3(reinterpret_cast<const uint16*>(src_pixels_y),
dst_pixels_y_opt, 16384, kPixels);
} else { } else {
Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y), Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y),
dst_pixels_y_opt, 16384, kPixels); dst_pixels_y_opt, 16384, kPixels);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment