Commit 324fa327 authored by Frank Barchard, committed by Commit Bot

Convert16To8Row_SSSE3 port from AVX2

H010ToAR30 uses Convert16To8Row_SSSE3 to convert 10-bit YUV to 8-bit,
after which the standard YUV conversion path can be used. This improves
performance on low-end CPUs.
A future CL will bypass this conversion, allowing a 10-bit YUV source to
be used directly, but the function will remain useful as a utility for
YUV conversions.
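
For context, the conversion itself is a fixed-point multiply-high:
dst = (src * scale) >> 16, so scale 16384 maps the 10-bit range 0..1023
onto 0..255 (exactly src >> 2). A minimal sketch of what the C reference
path computes, using stdint.h types in place of libyuv's uint16/uint8
typedefs; the clamp mirrors the saturating packuswb in the SIMD kernels:

  #include <stdint.h>

  // Sketch: dst = clamp255((src * scale) >> 16).
  // scale is 65536 >> (bits - 8): 16384 for 10-bit, 4096 for 12-bit input.
  static void Convert16To8Row_C_sketch(const uint16_t* src_y, uint8_t* dst_y,
                                       int scale, int width) {
    for (int x = 0; x < width; ++x) {
      uint32_t v = ((uint32_t)src_y[x] * (uint32_t)scale) >> 16;
      dst_y[x] = (uint8_t)(v > 255 ? 255 : v);  // saturate like packuswb
    }
  }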

Bug: libyuv:559, libyuv:751
Test: out/Release/libyuv_unittest --gtest_filter=*H010ToAR30* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Change-Id: I9b3ef22d88a5fd861de4cf1900b4c6e8fd24d0af
Reviewed-on: https://chromium-review.googlesource.com/792334
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
parent 84456171
@@ -37,7 +37,7 @@ extern "C" {
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
+// define LIBYUV_DISABLE_X86
#endif
#endif
// True if compiling for SSSE3 as a requirement.
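
The guard exists because MemorySanitizer only tracks initialization
through compiler-instrumented stores; writes performed inside inline
assembly leave the destination's shadow state untouched. A hypothetical
illustration of the false positive the guard avoids:

  uint8 dst[16];
  Convert16To8Row_SSSE3(src, dst, 16384, 16);  // stores happen in asm
  if (dst[0] == 0) { /* ... */ }  // MSan: use-of-uninitialized-value

The memset calls marked /* for msan */ in the row wrappers below serve
the same purpose for scratch buffers.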
@@ -268,6 +268,7 @@ extern "C" {
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#endif
@@ -1541,11 +1542,23 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
int width);
void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
+void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width);
void Convert16To8Row_AVX2(const uint16* src_y,
uint8* dst_y,
int scale,
int width);
-void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_Any_SSSE3(const uint16* src_y,
+                               uint8* dst_y,
+                               int scale,
+                               int width);
+void Convert16To8Row_Any_AVX2(const uint16* src_y,
+                              uint8* dst_y,
+                              int scale,
+                              int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
@@ -462,15 +462,22 @@ static int H010ToAR30Matrix(const uint16* src_y,
dst_stride_ar30 = -dst_stride_ar30;
}
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
#if defined(HAS_CONVERT16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
-    Convert16To8Row = Convert16To8Row_C;  // TODO(fbarchard): Any AVX2
-    if (IS_ALIGNED(width, 64)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
Convert16To8Row = Convert16To8Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOAR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
@@ -479,7 +486,6 @@ static int H010ToAR30Matrix(const uint16* src_y,
}
}
#endif
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
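
These blocks follow libyuv's usual two-tier dispatch: each #if block
overrides the previous choice when the CPU supports a better instruction
set, the _Any_ wrapper handles arbitrary widths, and the exact kernel is
substituted only when width is a whole number of SIMD steps (16 pixels
for SSSE3, 32 for AVX2). A condensed sketch of the idiom, using libyuv's
TestCpuFlag and IS_ALIGNED as above:

  void (*Convert16To8Row)(const uint16* src, uint8* dst, int scale, int width) =
      Convert16To8Row_C;  // portable fallback
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Convert16To8Row = Convert16To8Row_Any_SSSE3;  // safe for any width
    if (IS_ALIGNED(width, 16)) {
      Convert16To8Row = Convert16To8Row_SSSE3;    // whole 16-pixel blocks only
    }
  }
  // An AVX2 check placed after this one overrides the SSSE3 choice.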
@@ -732,10 +732,34 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
#undef ANY11P
+// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+  void NAMEANY(const uint16* src_ptr, uint8* dst_ptr, int scale, int width) { \
+    SIMD_ALIGNED(uint16 temp[32]); \
+    SIMD_ALIGNED(uint8 out[32]); \
+    memset(temp, 0, 64); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n); \
+    } \
+    memcpy(temp, src_ptr + n, r * SBPP); \
+    ANY_SIMD(temp, out, scale, MASK + 1); \
+    memcpy(dst_ptr + n, out, r * BPP); \
+  }
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, 31)
+#endif
+#undef ANY11C
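
Expanded for the SSSE3 instantiation, the wrapper splits a row into a
main part that is a multiple of the SIMD block and a remainder staged
through aligned scratch buffers; zeroing temp keeps MemorySanitizer
happy because the kernel always reads a full block. A de-macroized
sketch, assuming libyuv's SIMD_ALIGNED macro and the kernel declared in
this CL, with stdint.h types for the typedefs:

  #include <stdint.h>
  #include <string.h>

  void Convert16To8Row_Any_SSSE3_sketch(const uint16_t* src_ptr,
                                        uint8_t* dst_ptr, int scale,
                                        int width) {
    SIMD_ALIGNED(uint16_t temp[16]);  // one full 16-pixel source block
    SIMD_ALIGNED(uint8_t out[16]);    // one full output block
    memset(temp, 0, sizeof(temp));    // for msan: kernel reads all 16 pixels
    int r = width & 15;               // leftover pixels
    int n = width & ~15;              // pixels covered by whole blocks
    if (n > 0) {
      Convert16To8Row_SSSE3(src_ptr, dst_ptr, scale, n);
    }
    memcpy(temp, src_ptr + n, r * 2);             // 2 bytes per source pixel
    Convert16To8Row_SSSE3(temp, out, scale, 16);  // one block on scratch
    memcpy(dst_ptr + n, out, r);                  // keep only the r valid bytes
  }

The shared macro sizes temp and out for the larger AVX2 mask (32
pixels), which is why the definitions above use 32-element arrays.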
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \
-    SIMD_ALIGNED(uint16 temp[16 * 2]); \
-    memset(temp, 0, 32); /* for msan */ \
+    SIMD_ALIGNED(uint16 temp[32 * 2]); \
+    memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
@@ -2894,6 +2894,37 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width) {
+  // clang-format off
+  asm volatile (
+    "movd %3,%%xmm3 \n"
+    "punpcklwd %%xmm3,%%xmm3 \n"
+    "pshufd $0x0,%%xmm3,%%xmm3 \n"
+    // 16 pixels per loop.
+    LABELALIGN
+    "1: \n"
+    "movdqu (%0),%%xmm0 \n"
+    "movdqu 0x10(%0),%%xmm1 \n"
+    "pmulhuw %%xmm3,%%xmm0 \n"
+    "pmulhuw %%xmm3,%%xmm1 \n"
+    "packuswb %%xmm1,%%xmm0 \n"
+    "movdqu %%xmm0,(%1) \n"
+    "add $0x20,%0 \n"
+    "add $0x10,%1 \n"
+    "sub $0x10,%2 \n"
+    "jg 1b \n"
+    : "+r"(src_y), // %0
+      "+r"(dst_y), // %1
+      "+r"(width) // %2
+    : "r"(scale) // %3
+    : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
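
The kernel broadcasts scale into xmm3, then uses pmulhuw, which keeps
the high 16 bits of an unsigned 16x16-bit multiply, so each lane
computes (v * scale) >> 16; packuswb then narrows two registers of
shorts to 16 bytes with unsigned saturation. A scalar self-check of
that arithmetic for the 10-bit case:

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    for (uint32_t v = 0; v < 1024; ++v) {            // full 10-bit range
      uint16_t hi = (uint16_t)((v * 16384u) >> 16);  // what pmulhuw keeps
      assert(hi == v >> 2);  // 10-bit to 8-bit is a plain shift
      assert(hi <= 255);     // packuswb never needs to clamp here
    }
    return 0;
  }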
#ifdef HAS_MULTIPLYROW_16_AVX2
void Convert16To8Row_AVX2(const uint16* src_y,
uint8* dst_y,
@@ -338,7 +338,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
uint32 h1 = 0;
-  const int kMaxWidth = benchmark_width_ * benchmark_height_;
+  const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
memset(src_a, 255u, kMaxWidth);
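
Rounding the size up with (x + 31) & ~31 lets the optimized Hamming
distance kernels run whole 32-byte blocks with no scalar tail. For
example:

  // (x + 31) & ~31 rounds x up to the next multiple of 32:
  //   1280 * 720 = 921600 -> already a multiple of 32, unchanged
  //   a 1000-byte buffer  -> (1000 + 31) & ~31 = 1024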
@@ -1966,63 +1966,73 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
// Alias to copy pixels as is
#define AR30ToAR30 ARGBToARGB
-#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
-                         ALIGN, YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, \
-                         BPP_C) \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
-    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
-    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
-    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
-    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
-    const int kBpc = 2; \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
-    for (int i = 0; i < kWidth * kHeight; ++i) { \
-      reinterpret_cast<uint16*>(src_y)[i + OFF] = (fastrand() & 0x3ff); \
-    } \
-    for (int i = 0; i < kSizeUV; ++i) { \
-      reinterpret_cast<uint16*>(src_u)[i + OFF] = (fastrand() & 0x3ff); \
-      reinterpret_cast<uint16*>(src_v)[i + OFF] = (fastrand() & 0x3ff); \
-    } \
-    memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
-    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
-    MaskCpuFlags(disable_cpu_flags_); \
-    FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth, \
-                          reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \
-                          reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \
-                          dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight); \
-    MaskCpuFlags(benchmark_cpu_info_); \
-    for (int i = 0; i < benchmark_iterations_; ++i) { \
-      FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth, \
-                            reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \
-                            reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \
-                            dst_argb_opt + OFF, kStrideB, kWidth, \
-                            NEG kHeight); \
-    } \
-    int max_diff = 0; \
-    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
-      int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
-                         static_cast<int>(dst_argb_opt[i])); \
-      if (abs_diff > max_diff) { \
-        max_diff = abs_diff; \
-      } \
-    } \
-    EXPECT_LE(max_diff, DIFF); \
-    free_aligned_buffer_page_end(src_y); \
-    free_aligned_buffer_page_end(src_u); \
-    free_aligned_buffer_page_end(src_v); \
-    free_aligned_buffer_page_end(dst_argb_c); \
-    free_aligned_buffer_page_end(dst_argb_opt); \
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+                         ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF, \
+                         FMT_C, BPP_C) \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+    const int kBpc = 2; \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
+    for (int i = 0; i < kWidth * kHeight; ++i) { \
+      reinterpret_cast<uint16*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \
+    } \
+    for (int i = 0; i < kSizeUV; ++i) { \
+      reinterpret_cast<uint16*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \
+      reinterpret_cast<uint16*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \
+    } \
+    memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+    memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+    MaskCpuFlags(disable_cpu_flags_); \
+    FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y + SOFF), kWidth, \
+                          reinterpret_cast<uint16*>(src_u + SOFF), kStrideUV, \
+                          reinterpret_cast<uint16*>(src_v + SOFF), kStrideUV, \
+                          dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
+    MaskCpuFlags(benchmark_cpu_info_); \
+    for (int i = 0; i < benchmark_iterations_; ++i) { \
+      FMT_PLANAR##To##FMT_B( \
+          reinterpret_cast<uint16*>(src_y + SOFF), kWidth, \
+          reinterpret_cast<uint16*>(src_u + SOFF), kStrideUV, \
+          reinterpret_cast<uint16*>(src_v + SOFF), kStrideUV, \
+          dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
+    } \
+    int max_diff = 0; \
+    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+      int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
+                         static_cast<int>(dst_argb_opt[i + DOFF])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+    EXPECT_LE(max_diff, DIFF); \
+    free_aligned_buffer_page_end(src_y); \
+    free_aligned_buffer_page_end(src_u); \
+    free_aligned_buffer_page_end(src_v); \
+    free_aligned_buffer_page_end(dst_argb_c); \
+    free_aligned_buffer_page_end(dst_argb_opt); \
}
#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, DIFF, FMT_C, BPP_C) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
-                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+                   YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0, FMT_C, \
+                   BPP_C)
TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4)
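
The four instantiations each target a different path: _Opt runs the
full benchmark width, _Any shaves 4 pixels off the width so the _Any_
remainder wrappers execute, _Unaligned shifts source and destination by
one byte (SOFF/DOFF = 1) so unaligned loads and stores are exercised,
and _Invert passes a negative height to test bottom-up images. Roughly,
the _Unaligned case amounts to the following (buffer names
hypothetical):

  // SOFF/DOFF = 1 push the buffers off their natural alignment:
  uint16* src = reinterpret_cast<uint16*>(src_base + 1);  // misaligned shorts
  uint8* dst = dst_base + 1;                              // misaligned bytes
  // Kernels built on movdqu (unaligned moves) must still match the C path.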
@@ -2720,10 +2720,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
dst_pixels_y_c, 16384, kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
for (int i = 0; i < benchmark_iterations_; ++i) {
if (has_avx2) {
Convert16To8Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
dst_pixels_y_opt, 16384, kPixels);
+    } else if (has_ssse3) {
+      Convert16To8Row_SSSE3(reinterpret_cast<const uint16*>(src_pixels_y),
+                            dst_pixels_y_opt, 16384, kPixels);
} else {
Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y),
dst_pixels_y_opt, 16384, kPixels);