Commit a0c32b9e authored by Frank Barchard's avatar Frank Barchard Committed by Frank Barchard

MergeUV10Row_AVX2 for converting H010 to P010

H010 is 10 bit planar format with 10 bits in lower bits.
P010 is 10 bit biplanar format with 10 bits in upper bits.
This function weaves the U and V channels and shifts the bits
into the upper bits.

Bug: libyuv:751
Test: LibYUVPlanarTest.MergeUV10Row_Opt
Change-Id: I4a0bac0ef1ff95aa1b8d68261ec8e8e86f2d1fbf
Reviewed-on: https://chromium-review.googlesource.com/752692
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 75ec56b5
......@@ -271,7 +271,7 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available forr gcc/clang x86 platforms:
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
......@@ -279,6 +279,14 @@ extern "C" {
#define HAS_SPLITRGBROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_MERGEUV10ROW_AVX2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
......@@ -1523,6 +1531,15 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
uint8* dst_rgb,
int width);
// Weave planar 10-bit U and V samples into one interleaved UV plane,
// shifting each sample left by 6 so the 10 significant bits land in the
// upper bits of each 16-bit output (H010 -> P010 layout). C reference.
void MergeUV10Row_C(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int width);
// AVX2 version of MergeUV10Row_C. Processes 16 pixels per iteration;
// appears to require width to be a multiple of 16 -- confirm with caller
// before using on arbitrary widths.
void MergeUV10Row_AVX2(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int width);
// Plain byte-copy rows: SSE2/AVX vector copies and ERMS (rep movsb).
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
......
......@@ -1798,6 +1798,24 @@ void MergeRGBRow_C(const uint8* src_r,
}
}
// Weave planar 10-bit U and V samples into an interleaved UV plane,
// shifting each value left by 6 so the 10 significant bits occupy the
// upper bits of each 16-bit output (H010 -> P010).
void MergeUV10Row_C(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int width) {
  int i = 0;
  // Two pixels per pass: emit U,V,U,V for pixels i and i+1.
  while (i + 1 < width) {
    *dst_uv++ = src_u[i] << 6;
    *dst_uv++ = src_v[i] << 6;
    *dst_uv++ = src_u[i + 1] << 6;
    *dst_uv++ = src_v[i + 1] << 6;
    i += 2;
  }
  // Odd width: one trailing U/V pair remains.
  if (width & 1) {
    dst_uv[0] = src_u[width - 1] << 6;
    dst_uv[1] = src_v[width - 1] << 6;
  }
}
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
......
......@@ -2753,6 +2753,48 @@ void MergeUVRow_SSE2(const uint8* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUV10ROW_AVX2
// AVX2 weave of planar 10-bit U and V into interleaved UV, shifting each
// sample into the upper bits of a 16-bit lane (H010 -> P010). Processes 16
// pixels (32 bytes of U and of V, 64 bytes of UV) per loop iteration.
// NOTE(review): the loop counts down by 16 with no remainder handling, so
// width presumably must be a multiple of 16 -- confirm at call sites.
void MergeUV10Row_AVX2(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int width) {
asm volatile (
// %1 becomes (src_v - src_u) so a single advancing pointer (%0) addresses
// both planes: (%0) is U, (%0,%1,1) is V.
"sub %0,%1 \n"
LABELALIGN
"1: \n"
// Load 16 U and 16 V 16-bit samples.
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu (%0,%1,1),%%ymm1 \n"
"add $0x20,%0 \n"
// Shift 10-bit samples into the top of each 16-bit lane.
"vpsllw $0x6,%%ymm0,%%ymm0 \n"
"vpsllw $0x6,%%ymm1,%%ymm1 \n"
// Alternative considered: pre-permute with vpermq, then plain 32-byte
// stores (kept here for reference; the vextractf128 path below is used
// instead).
// "vpermq $0xd8,%%ymm0,%%ymm0 \n"
// "vpermq $0xd8,%%ymm1,%%ymm1 \n"
// vpunpck{l,h}wd interleave U and V words within each 128-bit lane, so
// ymm2/ymm0 hold the output in lane-swizzled order.
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
// "vmovdqu %%ymm2, (%2) \n"
// "vmovdqu %%ymm0, 0x20(%2) \n"
// Store the four 128-bit lanes in corrected order (low2, low0, high2,
// high0), undoing the in-lane interleave without a vpermq.
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
"add $0x40,%2 \n"
// 16 pixels consumed per iteration.
"sub $0x10,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_MERGEUV10ROW_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
......
......@@ -2617,6 +2617,48 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUV10ROW_AVX2
// Check MergeUV10Row_AVX2 against the C reference (and benchmark it);
// falls back to timing the C path when AVX2 is unavailable.
TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_u, kPixels * 2);
align_buffer_page_end(src_pixels_v, kPixels * 2);
align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2);
MemRandomize(src_pixels_u, kPixels * 2);
MemRandomize(src_pixels_v, kPixels * 2);
// Different fills so a row function that writes nothing cannot pass.
memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
const uint16* src_u = reinterpret_cast<const uint16*>(src_pixels_u);
const uint16* src_v = reinterpret_cast<const uint16*>(src_pixels_v);
MergeUV10Row_C(src_u, src_v, reinterpret_cast<uint16*>(dst_pixels_uv_c),
               kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
uint16* dst_opt = reinterpret_cast<uint16*>(dst_pixels_uv_opt);
for (int iter = 0; iter < benchmark_iterations_; ++iter) {
  if (has_avx2) {
    MergeUV10Row_AVX2(src_u, src_v, dst_opt, kPixels);
  } else {
    MergeUV10Row_C(src_u, src_v, dst_opt, kPixels);
  }
}
// Byte-wise compare of the interleaved output.
for (int i = 0; i < kPixels * 2 * 2; ++i) {
  EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
}
free_aligned_buffer_page_end(src_pixels_u);
free_aligned_buffer_page_end(src_pixels_v);
free_aligned_buffer_page_end(dst_pixels_uv_opt);
free_aligned_buffer_page_end(dst_pixels_uv_c);
}
#endif
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment