GaussCol_NEON resample from short to int

Old NEON LibYUVPlanarTest.TestGaussCol_Opt (916 ms) New NEON LibYUVPlanarTest.TestGaussCol_Opt (520 ms) C vectorized LibYUVPlanarTest.TestGaussCol_Opt (739 ms) TBR=kjellander@chromium.org BUG=libyuv:719 TEST=LibYUVPlanarTest.TestGaussCol_Opt Change-Id: I863b66f700f7a71fcb08a2eabb03240fdaf8a238 Reviewed-on: https://chromium-review.googlesource.com/626938 Reviewed-by: Cheng Wang <wangcheng@google.com>

GaussCol_NEON resample from short to int
Old NEON LibYUVPlanarTest.TestGaussCol_Opt (916 ms) New NEON LibYUVPlanarTest.TestGaussCol_Opt (520 ms) C vectorized LibYUVPlanarTest.TestGaussCol_Opt (739 ms) TBR=kjellander@chromium.org BUG=libyuv:719 TEST=LibYUVPlanarTest.TestGaussCol_Opt Change-Id: I863b66f700f7a71fcb08a2eabb03240fdaf8a238 Reviewed-on: https://chromium-review.googlesource.com/626938 Reviewed-by: Cheng Wang <wangcheng@google.com>
1cc539f7 · Frank Barchard · c5bad809 · 1cc539f7 · 1cc539f7 · 1cc539f7
Commit 1cc539f7 authored Aug 22, 2017 by Frank Barchard
Hide whitespace changes
Inline Side-by-side

Showing with 90 additions and 64 deletions

row_common.cc source/row_common.cc +14 -0

row_neon64.cc source/row_neon64.cc +44 -40

planar_test.cc unit_test/planar_test.cc +32 -24

No files found.
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2672,6 +2672,20 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
  }
 }
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row of sums.
+void GaussCol_C(const uint16* src0,  // row weighted by 1
+                const uint16* src1,  // row weighted by 4
+                const uint16* src2,  // row weighted by 6
+                const uint16* src3,  // row weighted by 4
+                const uint16* src4,  // row weighted by 1
+                uint32* dst,         // receives one 32 bit sum per column
+                int width) {         // number of columns to process
+  int i;
+  for (i = 0; i < width; ++i) {  // one output per column, all rows in step
+    *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;  // stored unscaled: no shift or rounding
+  }
+}
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2699,6 +2699,50 @@ static vec16 kGauseCoefficients[4] = {
    {0, 0, 0, 1, 4, 6, 4, 1},
 };
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row of sums.
+void GaussCol_NEON(const uint16* src0,  // row weighted by 1
+                   const uint16* src1,  // row weighted by 4
+                   const uint16* src2,  // row weighted by 6
+                   const uint16* src3,  // row weighted by 4
+                   const uint16* src4,  // row weighted by 1
+                   uint32* dst,         // receives one 32 bit sum per column
+                   int width) {         // columns; consumed 8 per iteration
+  asm volatile(
+      "movi       v6.8h, #4                      \n"  // v6 = 4 in every 16 bit lane
+      "movi       v7.8h, #6                      \n"  // v7 = 6 in every 16 bit lane
+      "1:                                        \n"  // loop head
+      "ld1        {v1.8h}, [%0], #16             \n"  // load 8 samples from each of 5 rows
+      "ld1        {v2.8h}, [%1], #16             \n"  // (each pointer advances 16 bytes)
+      "ld1        {v3.8h}, [%2], #16             \n"
+      "ld1        {v4.8h}, [%3], #16             \n"
+      "ld1        {v5.8h}, [%4], #16             \n"
+      "subs       %w6, %w6, #8                   \n"  // width -= 8; sets flags for b.gt
+      "uaddl       v0.4s, v1.4h, v5.4h            \n"  // low 4: src0 + src4, widened to 32 bit
+      "uaddl2      v1.4s, v1.8h, v5.8h            \n"  // high 4: src0 + src4; reuses v1 as output
+      "umlal       v0.4s, v2.4h, v6.4h            \n"  // low 4:  += src1 * 4
+      "umlal2      v1.4s, v2.8h, v6.8h            \n"  // high 4: += src1 * 4
+      "umlal       v0.4s, v3.4h, v7.4h            \n"  // low 4:  += src2 * 6
+      "umlal2      v1.4s, v3.8h, v7.8h            \n"  // high 4: += src2 * 6
+      "umlal       v0.4s, v4.4h, v6.4h            \n"  // low 4:  += src3 * 4
+      "umlal2      v1.4s, v4.8h, v6.8h            \n"  // high 4: += src3 * 4
+      "st1        {v0.4s,v1.4s}, [%5], #32       \n"  // store 8 unscaled 32 bit sums
+      "b.gt       1b                             \n"  // repeat while width > 0
+      : "+r"(src0),  // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(src4),  // %4
+        "+r"(dst),   // %5
+        "+r"(width)  // %6
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");  // flags, memory and every vector register used
+}
 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
 void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
  asm volatile(
@@ -2736,46 +2780,6 @@ void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
        "v23");
 }
-// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussCol_NEON(const uint32* src0,
-                   const uint32* src1,
-                   const uint32* src2,
-                   const uint32* src3,
-                   const uint32* src4,
-                   uint16* dst,
-                   int width) {
-  asm volatile(
-      "movi       v5.4s, #4                      \n"  // constant 4
-      "movi       v6.4s, #6                      \n"  // constant 6
-      "1:                                        \n"
-      "ld1        {v0.4s}, [%0], #16             \n"  // load 4 samples, 5 rows
-      "ld1        {v1.4s}, [%1], #16             \n"
-      "ld1        {v2.4s}, [%2], #16             \n"
-      "ld1        {v3.4s}, [%3], #16             \n"
-      "ld1        {v4.4s}, [%4], #16             \n"
-      "subs       %w6, %w6, #4                   \n"  // 4 processed per loop
-      "add        v0.4s, v0.4s, v4.4s            \n"  // * 1
-      "mla        v0.4s, v1.4s, v5.4s            \n"  // * 4
-      "mla        v0.4s, v2.4s, v6.4s            \n"  // * 6
-      "mla        v0.4s, v3.4s, v5.4s            \n"  // * 4
-      "uqshrn     v0.4h, v0.4s, #8               \n"  // round, shift by 8 pack.
-      "st1        {v0.4h}, [%5], #8              \n"  // store 4 samples
-      "b.gt       1b                             \n"
-      : "+r"(src0),  // %0
-        "+r"(src1),  // %1
-        "+r"(src2),  // %2
-        "+r"(src3),  // %3
-        "+r"(src4),  // %4
-        "+r"(dst),   // %5
-        "+r"(width)  // %6
-      :
-      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
-}
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #ifdef __cplusplus

--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2742,6 +2742,7 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
    orig_pixels[i] = i;
  }
  GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280);
+  MaskCpuFlags(benchmark_cpu_info_);
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
    int has_neon = TestCpuFlag(kCpuHasNEON);
@@ -2760,22 +2761,29 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
  }
  EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1);
-  EXPECT_EQ(dst_pixels_c[1279],
+  EXPECT_EQ(dst_pixels_c[1279], 20496);
-            1279 * 1 + 1280 * 4 + 1281 * 6 + 1282 * 4 + 1283 * 1);
 }
-extern "C" void GaussCol_NEON(const uint32* src0,
+extern "C" void GaussCol_NEON(const uint16* src0,
-                              const uint32* src1,
+                              const uint16* src1,
-                              const uint32* src2,
+                              const uint16* src2,
-                              const uint32* src3,
+                              const uint16* src3,
-                              const uint32* src4,
+                              const uint16* src4,
-                              uint16* dst,
+                              uint32* dst,
                              int width);
+extern "C" void GaussCol_C(const uint16* src0,
+                           const uint16* src1,
+                           const uint16* src2,
+                           const uint16* src3,
+                           const uint16* src4,
+                           uint32* dst,
+                           int width);
 TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
-  SIMD_ALIGNED(uint32 orig_pixels[1280 * 5]);
+  SIMD_ALIGNED(uint16 orig_pixels[1280 * 5]);
-  SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
+  SIMD_ALIGNED(uint32 dst_pixels_c[1280]);
-  SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
+  SIMD_ALIGNED(uint32 dst_pixels_opt[1280]);
  memset(orig_pixels, 0, sizeof(orig_pixels));
  memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
@@ -2784,9 +2792,10 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
  for (int i = 0; i < 1280 * 5; ++i) {
    orig_pixels[i] = i;
  }
-  GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+  GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
-                &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+             &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
-                &dst_pixels_c[0], 1280);
+             1280);
+  MaskCpuFlags(benchmark_cpu_info_);
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
    int has_neon = TestCpuFlag(kCpuHasNEON);
@@ -2795,14 +2804,14 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
                    &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
                    &dst_pixels_opt[0], 1280);
    } else {
-      GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+      GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
-                    &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+                 &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
-                    &dst_pixels_opt[0], 1280);
+                 &dst_pixels_opt[0], 1280);
    }
 #else
-    GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+    GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
-                  &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+               &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
-                  &dst_pixels_opt[0], 1280);
+               &dst_pixels_opt[0], 1280);
 #endif
  }
@@ -2810,10 +2819,9 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }
-  EXPECT_EQ(dst_pixels_c[0], (0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 +
+  EXPECT_EQ(dst_pixels_c[0],
-                              1280 * 4 * 1 + 128) /
+            0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 + 1280 * 4 * 1);
-                                 256);
+  EXPECT_EQ(dst_pixels_c[1279], 61424);
-  EXPECT_EQ(dst_pixels_c[1279], 239);
 }
 #endif  // aarch64