Commit 681c6c67 authored by Frank Barchard, committed by Commit Bot

Add LIBYUV_API to NV12ToABGR and I444Rotate, I444Scale

Gaussian blur low levels ported to 32 bit neon.
But they are not hooked up to anything but a unittest.

Bug:b/248041731, b/132108021, b/129908793
Change-Id: Iccebb8ffd6b719810aa11dd770a525227da4c357
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1611206
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Chong Zhang <chz@google.com>
parent 05f72b86
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1727
Version: 1730
License: BSD
License File: LICENSE
......
......@@ -256,6 +256,7 @@ int NV21ToARGB(const uint8_t* src_y,
int height);
// Convert NV12 to ABGR.
LIBYUV_API
int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
......
......@@ -126,6 +126,25 @@ int I444Scale(const uint8_t* src_y,
int dst_height,
enum FilterMode filtering);
LIBYUV_API
int I444Scale_16(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
int src_width,
int src_height,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1727
#define LIBYUV_VERSION 1730
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -1793,8 +1793,9 @@ int NV21ToARGB(const uint8_t* src_y,
}
// Convert NV12 to ABGR.
// To output ABGR instead of ARGB swap the UV and use a mirrrored yuc matrix.
// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix.
// To swap the UV use NV12 instead of NV21.
LIBYUV_API
int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
......
......@@ -521,28 +521,19 @@ int I444Rotate(const uint8_t* src_y,
CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
case libyuv::kRotate90:
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y,
width, height);
RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u,
width, height);
RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v,
width, height);
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
case libyuv::kRotate270:
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y,
width, height);
RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u,
width, height);
RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v,
width, height);
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
case libyuv::kRotate180:
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y,
width, height);
RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u,
width, height);
RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v,
width, height);
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
default:
break;
......
......@@ -2685,6 +2685,84 @@ void ByteToFloatRow_NEON(const uint8_t* src,
: "cc", "memory", "q1", "q2", "q3");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_NEON(const uint16_t* src0,
const uint16_t* src1,
const uint16_t* src2,
const uint16_t* src3,
const uint16_t* src4,
uint32_t* dst,
int width) {
asm volatile(
"vmov.u16 d6, #4 \n" // constant 4
"vmov.u16 d7, #6 \n" // constant 6
"1: \n"
"vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
"vld1.16 {q2}, [%4]! \n"
"vaddl.u16 q0, d2, d4 \n" // * 1
"vaddl.u16 q1, d3, d5 \n" // * 1
"vld1.16 {q2}, [%1]! \n"
"vmlal.u16 q0, d4, d6 \n" // * 4
"vmlal.u16 q1, d5, d6 \n" // * 4
"vld1.16 {q2}, [%2]! \n"
"vmlal.u16 q0, d4, d7 \n" // * 6
"vmlal.u16 q1, d5, d7 \n" // * 6
"vld1.16 {q2}, [%3]! \n"
"vmlal.u16 q0, d4, d6 \n" // * 4
"vmlal.u16 q1, d5, d6 \n" // * 4
"subs %6, %6, #8 \n" // 8 processed per loop
"vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
"bgt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(src4), // %4
"+r"(dst), // %5
"+r"(width) // %6
:
: "cc", "memory", "q0", "q1", "q2", "q3");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src1 = src + 1;
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
"vmov.u32 q10, #4 \n" // constant 4
"vmov.u32 q11, #6 \n" // constant 6
"1: \n"
"vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
"vld1.32 {q2}, [%0] \n"
"vadd.u32 q0, q0, q1 \n" // * 1
"vadd.u32 q1, q1, q2 \n" // * 1
"vld1.32 {q2, q3}, [%2]! \n"
"vmla.u32 q0, q2, q11 \n" // * 6
"vmla.u32 q1, q3, q11 \n" // * 6
"vld1.32 {q2, q3}, [%1]! \n"
"vld1.32 {q8, q9}, [%3]! \n"
"vadd.u32 q2, q2, q8 \n" // add rows for * 4
"vadd.u32 q3, q3, q9 \n"
"vmla.u32 q0, q2, q10 \n" // * 4
"vmla.u32 q1, q3, q10 \n" // * 4
"subs %5, %5, #8 \n" // 8 processed per loop
"vqshrn.u32 d0, q0, #8 \n" // round and pack
"vqshrn.u32 d1, q1, #8 \n"
"vst1.u16 {q0}, [%4]! \n" // store 8 samples
"bgt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
......
......@@ -1824,6 +1824,39 @@ int I444Scale(const uint8_t* src_y,
return 0;
}
LIBYUV_API
int I444Scale_16(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
int src_width,
int src_height,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering) {
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
dst_width, dst_height, filtering);
ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
dst_width, dst_height, filtering);
ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
dst_width, dst_height, filtering);
return 0;
}
// Deprecated api
LIBYUV_API
int Scale(const uint8_t* src_y,
......
......@@ -3186,7 +3186,8 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
}
GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
......@@ -3239,7 +3240,8 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
&orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
640);
for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
......
......@@ -135,6 +135,123 @@ TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
benchmark_cpu_info_);
}
static void I444TestRotate(int src_width,
int src_height,
int dst_width,
int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
if (src_width < 1) {
src_width = 1;
}
if (src_height == 0) {
src_height = 1;
}
if (dst_width < 1) {
dst_width = 1;
}
if (dst_height < 1) {
dst_height = 1;
}
int src_i444_y_size = src_width * Abs(src_height);
int src_i444_uv_size = src_width * Abs(src_height);
int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
align_buffer_page_end(src_i444, src_i444_size);
for (int i = 0; i < src_i444_size; ++i) {
src_i444[i] = fastrand() & 0xff;
}
int dst_i444_y_size = dst_width * dst_height;
int dst_i444_uv_size = dst_width * dst_height;
int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
align_buffer_page_end(dst_i444_c, dst_i444_size);
align_buffer_page_end(dst_i444_opt, dst_i444_size);
memset(dst_i444_c, 2, dst_i444_size);
memset(dst_i444_opt, 3, dst_i444_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
dst_width, src_width, src_height, mode);
}
// Rotation should be exact.
for (int i = 0; i < dst_i444_size; ++i) {
EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
}
free_aligned_buffer_page_end(dst_i444_c);
free_aligned_buffer_page_end(dst_i444_opt);
free_aligned_buffer_page_end(src_i444);
}
TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
// TODO(fbarchard): Remove odd width tests.
// Odd width tests work but disabled because they use C code and can be
// tested by passing an odd width command line or environment variable.
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
static void NV12TestRotate(int src_width,
int src_height,
int dst_width,
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment