HalfFloatPlane unittest for denormal half floats

Half floats have a limited range. It shouldn't normally come up, but if the scale value passed in produces a small value, the half floats will be denormals, which are slow and/or flushed to zero. This test ensures they behave the same in C and SIMD and tests the performance of denormals. TEST=TestHalfFloatPlane_denormal BUG=libyuv:560 R=hubbe@chromium.org Review URL: https://codereview.chromium.org/2424233004 .

HalfFloatPlane unittest for denormal half floats
Half floats have a limited range. It shouldn't normally come up, but if the scale value passed in produces a small value, the half floats will be denormals, which are slow and/or flushed to zero. This test ensures they behave the same in C and SIMD and tests the performance of denormals. TEST=TestHalfFloatPlane_denormal BUG=libyuv:560 R=hubbe@chromium.org Review URL: https://codereview.chromium.org/2424233004 .
f553db2d · Frank Barchard · 78c58ab8 · f553db2d · f553db2d · f553db2d
Commit f553db2d authored Oct 20, 2016 by Frank Barchard
8 changed files
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -122,6 +122,10 @@ static_library("libyuv") {
    # Enable optimize for speed (-O2) over size (-Os).
    configs += [ "//build/config/compiler:optimize_max" ]
  }
+  # To enable AVX2 or other cpu optimization, pass flag here
+  #  cflags = [ "-mavx2" ]
 }
 if (libyuv_use_neon) {
@@ -140,6 +144,14 @@ if (libyuv_use_neon) {
    public_configs = [ ":libyuv_config" ]
+    # Always enable optimization for Release and NaCl builds (to workaround
+    # crbug.com/538243).
+    if (!is_debug) {
+      configs -= [ "//build/config/compiler:default_optimization" ]
+      # Enable optimize for speed (-O2) over size (-Os).
+      configs += [ "//build/config/compiler:optimize_max" ]
+    }
    if (current_cpu != "arm64") {
      configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
      cflags = [ "-mfpu=neon" ]

--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1627
+Version: 1628
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -201,7 +201,7 @@ extern "C" {
 #define HAS_COPYROW_AVX
 #define HAS_H422TOARGBROW_AVX2
 #define HAS_HALFFLOATROW_AVX2
-// #define HAS_HALFFLOATROW_F16C  // Enable to test halffloat cast
+//  #define HAS_HALFFLOATROW_F16C  // Enable to test halffloat cast
 #define HAS_I400TOARGBROW_AVX2
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
@@ -330,6 +330,11 @@ extern "C" {
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
+// TODO(fbarchard): Port to 32 bit.
+#if defined(__aarch64__)
+#define HAS_HALFFLOATROW_NEON
+#endif
 // Effects:
 #define HAS_ARGBADDROW_NEON
 #define HAS_ARGBATTENUATEROW_NEON
@@ -1954,6 +1959,9 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
 void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
                           int width);
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale,
+                           int width);
 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                             const uint8* luma, uint32 lumacoeff);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1627
+#define LIBYUV_VERSION 1628
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2585,6 +2585,15 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
    }
  }
 #endif
+#if defined(HAS_HALFFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HalfFloatRow = HalfFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = HalfFloatRow_NEON;
+    }
+  }
+#endif
  for (y = 0; y < height; ++y) {
    HalfFloatRow(src_y, dst_y, scale, width);
    src_y += src_stride_y;

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -585,6 +585,9 @@ ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
 #ifdef HAS_HALFFLOATROW_F16C
 ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
 #endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7)
+#endif
 #undef ANY11P16
 // Any 1 to 1 with yuvconstants

--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2710,6 +2710,32 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
 }
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+    "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
+    "uxtl2      v1.4s, v1.8h                   \n"
+    "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+    "scvtf      v1.4s, v1.4s                   \n"
+    "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
+    "fmul       v1.4s, v1.4s, %3.s[0]          \n"
+    "uqshrn     v4.4h, v2.4s, #13              \n"  // isolate halffloat
+    "uqshrn2    v4.8h, v1.4s, #13              \n"
+   MEMACCESS(1)
+    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts
+    "b.gt       1b                             \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "w"(scale * 1.9259299444e-34f)    // %3
+  : "cc", "memory", "v1", "v2", "v4"
+  );
+}
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #ifdef __cplusplus

--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2081,9 +2081,12 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
  }
 }
-TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
+int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
+                       int benchmark_iterations,
+                       int disable_cpu_flags, int benchmark_cpu_info,
+                       float scale) {
  int i, j;
-  const int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
+  const int y_plane_size = benchmark_width * benchmark_height * 2;
  align_buffer_page_end(orig_y, y_plane_size);
  align_buffer_page_end(dst_c, y_plane_size);
@@ -2093,32 +2096,62 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
  memset(dst_opt, 1, y_plane_size);
  // Disable all optimizations.
-  MaskCpuFlags(disable_cpu_flags_);
+  MaskCpuFlags(disable_cpu_flags);
  double c_time = get_time();
-  for (j = 0; j < benchmark_iterations_; j++) {
+  for (j = 0; j < benchmark_iterations; j++) {
-    HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,
+    HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
-                   (uint16*)dst_c, benchmark_width_ * 2,
+                   (uint16*)dst_c, benchmark_width * 2,
-                   1.0f / 4096.0f, benchmark_width_, benchmark_height_);
+                   scale, benchmark_width, benchmark_height);
  }
-  c_time = (get_time() - c_time) / benchmark_iterations_;
+  c_time = (get_time() - c_time) / benchmark_iterations;
  // Enable optimizations.
-  MaskCpuFlags(benchmark_cpu_info_);
+  MaskCpuFlags(benchmark_cpu_info);
  double opt_time = get_time();
-  for (j = 0; j < benchmark_iterations_; j++) {
+  for (j = 0; j < benchmark_iterations; j++) {
-    HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,
+    HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
-                   (uint16*)dst_opt, benchmark_width_ * 2,
+                   (uint16*)dst_opt, benchmark_width * 2,
-                   1.0f / 4096.0f, benchmark_width_, benchmark_height_);
+                   scale, benchmark_width, benchmark_height);
  }
-  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  opt_time = (get_time() - opt_time) / benchmark_iterations;
+  int diff = 0;
  for (i = 0; i < y_plane_size; ++i) {
-    EXPECT_EQ(dst_c[i], dst_opt[i]);
+    diff = dst_c[i] - dst_opt[i];
+    if (diff) break;
  }
  free_aligned_buffer_page_end(orig_y);
  free_aligned_buffer_page_end(dst_c);
  free_aligned_buffer_page_end(dst_opt);
+  return diff;
+}
+// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
+// exponent to be less than 0.  15 - log2(65536) = -1.  This shouldn't normally
+// happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_denormal) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 65536.0f);
+  EXPECT_EQ(diff, 0);
+}
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 4096.0f);
+  EXPECT_EQ(diff, 0);
+}
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 1023.0f);
+  EXPECT_EQ(diff, 0);
 }
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {