Commit f553db2d authored by Frank Barchard's avatar Frank Barchard

HalfFloatPlane unittest for denormal half floats

Half floats have a limited range.  It shouldn't normally come up, but if the scale value passed in produces a small value, the half floats will be denormals, which are slow and/or flushed to zero.  This test ensures they behave the same in C and SIMD and tests the performance of denormals.

TEST=TestHalfFloatPlane_denormal
BUG=libyuv:560
R=hubbe@chromium.org

Review URL: https://codereview.chromium.org/2424233004 .
parent 78c58ab8
......@@ -122,6 +122,10 @@ static_library("libyuv") {
# Enable optimize for speed (-O2) over size (-Os).
configs += [ "//build/config/compiler:optimize_max" ]
}
# To enable AVX2 or other cpu optimization, pass flag here
# cflags = [ "-mavx2" ]
}
if (libyuv_use_neon) {
......@@ -140,6 +144,14 @@ if (libyuv_use_neon) {
public_configs = [ ":libyuv_config" ]
# Always enable optimization for Release and NaCl builds (to workaround
# crbug.com/538243).
if (!is_debug) {
configs -= [ "//build/config/compiler:default_optimization" ]
# Enable optimize for speed (-O2) over size (-Os).
configs += [ "//build/config/compiler:optimize_max" ]
}
if (current_cpu != "arm64") {
configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [ "-mfpu=neon" ]
......
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1627
Version: 1628
License: BSD
License File: LICENSE
......
......@@ -330,6 +330,11 @@ extern "C" {
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
// TODO(fbarchard): Port to 32 bit.
#if defined(__aarch64__)
#define HAS_HALFFLOATROW_NEON
#endif
// Effects:
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
......@@ -1954,6 +1959,9 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
int width);
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale,
int width);
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
const uint8* luma, uint32 lumacoeff);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1627
#define LIBYUV_VERSION 1628
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -2585,6 +2585,15 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_HALFFLOATROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
HalfFloatRow = HalfFloatRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
HalfFloatRow = HalfFloatRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
src_y += src_stride_y;
......
......@@ -585,6 +585,9 @@ ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
#ifdef HAS_HALFFLOATROW_F16C
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
#endif
#ifdef HAS_HALFFLOATROW_NEON
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7)
#endif
#undef ANY11P16
// Any 1 to 1 with yuvconstants
......
......@@ -2710,6 +2710,32 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Converts a row of 16-bit unsigned integers to 16-bit half floats, applying
// |scale| on the way (AArch64 NEON version).
//
//   src   - input row of uint16 values; read 16 bytes (8 values) at a time.
//   dst   - output row of uint16 half-float bit patterns; written 16 bytes at
//           a time.
//   scale - multiplier applied to every value before conversion.
//   width - number of values to convert.  The loop consumes 8 values per
//           iteration and repeats while the remaining count is > 0, so any
//           width that is not a multiple of 8 is rounded up to the next
//           multiple of 8 for both loads and stores.
//           NOTE(review): callers are presumably expected to pass a positive
//           multiple of 8 and use the _Any_ wrapper otherwise - confirm.
//
// Method: each value is widened to 32 bits, converted to float, multiplied by
// scale * 1.9259299444e-34f (that constant is 2^-112), and the resulting float
// bit pattern is shifted right by 13 and narrowed to 16 bits.
// NOTE(review): 112 = 127 - 15 and 13 = 23 - 10, so this looks like the usual
// "rebias exponent, drop extra mantissa bits" float->half trick - confirm.
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts, advance src by 16 bytes
"subs %w2, %w2, #8 \n" // 8 pixels per loop; sets flags used by b.gt below
"uxtl v2.4s, v1.4h \n" // zero-extend low 4 shorts to 32-bit ints
"uxtl2 v1.4s, v1.8h \n" // zero-extend high 4 shorts (overwrites v1 in place)
"scvtf v2.4s, v2.4s \n" // 8 ints -> 8 floats
"scvtf v1.4s, v1.4s \n"
"fmul v2.4s, v2.4s, %3.s[0] \n" // multiply by pre-scaled factor (adjust exponent)
"fmul v1.4s, v1.4s, %3.s[0] \n"
"uqshrn v4.4h, v2.4s, #13 \n" // shift float bits right 13, saturating narrow to 16 bits (isolate halffloat)
"uqshrn2 v4.8h, v1.4s, #13 \n" // same for the high 4 lanes, into upper half of v4
MEMACCESS(1)
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts, advance dst by 16 bytes
"b.gt 1b \n" // loop while remaining width > 0
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3  scale * 2^-112, held in a SIMD/FP register
: "cc", "memory", "v1", "v2", "v4"
);
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
......
......@@ -2081,9 +2081,12 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
}
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info,
float scale) {
int i, j;
const int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
const int y_plane_size = benchmark_width * benchmark_height * 2;
align_buffer_page_end(orig_y, y_plane_size);
align_buffer_page_end(dst_c, y_plane_size);
......@@ -2093,32 +2096,62 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags_);
MaskCpuFlags(disable_cpu_flags);
double c_time = get_time();
for (j = 0; j < benchmark_iterations_; j++) {
HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,
(uint16*)dst_c, benchmark_width_ * 2,
1.0f / 4096.0f, benchmark_width_, benchmark_height_);
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
(uint16*)dst_c, benchmark_width * 2,
scale, benchmark_width, benchmark_height);
}
c_time = (get_time() - c_time) / benchmark_iterations_;
c_time = (get_time() - c_time) / benchmark_iterations;
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(benchmark_cpu_info);
double opt_time = get_time();
for (j = 0; j < benchmark_iterations_; j++) {
HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,
(uint16*)dst_opt, benchmark_width_ * 2,
1.0f / 4096.0f, benchmark_width_, benchmark_height_);
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
(uint16*)dst_opt, benchmark_width * 2,
scale, benchmark_width, benchmark_height);
}
opt_time = (get_time() - opt_time) / benchmark_iterations_;
opt_time = (get_time() - opt_time) / benchmark_iterations;
int diff = 0;
for (i = 0; i < y_plane_size; ++i) {
EXPECT_EQ(dst_c[i], dst_opt[i]);
diff = dst_c[i] - dst_opt[i];
if (diff) break;
}
free_aligned_buffer_page_end(orig_y);
free_aligned_buffer_page_end(dst_c);
free_aligned_buffer_page_end(dst_opt);
return diff;
}
// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
// exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally
// happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_denormal) {
  // A 1/65536 scale is used here to exercise the denormal half-float path.
  const float kScale = 1.0f / 65536.0f;
  // TestHalfFloatPlane returns 0 when the C and optimized outputs are
  // identical, otherwise the first non-zero byte difference it finds.
  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                benchmark_iterations_, disable_cpu_flags_,
                                benchmark_cpu_info_, kScale);
  EXPECT_EQ(diff, 0);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
  // Power-of-two scale of 1/4096 (i.e. 1 / (1 << 12)).
  const float kScale = 1.0f / 4096.0f;
  // TestHalfFloatPlane returns 0 when the C and optimized outputs are
  // identical, otherwise the first non-zero byte difference it finds.
  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                benchmark_iterations_, disable_cpu_flags_,
                                benchmark_cpu_info_, kScale);
  EXPECT_EQ(diff, 0);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
  // Non-power-of-two scale: 1/1023 rather than 1/1024.
  const float kScale = 1.0f / 1023.0f;
  // TestHalfFloatPlane returns 0 when the C and optimized outputs are
  // identical, otherwise the first non-zero byte difference it finds.
  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                benchmark_iterations_, disable_cpu_flags_,
                                benchmark_cpu_info_, kScale);
  EXPECT_EQ(diff, 0);
}
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment