Commit 0b0a891c authored by Frank Barchard

Change TestScaleSumSamples_C test to allow for some float error in sum.

The sum of floats can be optimized differently under vectorization, producing
a slightly different result between the NEON and C versions.
Adjust the unit test to allow for some difference in the sum.

The NEON version processes 8 samples at a time, so the test now rounds up
the number of values to a multiple of 8.

TBR=kjellander@chromium.org
Bug: libyuv:717
Test: LibYUVPlanarTest.TestScaleSumSamples_Opt


Change-Id: I2a0783780c7e0f240f7a8e4700b2a4d3e6b52d87
Reviewed-on: https://chromium-review.googlesource.com/673708
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent efbf1575
...@@ -160,7 +160,7 @@ static_library("libyuv_internal") { ...@@ -160,7 +160,7 @@ static_library("libyuv_internal") {
# To enable AVX2 or other cpu optimization, pass flag here # To enable AVX2 or other cpu optimization, pass flag here
# cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ] # cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
if (!is_win) { if (!is_win) {
cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON. cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON.
} }
} }
if (libyuv_use_neon) { if (libyuv_use_neon) {
...@@ -185,6 +185,7 @@ if (libyuv_use_neon) { ...@@ -185,6 +185,7 @@ if (libyuv_use_neon) {
configs -= [ "//build/config/compiler:default_optimization" ] configs -= [ "//build/config/compiler:default_optimization" ]
# Enable optimize for speed (-O2) over size (-Os). # Enable optimize for speed (-O2) over size (-Os).
# TODO(fbarchard): Consider optimize_speed which is O3.
configs += [ "//build/config/compiler:optimize_max" ] configs += [ "//build/config/compiler:optimize_max" ]
} }
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include <math.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
...@@ -2623,44 +2624,44 @@ float TestScaleMaxSamples(int benchmark_width, ...@@ -2623,44 +2624,44 @@ float TestScaleMaxSamples(int benchmark_width,
bool opt) { bool opt) {
int i, j; int i, j;
float max_c, max_opt = 0.f; float max_c, max_opt = 0.f;
const int y_plane_size = benchmark_width * benchmark_height * 4; // NEON does multiple of 8, so round count up
const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
align_buffer_page_end(orig_y, y_plane_size * 3); align_buffer_page_end(orig_y, kPixels * 4 * 3);
uint8* dst_opt = orig_y + y_plane_size; uint8* dst_c = orig_y + kPixels * 4;
uint8* dst_c = orig_y + y_plane_size * 2; uint8* dst_opt = orig_y + kPixels * 4 * 2;
// Randomize works but may contain some denormals affecting performance. // Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size); // MemRandomize(orig_y, kPixels * 4);
for (i = 0; i < y_plane_size / 4; ++i) { // large values are problematic. audio is really -1 to 1.
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; for (i = 0; i < kPixels; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
} }
memset(dst_c, 0, y_plane_size); memset(dst_c, 0, kPixels * 4);
memset(dst_opt, 1, y_plane_size); memset(dst_opt, 1, kPixels * 4);
max_c = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), max_c = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale, reinterpret_cast<float*>(dst_c), scale, kPixels);
benchmark_width * benchmark_height);
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
if (opt) { if (opt) {
#ifdef HAS_SCALESUMSAMPLES_NEON #ifdef HAS_SCALESUMSAMPLES_NEON
max_opt = ScaleMaxSamples_NEON(reinterpret_cast<float*>(orig_y), max_opt = ScaleMaxSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale, reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height); kPixels);
#else #else
max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), max_opt =
reinterpret_cast<float*>(dst_opt), scale, ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
benchmark_width * benchmark_height); reinterpret_cast<float*>(dst_opt), scale, kPixels);
#endif #endif
} else { } else {
max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), max_opt =
reinterpret_cast<float*>(dst_opt), scale, ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
benchmark_width * benchmark_height); reinterpret_cast<float*>(dst_opt), scale, kPixels);
} }
} }
float max_diff = FAbs(max_opt - max_c); float max_diff = FAbs(max_opt - max_c);
for (i = 0; i < y_plane_size / 4; ++i) { for (i = 0; i < kPixels; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i])); (reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) { if (abs_diff > max_diff) {
...@@ -2691,44 +2692,55 @@ float TestScaleSumSamples(int benchmark_width, ...@@ -2691,44 +2692,55 @@ float TestScaleSumSamples(int benchmark_width,
bool opt) { bool opt) {
int i, j; int i, j;
float sum_c, sum_opt = 0.f; float sum_c, sum_opt = 0.f;
const int y_plane_size = benchmark_width * benchmark_height * 4; // NEON does multiple of 8, so round count up
const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
align_buffer_page_end(orig_y, y_plane_size * 3); align_buffer_page_end(orig_y, kPixels * 4 * 3);
uint8* dst_opt = orig_y + y_plane_size; uint8* dst_c = orig_y + kPixels * 4;
uint8* dst_c = orig_y + y_plane_size * 2; uint8* dst_opt = orig_y + kPixels * 4 * 2;
// Randomize works but may contain some denormals affecting performance. // Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size); // MemRandomize(orig_y, kPixels * 4);
for (i = 0; i < y_plane_size / 4; ++i) { // large values are problematic. audio is really -1 to 1.
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; for (i = 0; i < kPixels; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
} }
memset(dst_c, 0, y_plane_size); memset(dst_c, 0, kPixels * 4);
memset(dst_opt, 1, y_plane_size); memset(dst_opt, 1, kPixels * 4);
sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale, reinterpret_cast<float*>(dst_c), scale, kPixels);
benchmark_width * benchmark_height);
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
if (opt) { if (opt) {
#ifdef HAS_SCALESUMSAMPLES_NEON #ifdef HAS_SCALESUMSAMPLES_NEON
sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y), sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale, reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height); kPixels);
#else #else
sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), sum_opt =
reinterpret_cast<float*>(dst_opt), scale, ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
benchmark_width * benchmark_height); reinterpret_cast<float*>(dst_opt), scale, kPixels);
#endif #endif
} else { } else {
sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), sum_opt =
reinterpret_cast<float*>(dst_opt), scale, ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
benchmark_width * benchmark_height); reinterpret_cast<float*>(dst_opt), scale, kPixels);
} }
} }
float max_diff = FAbs(sum_opt - sum_c); float mse_opt = sum_opt / kPixels * 4;
for (i = 0; i < y_plane_size / 4; ++i) { float mse_c = sum_c / kPixels * 4;
float mse_error = FAbs(mse_opt - mse_c) / mse_c;
// If the sum of a float is more than 4 million, small adds are round down on
// float and produce different results with vectorized sum vs scalar sum.
// Ignore the difference if the sum is large.
float max_diff = 0.f;
if (mse_error > 0.0001 && sum_c < 4000000) { // allow .01% difference of mse
max_diff = mse_error;
}
for (i = 0; i < kPixels; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i])); (reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) { if (abs_diff > max_diff) {
...@@ -2758,45 +2770,41 @@ float TestScaleSamples(int benchmark_width, ...@@ -2758,45 +2770,41 @@ float TestScaleSamples(int benchmark_width,
float scale, float scale,
bool opt) { bool opt) {
int i, j; int i, j;
const int y_plane_size = benchmark_width * benchmark_height * 4; // NEON does multiple of 8, so round count up
const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
align_buffer_page_end(orig_y, y_plane_size * 3); align_buffer_page_end(orig_y, kPixels * 4 * 3);
uint8* dst_opt = orig_y + y_plane_size; uint8* dst_c = orig_y + kPixels * 4;
uint8* dst_c = orig_y + y_plane_size * 2; uint8* dst_opt = orig_y + kPixels * 4 * 2;
// Randomize works but may contain some denormals affecting performance. // Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size); // MemRandomize(orig_y, kPixels * 4);
for (i = 0; i < y_plane_size / 4; ++i) { // large values are problematic. audio is really -1 to 1.
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; for (i = 0; i < kPixels; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
} }
memset(dst_c, 0, kPixels * 4);
memset(dst_c, 0, y_plane_size); memset(dst_opt, 1, kPixels * 4);
memset(dst_opt, 1, y_plane_size);
ScaleSamples_C(reinterpret_cast<float*>(orig_y), ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale, reinterpret_cast<float*>(dst_c), scale, kPixels);
benchmark_width * benchmark_height);
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
if (opt) { if (opt) {
#ifdef HAS_SCALESUMSAMPLES_NEON #ifdef HAS_SCALESUMSAMPLES_NEON
ScaleSamples_NEON(reinterpret_cast<float*>(orig_y), ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale, reinterpret_cast<float*>(dst_opt), scale, kPixels);
benchmark_width * benchmark_height);
#else #else
ScaleSamples_C(reinterpret_cast<float*>(orig_y), ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale, reinterpret_cast<float*>(dst_opt), scale, kPixels);
benchmark_width * benchmark_height);
#endif #endif
} else { } else {
ScaleSamples_C(reinterpret_cast<float*>(orig_y), ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale, reinterpret_cast<float*>(dst_opt), scale, kPixels);
benchmark_width * benchmark_height);
} }
} }
float max_diff = 0.f; float max_diff = 0.f;
for (i = 0; i < y_plane_size / 4; ++i) { for (i = 0; i < kPixels; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i])); (reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) { if (abs_diff > max_diff) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment