Commit 83ca1abe authored by Frank Barchard

Change ScaleSumSamples to return Sum of Squares

TBR=kjellander@chromium.org
BUG=libyuv:717
TEST=LibYUVPlanarTest.TestScaleSumSamples_Opt

Change-Id: I5208666f3968c5c4b0f1b0c951f24216d78ee3fe
Reviewed-on: https://chromium-review.googlesource.com/607184
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 8676ad70
...@@ -158,10 +158,11 @@ static_library("libyuv_internal") { ...@@ -158,10 +158,11 @@ static_library("libyuv_internal") {
} }
# To enable AVX2 or other cpu optimization, pass flag here # To enable AVX2 or other cpu optimization, pass flag here
# cflags = [ "-mavx2" ] # cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
# cflags = [ "-mpopcnt" ] if (!is_win) {
cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON.
}
} }
if (libyuv_use_neon) { if (libyuv_use_neon) {
static_library("libyuv_neon") { static_library("libyuv_neon") {
sources = [ sources = [
......
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1664 Version: 1665
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1664 #define LIBYUV_VERSION 1665
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -2642,10 +2642,13 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, ...@@ -2642,10 +2642,13 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fmax = 0.f; float fmax = 0.f;
int i; int i;
#if defined(__clang__)
#pragma clang loop vectorize_width(4)
#endif
for (i = 0; i < width; ++i) { for (i = 0; i < width; ++i) {
float v = *src++ * scale; float v = *src++;
*dst++ = v; fmax += v * v;
fmax = (v > fmax) ? v : fmax; *dst++ = v * scale;
} }
return fmax; return fmax;
} }
...@@ -2653,8 +2656,7 @@ float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { ...@@ -2653,8 +2656,7 @@ float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
void ScaleSamples_C(const float* src, float* dst, float scale, int width) { void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
int i; int i;
for (i = 0; i < width; ++i) { for (i = 0; i < width; ++i) {
float v = *src++ * scale; *dst++ = *src++ * scale;
*dst++ = v;
} }
} }
......
...@@ -2616,30 +2616,33 @@ float ScaleSumSamples_NEON(const float* src, ...@@ -2616,30 +2616,33 @@ float ScaleSumSamples_NEON(const float* src,
float* dst, float* dst,
float scale, float scale,
int width) { int width) {
float fmax; float fsum;
asm volatile( asm volatile(
"movi v3.4s, #0 \n" // max "movi v5.4s, #0 \n" // max
"movi v4.4s, #0 \n" // max "movi v6.4s, #0 \n" // max
"1: \n" "1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %4.s[0] \n" // scale "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
"fmul v2.4s, v2.4s, %4.s[0] \n" // scale "fmul v4.4s, v2.4s, %4.s[0] \n"
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmax v3.4s, v3.4s, v1.4s \n" // max "fmla v6.4s, v2.4s, v2.4s \n"
"fmax v4.4s, v4.4s, v2.4s \n" "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
"fmax v3.4s, v3.4s, v4.4s \n" // max "faddp v5.4s, v5.4s, v6.4s \n"
"fmaxv %s3, v3.4s \n" // signed max acculator "faddp v5.4s, v5.4s, v5.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
"fmov %w3, s5 \n" // sum
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width), // %2 "+r"(width), // %2
"=w"(fmax) // %3 "=w"(fsum) // %3
: "w"(scale) // %4 : "w"(scale) // %4
: "cc", "memory", "v1", "v2", "v3", "v4"); : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
return fmax; return fsum;
} }
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
......
...@@ -2527,7 +2527,7 @@ float TestScaleSumSamples(int benchmark_width, ...@@ -2527,7 +2527,7 @@ float TestScaleSumSamples(int benchmark_width,
float scale, float scale,
bool opt) { bool opt) {
int i, j; int i, j;
float max_c, max_opt; float sum_c, sum_opt = 0.f;
const int y_plane_size = benchmark_width * benchmark_height * 4; const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3); align_buffer_page_end(orig_y, y_plane_size * 3);
...@@ -2542,32 +2542,29 @@ float TestScaleSumSamples(int benchmark_width, ...@@ -2542,32 +2542,29 @@ float TestScaleSumSamples(int benchmark_width,
memset(dst_c, 0, y_plane_size); memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size); memset(dst_opt, 1, y_plane_size);
// Disable all optimizations. sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
max_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale, reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height); benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESUMSAMPLES_NEON
if (opt) { if (opt) {
max_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y), #ifdef HAS_SCALESUMSAMPLES_NEON
sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale, reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height); benchmark_width * benchmark_height);
#else
sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
} else { } else {
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y), sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale, reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height); benchmark_width * benchmark_height);
} }
#else
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
} }
float max_diff = 0; float max_diff = FAbs(sum_opt - sum_c);
for (i = 0; i < y_plane_size / 4; ++i) { for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i])); (reinterpret_cast<float*>(dst_opt)[i]));
...@@ -2613,32 +2610,29 @@ float TestScaleSamples(int benchmark_width, ...@@ -2613,32 +2610,29 @@ float TestScaleSamples(int benchmark_width,
memset(dst_c, 0, y_plane_size); memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size); memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
ScaleSamples_C(reinterpret_cast<float*>(orig_y), ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale, reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height); benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESAMPLES_NEON
if (opt) { if (opt) {
max_opt = ScaleSamples_NEON(reinterpret_cast<float*>(orig_y), #ifdef HAS_SCALESUMSAMPLES_NEON
reinterpret_cast<float*>(dst_opt), scale, ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
benchmark_width * benchmark_height); reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#else
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
} else { } else {
ScaleSamples_C(reinterpret_cast<float*>(orig_y), ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale, reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height); benchmark_width * benchmark_height);
} }
#else
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
} }
float max_diff = 0; float max_diff =0.f;
for (i = 0; i < y_plane_size / 4; ++i) { for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i])); (reinterpret_cast<float*>(dst_opt)[i]));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment