Commit 8676ad70 authored by Frank Barchard

scale float samples and return max value

BUG=libyuv:717
TEST=ScaleSum unittest to compare C vs Arm implementation
TBR=kjellander@chromium.org

Change-Id: Iaa7af5547d979aad4722f868d31b405340115748
Reviewed-on: https://chromium-review.googlesource.com/600534
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 27036e33
README.chromium
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1663
+Version: 1664
License: BSD
License File: LICENSE
include/libyuv/row.h
@@ -359,6 +359,11 @@ extern "C" {
#define HAS_SOBELYROW_NEON
#endif
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
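The new HAS_SCALESUMSAMPLES_NEON define follows libyuv's existing HAS_* convention: a compile-time flag that callers test before selecting a SIMD kernel. A minimal dispatch sketch, assuming libyuv/row.h is included (the helper name is hypothetical, not part of this commit):
// Hypothetical dispatcher: prefer the NEON kernel when it was compiled in,
// fall back to the portable C version otherwise.
static float ScaleSumSamples(const float* src, float* dst, float scale,
                             int width) {
#if defined(HAS_SCALESUMSAMPLES_NEON)
  return ScaleSumSamples_NEON(src, dst, scale, width);
#else
  return ScaleSumSamples_C(src, dst, scale, width);
#endif
}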
@@ -3152,6 +3157,14 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
const uint8* luma,
uint32 lumacoeff);
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width);
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1663
+#define LIBYUV_VERSION 1664
#endif // INCLUDE_LIBYUV_VERSION_H_
source/compare_neon.cc
@@ -26,7 +26,7 @@ extern "C" {
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
-asm volatile (
+asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
@@ -46,10 +46,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
return diff;
@@ -57,7 +54,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
-asm volatile (
+asm volatile(
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
@@ -81,10 +78,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
source/compare_neon64.cc
@@ -24,7 +24,7 @@ extern "C" {
// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
-asm volatile (
+asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
@@ -41,10 +41,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
return diff;
@@ -52,7 +49,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
-asm volatile (
+asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
@@ -75,10 +72,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
source/rotate_neon64.cc
@@ -30,7 +30,7 @@ void TransposeWx8_NEON(const uint8* src,
int dst_stride,
int width) {
const uint8* src_temp;
-asm volatile (
+asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
@@ -193,8 +193,7 @@ void TransposeWx8_NEON(const uint8* src,
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
"v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
static uint8 kVTbl4x4TransposeDi[32] = {
@@ -209,7 +208,7 @@ void TransposeUVWx8_NEON(const uint8* src,
int dst_stride_b,
int width) {
const uint8* src_temp;
-asm volatile (
+asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
@@ -278,8 +277,10 @@ void TransposeUVWx8_NEON(const uint8* src,
"st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
"subs %w4, %w4, #8 \n" // w -= 8
"b.ge 1b \n"
@@ -342,8 +343,10 @@ void TransposeUVWx8_NEON(const uint8* src,
"st1 {v19.s}[3], [%0] \n"
"add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 *
// dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 *
// dst_stride_b
"subs %w4, %w4, #4 \n" // w -= 4
"b.eq 4f \n"
@@ -380,8 +383,10 @@ void TransposeUVWx8_NEON(const uint8* src,
"st1 {v7.d}[0], [%0] \n"
"add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 *
// dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 *
// dst_stride_b
"subs %w4, %w4, #2 \n" // w -= 2
"b.eq 4f \n"
@@ -410,11 +415,8 @@ void TransposeUVWx8_NEON(const uint8* src,
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v30", "v31"
);
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
source/row_common.cc
@@ -2639,6 +2639,25 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
}
#endif
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fmax = 0.f;
int i;
for (i = 0; i < width; ++i) {
float v = *src++ * scale;
*dst++ = v;
fmax = (v > fmax) ? v : fmax;
}
return fmax;
}
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
int i;
for (i = 0; i < width; ++i) {
float v = *src++ * scale;
*dst++ = v;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
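Note that despite the "Sum" in its name, ScaleSumSamples_C returns the maximum of the scaled samples, and the running max starts at 0.f, so an all-negative input yields 0.f. A small usage sketch, assuming libyuv/row.h is included (names and values are illustrative):
// Illustrative: scale 8 floats by 0.5 and capture the post-scale peak.
void ExampleScaleSumSamples() {
  float samples[8] = {1.f, -2.f, 3.f, -4.f, 5.f, -6.f, 7.f, -8.f};
  float scaled[8];
  float peak = ScaleSumSamples_C(samples, scaled, 0.5f, 8);
  // scaled = {0.5, -1, 1.5, -2, 2.5, -3, 3.5, -4}; peak == 3.5f.
  // Negative values never raise the max because it starts at 0.f.
  (void)peak;
}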
source/row_neon64.cc
@@ -2612,6 +2612,53 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
: "cc", "memory", "v1", "v2", "v3");
}
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width) {
float fmax;
asm volatile(
"movi v3.4s, #0 \n" // max
"movi v4.4s, #0 \n" // max
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %4.s[0] \n" // scale
"fmul v2.4s, v2.4s, %4.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"fmax v3.4s, v3.4s, v1.4s \n" // max
"fmax v4.4s, v4.4s, v2.4s \n"
"b.gt 1b \n"
"fmax v3.4s, v3.4s, v4.4s \n" // max
"fmaxv %s3, v3.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
"=w"(fmax) // %3
: "w"(scale) // %4
: "cc", "memory", "v1", "v2", "v3", "v4");
return fmax;
}
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale) // %3
: "cc", "memory", "v1", "v2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
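Both NEON kernels above consume 8 floats per iteration and have no scalar tail, so they assume width is a multiple of 8 (the unit test's default benchmark dimensions satisfy this). A remainder-safe caller would need to finish odd tails in C; a sketch under that assumption (the wrapper name is hypothetical, not part of this commit):
// Hypothetical wrapper: NEON on the multiple-of-8 body, C on the tail,
// then merge the two running maxima.
static float ScaleSumSamplesAny(const float* src, float* dst, float scale,
                                int width) {
  float fmax = 0.f;
  int body = width & ~7;  // largest multiple of 8 <= width
  if (body > 0) {
    fmax = ScaleSumSamples_NEON(src, dst, scale, body);
  }
  if (width > body) {
    float tail =
        ScaleSumSamples_C(src + body, dst + body, scale, width - body);
    fmax = (tail > fmax) ? tail : fmax;
  }
  return fmax;
}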
This diff is collapsed.
unit_test/planar_test.cc
@@ -11,6 +11,9 @@
#include <stdlib.h>
#include <time.h>
// row.h defines SIMD_ALIGNED, overriding unit_test.h
#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -2518,4 +2521,146 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
float TestScaleSumSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
float max_c, max_opt;
const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8* dst_opt = orig_y + y_plane_size;
uint8* dst_c = orig_y + y_plane_size * 2;
// Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size);
for (i = 0; i < y_plane_size / 4; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
}
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
max_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESUMSAMPLES_NEON
if (opt) {
max_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
} else {
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
}
#else
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
}
float max_diff = 0;
for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_page_end(orig_y);
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) {
float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, false);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) {
float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, true);
EXPECT_EQ(0, diff);
}
float TestScaleSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8* dst_opt = orig_y + y_plane_size;
uint8* dst_c = orig_y + y_plane_size * 2;
// Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size);
for (i = 0; i < y_plane_size / 4; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
}
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESAMPLES_NEON
if (opt) {
ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
} else {
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
}
#else
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
}
float max_diff = 0;
for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_page_end(orig_y);
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestScaleSamples_C) {
float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, false);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, true);
EXPECT_EQ(0, diff);
}
} // namespace libyuv
unit_test/unit_test.h
@@ -36,6 +36,9 @@ static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
static __inline float FAbs(float v) {
return v >= 0 ? v : -v;
}
#define OFFBY 0
// Scaling uses 16.16 fixed point to step thru the source image, so a
@@ -70,8 +73,11 @@ static inline bool SizeValid(int src_width,
uint8* var; \
uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
-var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & /* NOLINT */ \
-~4095) - (size)) & ~63);
+var = (uint8*)((intptr_t)(var##_mem + \
+(((size) + 4095 + 63) & /* NOLINT */ \
+~4095) - \
+(size)) & \
+~63);
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \
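The reflowed align_buffer_page_end arithmetic is unchanged: the allocation is rounded up to a whole number of 4 KB pages, and var is placed 64-byte aligned so that var + size falls within 63 bytes of the end of the allocation, which helps buffer overruns land near a page boundary. A standalone sketch of the same pointer math (illustrative, not part of the test harness):
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Reproduces the macro's arithmetic for size = 100 and prints the slack
// between var + size and the end of the page-rounded allocation (0..63).
int main() {
  const int size = 100;
  const size_t rounded = (size + 4095 + 63) & ~4095;  // 4096: one page
  uint8_t* mem = static_cast<uint8_t*>(malloc(rounded));
  uint8_t* var = reinterpret_cast<uint8_t*>(
      reinterpret_cast<intptr_t>(mem + rounded - size) & ~63);
  printf("slack after var+size: %ld bytes\n",
         static_cast<long>((mem + rounded) - (var + size)));
  free(mem);
  return 0;
}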