Commit 8676ad70 authored by Frank Barchard

scale float samples and return max value

BUG=libyuv:717
TEST=ScaleSum unittest to compare C vs Arm implementation
TBR=kjellander@chromium.org

Change-Id: Iaa7af5547d979aad4722f868d31b405340115748
Reviewed-on: https://chromium-review.googlesource.com/600534
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 27036e33
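
The change adds float-plane helpers to row.h: ScaleSamples multiplies each float sample by a scale factor, and ScaleSumSamples does the same while also returning the largest scaled value seen. A minimal usage sketch of the C reference path (the wrapper function, buffer names, and scale factor below are illustrative, not part of libyuv):

#include "libyuv/row.h"  // declares ScaleSumSamples_C / ScaleSumSamples_NEON

// Scale `width` float samples from src into dst and return the peak value.
// The NEON variant shares this signature and processes 8 samples per loop,
// so it is presumably intended for widths that are a multiple of 8.
static float ScaleAndFindPeak(const float* src, float* dst, int width) {
  const float kScale = 1.0f / 32768.0f;  // illustrative scale factor
  return libyuv::ScaleSumSamples_C(src, dst, kScale, width);
}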
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1663
Version: 1664
License: BSD
License File: LICENSE
@@ -359,6 +359,11 @@ extern "C" {
#define HAS_SOBELYROW_NEON
#endif
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
@@ -3152,6 +3157,14 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
const uint8* luma,
uint32 lumacoeff);
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width);
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
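
The NEON declarations above are only implemented when HAS_SCALESUMSAMPLES_NEON is defined (AArch64 with NEON enabled); the planar unit test further below selects between the two paths with an #ifdef. A hypothetical dispatch helper following the same pattern, not part of this change, might look like:

// Prefer the AArch64 NEON path when it was compiled in, otherwise fall back
// to the portable C implementation (mirrors the #ifdef in planar_test.cc).
static float ScaleSumSamplesDispatch(const float* src, float* dst, float scale,
                                     int width) {
#ifdef HAS_SCALESUMSAMPLES_NEON
  return ScaleSumSamples_NEON(src, dst, scale, width);
#else
  return ScaleSumSamples_C(src, dst, scale, width);
#endif
}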
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1663
#define LIBYUV_VERSION 1664
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -26,67 +26,61 @@ extern "C" {
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"vmov.u16 q4, #0 \n" // accumulator
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
asm volatile(
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}
@@ -24,63 +24,57 @@ extern "C" {
// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"movi v4.8h, #0 \n"
asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
@@ -172,7 +172,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
@@ -192,8 +192,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
@@ -2639,6 +2639,25 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
}
#endif
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fmax = 0.f;
int i;
for (i = 0; i < width; ++i) {
float v = *src++ * scale;
*dst++ = v;
fmax = (v > fmax) ? v : fmax;
}
return fmax;
}
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
int i;
for (i = 0; i < width; ++i) {
float v = *src++ * scale;
*dst++ = v;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
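
A small worked example of the C reference behavior (input values chosen purely for illustration): scaling {1.0f, -3.0f, 2.0f} by 2.0f writes {2.0f, -6.0f, 4.0f} and returns 4.0f. Note that fmax starts at 0.f, so an input whose scaled samples are all negative reports a maximum of 0.f rather than its least-negative sample.

float src[3] = {1.0f, -3.0f, 2.0f};
float dst[3] = {0.f, 0.f, 0.f};
float peak = ScaleSumSamples_C(src, dst, 2.0f, 3);  // dst = {2, -6, 4}, peak == 4.0f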
@@ -2612,6 +2612,53 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
: "cc", "memory", "v1", "v2", "v3");
}
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width) {
float fmax;
asm volatile(
"movi v3.4s, #0 \n" // max
"movi v4.4s, #0 \n" // max
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %4.s[0] \n" // scale
"fmul v2.4s, v2.4s, %4.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"fmax v3.4s, v3.4s, v1.4s \n" // max
"fmax v4.4s, v4.4s, v2.4s \n"
"b.gt 1b \n"
"fmax v3.4s, v3.4s, v4.4s \n" // max
"fmaxv %s3, v3.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
"=w"(fmax) // %3
: "w"(scale) // %4
: "cc", "memory", "v1", "v2", "v3", "v4");
return fmax;
}
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale) // %3
: "cc", "memory", "v1", "v2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
@@ -816,7 +816,7 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5
// sum rows
// sum rows
xloop:
movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
@@ -847,7 +847,7 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5
// sum rows
// sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
@@ -939,7 +939,7 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
// 1 pixel remainder
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions.
@@ -1194,7 +1194,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
sub ecx, 4
jl xloop49
// 4 Pixel loop.
// 4 Pixel loop.
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@@ -1218,7 +1218,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
test ecx, 2
je xloop29
// 2 Pixels.
// 2 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
pextrw eax, xmm2, 5 // get x2 integer.
@@ -1231,7 +1231,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
test ecx, 1
je xloop99
// 1 Pixels.
// 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
xloop99:
@@ -1309,7 +1309,7 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
// 1 pixel remainder
psrlw xmm2, 9 // 7 bit fractions.
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
pshufb xmm2, xmm5 // 00000000
@@ -11,6 +11,9 @@
#include <stdlib.h>
#include <time.h>
// row.h defines SIMD_ALIGNED, overriding unit_test.h
#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -2518,4 +2521,146 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
float TestScaleSumSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
float max_c, max_opt;
const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8* dst_opt = orig_y + y_plane_size;
uint8* dst_c = orig_y + y_plane_size * 2;
// Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size);
for (i = 0; i < y_plane_size / 4; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
}
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
max_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESUMSAMPLES_NEON
if (opt) {
max_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
} else {
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
}
#else
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
}
float max_diff = 0;
for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_page_end(orig_y);
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) {
float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, false);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) {
float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, true);
EXPECT_EQ(0, diff);
}
float TestScaleSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8* dst_opt = orig_y + y_plane_size;
uint8* dst_c = orig_y + y_plane_size * 2;
// Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size);
for (i = 0; i < y_plane_size / 4; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
}
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESAMPLES_NEON
if (opt) {
ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
} else {
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
}
#else
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
}
float max_diff = 0;
for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_page_end(orig_y);
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestScaleSamples_C) {
float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, false);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, true);
EXPECT_EQ(0, diff);
}
} // namespace libyuv
@@ -36,6 +36,9 @@ static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
static __inline float FAbs(float v) {
return v >= 0 ? v : -v;
}
#define OFFBY 0
// Scaling uses 16.16 fixed point to step thru the source image, so a
@@ -70,8 +73,11 @@ static inline bool SizeValid(int src_width,
uint8* var; \
uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & /* NOLINT */ \
~4095) - (size)) & ~63);
var = (uint8*)((intptr_t)(var##_mem + \
(((size) + 4095 + 63) & /* NOLINT */ \
~4095) - \
(size)) & \
~63);
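
For reference on the reflowed arithmetic: ((size) + 4095 + 63) & ~4095 rounds size, plus up to 63 bytes of alignment slack, up to a 4 KiB multiple, and var is var_mem plus that rounded amount minus size, aligned down to a 64-byte boundary. The buffer therefore ends at most 63 bytes before the end of the malloc'd block, so overruns run almost immediately off the allocation where heap-checking tools can flag them. A hypothetical walk-through for size = 100 (the value is illustrative):

// (100 + 4095 + 63) & ~4095  -> 4096 bytes are malloc'd into var_mem.
// var_mem + 4096 - 100       -> candidate pointer 3996 bytes into the block.
// ... & ~63                  -> rounded down to a 64-byte boundary, so
//                               var + 100 ends at most 63 bytes before
//                               var_mem + 4096.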
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \