Commit b36c86fd authored by Frank Barchard

Port box filter to NEON

Bug: libyuv:821
Change-Id: I4a6b9bee2c2fae199c73c9ec7ecb32bde37c1852
Tested: out/Release/libyuv_unittest --gtest_filter=*ScaleFrom1920x1080_Box --libyuv_width=160 --libyuv_height=90 --libyuv_repeat=1000
Reviewed-on: https://chromium-review.googlesource.com/c/1298598
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
parent b416d36c
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1721
Version: 1722
License: BSD
License File: LICENSE
......
......@@ -58,6 +58,7 @@ extern "C" {
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
#define HAS_SCALEADDROW_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
......@@ -69,7 +70,6 @@ extern "C" {
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3
#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on all x86 platforms, but
......@@ -86,7 +86,9 @@ extern "C" {
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEADDROW_NEON
#define HAS_SCALEARGBCOLS_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON
......@@ -94,7 +96,6 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
......@@ -113,18 +114,18 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
#define HAS_FIXEDDIV1_MIPS
#define HAS_FIXEDDIV_MIPS
#define HAS_SCALEADDROW_16_MMI
#define HAS_SCALEADDROW_MMI
#define HAS_SCALEARGBCOLS_MMI
#define HAS_SCALEARGBCOLSUP2_MMI
#define HAS_SCALEARGBROWDOWN2_MMI
#define HAS_SCALEARGBROWDOWNEVEN_MMI
#define HAS_SCALEROWDOWN2_MMI
#define HAS_SCALEROWDOWN4_MMI
#define HAS_SCALEADDROW_MMI
#define HAS_SCALEADDROW_16_MMI
#define HAS_SCALECOLS_16_MMI
#define HAS_SCALECOLS_MMI
#define HAS_SCALEROWDOWN2_16_MMI
#define HAS_SCALEROWDOWN2_MMI
#define HAS_SCALEROWDOWN4_16_MMI
#define HAS_SCALECOLS_MMI
#define HAS_SCALECOLS_16_MMI
#define HAS_SCALEROWDOWN4_MMI
#endif
// Scale ARGB vertically with bilinear interpolation.
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1721
#define LIBYUV_VERSION 1722
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -1980,9 +1980,8 @@ int NV12ToRAW(const uint8_t* src_y,
int dst_stride_raw,
int width,
int height) {
return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
dst_raw, dst_stride_raw, &kYvuI601Constants,
width, height);
return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
dst_stride_raw, &kYvuI601Constants, width, height);
}
// Convert NV21 to RAW.
......@@ -1995,9 +1994,8 @@ int NV21ToRAW(const uint8_t* src_y,
int dst_stride_raw,
int width,
int height) {
return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
dst_raw, dst_stride_raw, &kYvuI601Constants,
width, height);
return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
dst_stride_raw, &kYvuI601Constants, width, height);
}
// Convert M420 to ARGB.
......
......@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string.h> // For memset/memcpy
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
......@@ -499,6 +501,45 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
1)
#endif
#ifdef SASIMDONLY
// Add rows box filter scale down, SIMD-only variant using the row_any pattern.
// This also works and uses memcpy and SIMD instead of C for the remainder, but
// is slower on ARM.
//
// NAMEANY  - name of the any-width function this macro defines.
// ANY_SIMD - SIMD row function called as ANY_SIMD(src, dst, count).
// SBPP     - bytes per source element (src_ptr is a byte pointer).
// BPP      - bytes per destination element.
// MASK     - SIMD step minus one; the step (MASK + 1) must fit the 32 element
//            temporary buffers.
//
// The widest multiple of the step is processed in place. Any remainder is
// copied into zero-initialised temporaries, processed as one full step, and
// only the valid remainder is copied back to the destination.
#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                          \
  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) {     \
    SIMD_ALIGNED(uint16_t dst_temp[32]);                                   \
    SIMD_ALIGNED(uint8_t src_temp[32]);                                    \
    int r = width & (MASK);                                                \
    int n = width & ~(MASK);                                               \
    if (n > 0) {                                                           \
      ANY_SIMD(src_ptr, dst_ptr, n);                                       \
    }                                                                      \
    if (r > 0) {                                                           \
      /* BPP is in bytes but dst_ptr is uint16_t*, so compute the tail */  \
      /* address through a byte pointer to avoid scaling it twice. */      \
      uint8_t* dst_tail = (uint8_t*)dst_ptr + n * (BPP);                   \
      memset(src_temp, 0, sizeof(src_temp)); /* for msan */                \
      memset(dst_temp, 0, sizeof(dst_temp)); /* for msan */                \
      memcpy(src_temp, src_ptr + n * (SBPP), r * (SBPP));                  \
      memcpy(dst_temp, dst_tail, r * (BPP));                               \
      ANY_SIMD(src_temp, dst_temp, (MASK) + 1);                            \
      memcpy(dst_tail, dst_temp, r * (BPP));                               \
    }                                                                      \
  }
// Instantiate the SIMD-only any-width ScaleAddRow wrappers, one per enabled
// instruction set. Arguments: wrapper name, SIMD row function, source bytes
// per element, destination bytes per element, and SIMD step minus one.
#ifdef HAS_SCALEADDROW_SSE2
SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_AVX2
SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
#endif
#ifdef HAS_SCALEADDROW_NEON
SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_MSA
SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_MMI
SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
#endif
// This branch defines SAROW (SAANY belongs to the #else branch), so SAROW is
// the macro that must be undefined here.
#undef SAROW
#else
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
......@@ -526,6 +567,8 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
#endif
#undef SAANY
#endif // SASIMDONLY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -504,37 +504,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}
void ScaleAddRows_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
int src_width,
int src_height) {
const uint8_t* src_tmp;
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
"veor q2, q2, q2 \n"
"veor q3, q3, q3 \n"
"2: \n"
// load 16 pixels into q0
"vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n"
"vaddw.u8 q2, q2, d0 \n"
"subs r12, r12, #1 \n"
"bgt 2b \n"
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"vld1.16 {q1, q2}, [%1] \n" // load accumulator
"vld1.8 {q0}, [%0]! \n" // load 16 bytes
"vaddw.u8 q2, q2, d1 \n" // add
"vaddw.u8 q1, q1, d0 \n"
"vst1.16 {q1, q2}, [%1]! \n" // store accumulator
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "=&r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
: "memory", "cc", "q0", "q1", "q2" // Clobber List
);
}
......
......@@ -515,37 +515,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc");
}
void ScaleAddRows_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
int src_width,
int src_height) {
const uint8_t* src_tmp;
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
"1: \n"
"mov %0, %1 \n"
"mov w12, %w5 \n"
"eor v2.16b, v2.16b, v2.16b \n"
"eor v3.16b, v3.16b, v3.16b \n"
"2: \n"
// load 16 pixels into q0
"ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n"
"b.gt 2b \n"
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
"uaddw2 v2.8h, v2.8h, v0.16b \n" // add
"uaddw v1.8h, v1.8h, v0.8b \n"
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
"subs %w2, %w2, #16 \n" // 16 processed per loop
"b.gt 1b \n"
: "=&r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
: "memory", "cc", "v0", "v1", "v2" // Clobber List
);
}
......
......@@ -1693,7 +1693,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) {
EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2;
int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height);
......@@ -1727,7 +1727,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2;
int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height);
......@@ -1786,7 +1786,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2;
int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height);
......@@ -1816,7 +1816,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2;
int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height);
......@@ -1846,7 +1846,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2;
int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height);
......@@ -1876,7 +1876,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2;
int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height);
......
......@@ -303,10 +303,10 @@ TEST_FACTOR(3, 1, 3)
TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 352, 288)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
TEST_SCALETO(ARGBScale, 1280, 720)
TEST_SCALETO(ARGBScale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
......
......@@ -336,10 +336,10 @@ TEST_FACTOR(3, 1, 3, 0)
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 352, 288)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
TEST_SCALETO(Scale, 1280, 720)
TEST_SCALETO(Scale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment