Commit b36c86fd authored by Frank Barchard

Port box filter to NEON

Bug: libyuv:821
Change-Id: I4a6b9bee2c2fae199c73c9ec7ecb32bde37c1852
Tested: out/Release/libyuv_unittest --gtest_filter=*ScaleFrom1920x1080_Box --libyuv_width=160 --libyuv_height=90 --libyuv_repeat=1000
Reviewed-on: https://chromium-review.googlesource.com/c/1298598
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
parent b416d36c
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1721 Version: 1722
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -58,6 +58,7 @@ extern "C" { ...@@ -58,6 +58,7 @@ extern "C" {
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86 #define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86 #define HAS_FIXEDDIV_X86
#define HAS_SCALEADDROW_SSE2
#define HAS_SCALEARGBCOLS_SSE2 #define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2 #define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3 #define HAS_SCALEARGBFILTERCOLS_SSSE3
...@@ -69,7 +70,6 @@ extern "C" { ...@@ -69,7 +70,6 @@ extern "C" {
#define HAS_SCALEROWDOWN34_SSSE3 #define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3 #define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3 #define HAS_SCALEROWDOWN4_SSSE3
#define HAS_SCALEADDROW_SSE2
#endif #endif
// The following are available on all x86 platforms, but // The following are available on all x86 platforms, but
...@@ -86,7 +86,9 @@ extern "C" { ...@@ -86,7 +86,9 @@ extern "C" {
// The following are available on Neon platforms: // The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \ #if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEADDROW_NEON
#define HAS_SCALEARGBCOLS_NEON #define HAS_SCALEARGBCOLS_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON #define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON #define HAS_SCALEFILTERCOLS_NEON
...@@ -94,7 +96,6 @@ extern "C" { ...@@ -94,7 +96,6 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON #define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#endif #endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
...@@ -113,18 +114,18 @@ extern "C" { ...@@ -113,18 +114,18 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
#define HAS_FIXEDDIV1_MIPS #define HAS_FIXEDDIV1_MIPS
#define HAS_FIXEDDIV_MIPS #define HAS_FIXEDDIV_MIPS
#define HAS_SCALEADDROW_16_MMI
#define HAS_SCALEADDROW_MMI
#define HAS_SCALEARGBCOLS_MMI #define HAS_SCALEARGBCOLS_MMI
#define HAS_SCALEARGBCOLSUP2_MMI #define HAS_SCALEARGBCOLSUP2_MMI
#define HAS_SCALEARGBROWDOWN2_MMI #define HAS_SCALEARGBROWDOWN2_MMI
#define HAS_SCALEARGBROWDOWNEVEN_MMI #define HAS_SCALEARGBROWDOWNEVEN_MMI
#define HAS_SCALEROWDOWN2_MMI #define HAS_SCALECOLS_16_MMI
#define HAS_SCALEROWDOWN4_MMI #define HAS_SCALECOLS_MMI
#define HAS_SCALEADDROW_MMI
#define HAS_SCALEADDROW_16_MMI
#define HAS_SCALEROWDOWN2_16_MMI #define HAS_SCALEROWDOWN2_16_MMI
#define HAS_SCALEROWDOWN2_MMI
#define HAS_SCALEROWDOWN4_16_MMI #define HAS_SCALEROWDOWN4_16_MMI
#define HAS_SCALECOLS_MMI #define HAS_SCALEROWDOWN4_MMI
#define HAS_SCALECOLS_16_MMI
#endif #endif
// Scale ARGB vertically with bilinear interpolation. // Scale ARGB vertically with bilinear interpolation.
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1721 #define LIBYUV_VERSION 1722
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -1980,9 +1980,8 @@ int NV12ToRAW(const uint8_t* src_y, ...@@ -1980,9 +1980,8 @@ int NV12ToRAW(const uint8_t* src_y,
int dst_stride_raw, int dst_stride_raw,
int width, int width,
int height) { int height) {
return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
dst_raw, dst_stride_raw, &kYvuI601Constants, dst_stride_raw, &kYvuI601Constants, width, height);
width, height);
} }
// Convert NV21 to RAW. // Convert NV21 to RAW.
...@@ -1995,9 +1994,8 @@ int NV21ToRAW(const uint8_t* src_y, ...@@ -1995,9 +1994,8 @@ int NV21ToRAW(const uint8_t* src_y,
int dst_stride_raw, int dst_stride_raw,
int width, int width,
int height) { int height) {
return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
dst_raw, dst_stride_raw, &kYvuI601Constants, dst_stride_raw, &kYvuI601Constants, width, height);
width, height);
} }
// Convert M420 to ARGB. // Convert M420 to ARGB.
......
...@@ -8,6 +8,8 @@ ...@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include <string.h> // For memset/memcpy
#include "libyuv/scale.h" #include "libyuv/scale.h"
#include "libyuv/scale_row.h" #include "libyuv/scale_row.h"
...@@ -499,6 +501,45 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, ...@@ -499,6 +501,45 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
1) 1)
#endif #endif
#ifdef SASIMDONLY
// Add rows box filter scale down for any width, using memcpy and the SIMD
// kernel for the remainder instead of a C fallback. This also works, but is
// slower on ARM.
//   NAMEANY   name of the generated any-width function.
//   ANY_SIMD  SIMD kernel; only called with widths that are a multiple of
//             MASK + 1.
//   SBPP      bytes per source element.
//   BPP       bytes per destination (accumulator) element.
//   MASK      SIMD step minus 1. (MASK + 1) * SBPP must fit in the 32 byte
//             source scratch row and (MASK + 1) * BPP in the 64 byte
//             destination scratch row.
#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                            \
  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) {       \
    SIMD_ALIGNED(uint16_t dst_temp[32]);                                     \
    SIMD_ALIGNED(uint8_t src_temp[32]);                                      \
    int r = width & (MASK);  /* leftover elements for the scratch pass */    \
    int n = width & ~(MASK); /* elements the kernel handles in place */      \
    /* BPP is a byte count, so offset the destination as bytes. Adding */    \
    /* n * BPP to the uint16_t pointer would skip twice as far. */           \
    uint8_t* dst_tail = (uint8_t*)dst_ptr + n * (BPP);                       \
    memset(src_temp, 0, sizeof(src_temp)); /* for msan */                    \
    memset(dst_temp, 0, sizeof(dst_temp)); /* for msan */                    \
    if (n > 0) {                                                             \
      ANY_SIMD(src_ptr, dst_ptr, n);                                         \
    }                                                                        \
    if (r > 0) {                                                             \
      /* Run one full SIMD step on scratch rows, keep only r results. */     \
      memcpy(src_temp, src_ptr + n * (SBPP), r * (SBPP));                    \
      memcpy(dst_temp, dst_tail, r * (BPP));                                 \
      ANY_SIMD(src_temp, dst_temp, (MASK) + 1);                              \
      memcpy(dst_tail, dst_temp, r * (BPP));                                 \
    }                                                                        \
  }
// Any-width ScaleAddRow wrappers, one per SIMD kernel that is enabled.
// Arguments: 1 byte per source pixel, 2 bytes per accumulator, and a mask of
// (pixels per SIMD step - 1).
#ifdef HAS_SCALEADDROW_SSE2
SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_AVX2
SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
#endif
#ifdef HAS_SCALEADDROW_NEON
SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_MSA
SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_MMI
SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
#endif
// Undefine the macro this branch actually defines (SAROW, not SAANY) so it
// does not leak past this section.
#undef SAROW
#else
// Add rows box filter scale down. // Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ #define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
...@@ -526,6 +567,8 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) ...@@ -526,6 +567,8 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
#endif #endif
#undef SAANY #undef SAANY
#endif // SASIMDONLY
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -504,37 +504,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ...@@ -504,37 +504,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
} }
void ScaleAddRows_NEON(const uint8_t* src_ptr, // Add a row of bytes to a row of shorts. Used for box filter.
ptrdiff_t src_stride, // Reads 16 bytes and accumulates to 16 shorts at a time.
uint16_t* dst_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr,
int src_width, uint16_t* dst_ptr,
int src_height) { int src_width) {
const uint8_t* src_tmp;
asm volatile( asm volatile(
"1: \n" "1: \n"
"mov %0, %1 \n" "vld1.16 {q1, q2}, [%1] \n" // load accumulator
"mov r12, %5 \n" "vld1.8 {q0}, [%0]! \n" // load 16 bytes
"veor q2, q2, q2 \n" "vaddw.u8 q2, q2, d1 \n" // add
"veor q3, q3, q3 \n" "vaddw.u8 q1, q1, d0 \n"
"2: \n" "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
// load 16 pixels into q0 "subs %2, %2, #16 \n" // 16 processed per loop
"vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n"
"vaddw.u8 q2, q2, d0 \n"
"subs r12, r12, #1 \n"
"bgt 2b \n"
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n" "bgt 1b \n"
: "=&r"(src_tmp), // %0 : "+r"(src_ptr), // %0
"+r"(src_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_ptr), // %2 "+r"(src_width) // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
: :
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List : "memory", "cc", "q0", "q1", "q2" // Clobber List
); );
} }
......
...@@ -515,37 +515,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ...@@ -515,37 +515,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc"); "v19", "v30", "v31", "memory", "cc");
} }
void ScaleAddRows_NEON(const uint8_t* src_ptr, // Add a row of bytes to a row of shorts. Used for box filter.
ptrdiff_t src_stride, // Reads 16 bytes and accumulates to 16 shorts at a time.
uint16_t* dst_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr,
int src_width, uint16_t* dst_ptr,
int src_height) { int src_width) {
const uint8_t* src_tmp;
asm volatile( asm volatile(
"1: \n" "1: \n"
"mov %0, %1 \n" "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
"mov w12, %w5 \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
"eor v2.16b, v2.16b, v2.16b \n" "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
"eor v3.16b, v3.16b, v3.16b \n" "uaddw v1.8h, v1.8h, v0.8b \n"
"2: \n" "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
// load 16 pixels into q0 "subs %w2, %w2, #16 \n" // 16 processed per loop
"ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n"
"b.gt 2b \n"
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"b.gt 1b \n" "b.gt 1b \n"
: "=&r"(src_tmp), // %0 : "+r"(src_ptr), // %0
"+r"(src_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_ptr), // %2 "+r"(src_width) // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
: :
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List : "memory", "cc", "v0", "v1", "v2" // Clobber List
); );
} }
......
...@@ -1693,7 +1693,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) { ...@@ -1693,7 +1693,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) {
EXPECT_EQ(0, ret); EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2; int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2; int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height); benchmark_height_ / (width * height);
...@@ -1727,7 +1727,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) { ...@@ -1727,7 +1727,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
EXPECT_EQ(0, ret); EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2; int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2; int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height); benchmark_height_ / (width * height);
...@@ -1786,7 +1786,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) { ...@@ -1786,7 +1786,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
EXPECT_EQ(0, ret); EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2; int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2; int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height); benchmark_height_ / (width * height);
...@@ -1816,7 +1816,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) { ...@@ -1816,7 +1816,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
EXPECT_EQ(0, ret); EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2; int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2; int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height); benchmark_height_ / (width * height);
...@@ -1846,7 +1846,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) { ...@@ -1846,7 +1846,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
EXPECT_EQ(0, ret); EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2; int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2; int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height); benchmark_height_ / (width * height);
...@@ -1876,7 +1876,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) { ...@@ -1876,7 +1876,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
EXPECT_EQ(0, ret); EXPECT_EQ(0, ret);
int half_width = (width + 1) / 2; int half_width = (width + 1) / 2;
int half_height = (height + 1)/ 2; int half_height = (height + 1) / 2;
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
benchmark_height_ / (width * height); benchmark_height_ / (width * height);
......
...@@ -303,10 +303,10 @@ TEST_FACTOR(3, 1, 3) ...@@ -303,10 +303,10 @@ TEST_FACTOR(3, 1, 3)
TEST_SCALETO(ARGBScale, 1, 1) TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240) TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 352, 288)
TEST_SCALETO(ARGBScale, 569, 480) TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360) TEST_SCALETO(ARGBScale, 640, 360)
TEST_SCALETO(ARGBScale, 1280, 720) TEST_SCALETO(ARGBScale, 1280, 720)
TEST_SCALETO(ARGBScale, 1920, 1080)
#undef TEST_SCALETO1 #undef TEST_SCALETO1
#undef TEST_SCALETO #undef TEST_SCALETO
......
...@@ -336,10 +336,10 @@ TEST_FACTOR(3, 1, 3, 0) ...@@ -336,10 +336,10 @@ TEST_FACTOR(3, 1, 3, 0)
TEST_SCALETO(Scale, 1, 1) TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 352, 288)
TEST_SCALETO(Scale, 569, 480) TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360) TEST_SCALETO(Scale, 640, 360)
TEST_SCALETO(Scale, 1280, 720) TEST_SCALETO(Scale, 1280, 720)
TEST_SCALETO(Scale, 1920, 1080)
#undef TEST_SCALETO1 #undef TEST_SCALETO1
#undef TEST_SCALETO #undef TEST_SCALETO
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment