Commit d6d7de57 authored by yang.zhang@arm.com

Add ScaleFilterCols_NEON for ARM32/64

Adds NEON implementations of ScaleFilterCols (ScaleFilterCols_NEON) for ARM32 and ARM64.

BUG=319
TESTED=libyuvTest.* on ARM32/64 with Android
R=fbarchard@google.com

Change-Id: I5b0838769ffb0182155d7cd6bcc520eb81eb5c4e

Review URL: https://webrtc-codereview.appspot.com/41349004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1340 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 70e5c818
...@@ -203,6 +203,7 @@ extern "C" { ...@@ -203,6 +203,7 @@ extern "C" {
#define HAS_I444TOARGBROW_AVX2 #define HAS_I444TOARGBROW_AVX2
#define HAS_I411TOARGBROW_AVX2 #define HAS_I411TOARGBROW_AVX2
#define HAS_J400TOARGBROW_AVX2 #define HAS_J400TOARGBROW_AVX2
#define HAS_J422TOARGBROW_AVX2
// TODO(fbarchard): Port to Neon // TODO(fbarchard): Port to Neon
#define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2
...@@ -233,7 +234,6 @@ extern "C" { ...@@ -233,7 +234,6 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2 #define HAS_YUY2TOYROW_AVX2
#define HAS_J422TOARGBROW_AVX2
// The following require HAS_I422TOARGBROW_AVX2 // The following require HAS_I422TOARGBROW_AVX2
#if defined(HAS_I422TOARGBROW_AVX2) #if defined(HAS_I422TOARGBROW_AVX2)
......
...@@ -52,6 +52,7 @@ extern "C" { ...@@ -52,6 +52,7 @@ extern "C" {
#define HAS_SCALEARGBROWDOWNEVEN_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON #define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEADDROWS_NEON #define HAS_SCALEADDROWS_NEON
#define HAS_SCALEFILTERCOLS_NEON
#endif #endif
// The following are available on Mips platforms: // The following are available on Mips platforms:
...@@ -311,6 +312,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -311,6 +312,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height); uint16* dst_ptr, int src_width, int src_height);
// Horizontally scales one row with linear filtering between adjacent source
// pixels. x is the starting source position and dx the per-output-pixel step;
// the NEON code uses (x >> 16) as the source index and the low 16 bits as the
// blend fraction. The NEON kernel produces 8 output pixels per loop iteration;
// the dispatch code only selects it when dst_width is a multiple of 8.
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
// Any-width wrapper: runs ScaleFilterCols_NEON on the largest multiple of 8
// pixels and ScaleFilterCols_C on the remaining (dst_width & 7) pixels.
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width); uint8* dst, int dst_width);
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
......
...@@ -55,6 +55,7 @@ ...@@ -55,6 +55,7 @@
'source/row_win.cc', 'source/row_win.cc',
'source/scale.cc', 'source/scale.cc',
'source/scale_argb.cc', 'source/scale_argb.cc',
'source/scale_any.cc',
'source/scale_common.cc', 'source/scale_common.cc',
'source/scale_mips.cc', 'source/scale_mips.cc',
'source/scale_posix.cc', 'source/scale_posix.cc',
......
...@@ -928,6 +928,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, ...@@ -928,6 +928,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_SSSE3; ScaleFilterCols = ScaleFilterCols_SSSE3;
} }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif #endif
if (y > max_y) { if (y > max_y) {
y = max_y; y = max_y;
...@@ -1119,6 +1127,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, ...@@ -1119,6 +1127,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_SSSE3; ScaleFilterCols = ScaleFilterCols_SSSE3;
} }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif #endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) { if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C; ScaleFilterCols = ScaleColsUp2_C;
......
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
//
// CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) defines a column scaler named
// NAMEANY that accepts any dst_width:
//   - TERP_SIMD handles the leading n = dst_width & ~MASK pixels (skipped
//     when n is 0),
//   - TERP_C handles the trailing dst_width & MASK pixels, starting at
//     destination offset n * BPP and source position x + n * dx.
// BPP is the number of destination bytes per pixel; MASK is the SIMD step
// minus one (the step must be a power of two).
// MASK and BPP are parenthesized in the expansion so that expression
// arguments (e.g. "A | B") keep their intended precedence.
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
                 int dst_width, int x, int dx) {                               \
      int n = dst_width & ~(MASK);                                             \
      if (n > 0) {                                                             \
        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
      }                                                                        \
      TERP_C(dst_ptr + n * (BPP), src_ptr,                                     \
             dst_width & (MASK), x + n * dx, dx);                              \
    }

#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#undef CANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
...@@ -575,6 +575,73 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -575,6 +575,73 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
//
// LOAD2_DATA8_LANE(n): loads the two adjacent source bytes at
// src_ptr + (x >> 16) into lane n of d6 (left pixel) and d7 (right pixel),
// then advances the scalar x (%3) by dx (%4). %5 is reused as scratch for the
// integer offset and r12 holds the computed address.
#define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
    "add        r12, %1, %5                    \n"             \
    "add        %3, %3, %4                     \n"             \
    "vld2.8     {d6[" #n "], d7[" #n "]}, [r12] \n"

// Horizontal linear-filtered column scaler, 8 output pixels per iteration.
//
// For each output pixel the source index is (x >> 16) and the blend fraction
// is the low 16 bits of x; the result is a + (((b - a) * frac) >> 16) where a
// and b are the source bytes at index and index + 1. x advances by dx per
// output pixel.
//
// dst_ptr   - destination row; written 8 bytes at a time.
// src_ptr   - source row.
// dst_width - number of output pixels; the loop runs while the remaining
//             count is > 0 in steps of 8, so callers are expected to pass a
//             multiple of 8.
// x, dx     - starting source position and step (integer part above bit 16).
//
// All operands are declared read-write ("+r") because the assembly modifies
// %0, %2, %3 and %5; GCC requires that input-only operands are never written.
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  // Pointer lvalue so the register can be declared "+r"; the asm reuses it as
  // scratch after the initial vector load.
  int* tmp = dx_offset;
  asm volatile (
    ".p2align   2                              \n"
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q1, q1, q0                     \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "vadd.s32   q2, q1, q3                     \n"
    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "vmov       q10, q1                        \n"
    "vmov       q11, q2                        \n"
    "vuzp.16    q10, q11                       \n"  // q10 = low 16 bits (frac)
    "vmovl.u8   q8, d6                         \n"  // a, widened to 16 bits
    "vmovl.u8   q9, d7                         \n"  // b, widened to 16 bits
    "vsubl.s16  q11, d18, d16                  \n"  // b - a (pixels 0..3)
    "vsubl.s16  q12, d19, d17                  \n"  // b - a (pixels 4..7)
    "vmovl.u16  q13, d20                       \n"  // frac (pixels 0..3)
    "vmovl.u16  q10, d21                       \n"  // frac (pixels 4..7)
    "vmul.s32   q11, q11, q13                  \n"
    "vmul.s32   q12, q12, q10                  \n"
    "vshrn.s32  d18, q11, #16                  \n"  // ((b - a) * frac) >> 16
    "vshrn.s32  d19, q12, #16                  \n"
    "vadd.s16   q8, q8, q9                     \n"  // a + blended delta
    "vmovn.s16  d6, q8                         \n"

    MEMACCESS(0)
    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
    "vadd.s32   q1, q1, q0                     \n"
    "vadd.s32   q2, q2, q0                     \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x),                // %3
    "+r"(dx),               // %4
    "+r"(tmp)               // %5
  :
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE
// 16x2 -> 16x1 // 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
......
...@@ -578,6 +578,72 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -578,6 +578,72 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
//
// LOAD2_DATA8_LANE(n): loads the two adjacent source bytes at
// src_ptr + (x >> 16) into lane n of v4 (left pixel) and v5 (right pixel),
// then advances the scalar x (%3) by dx (%4). %5 is reused as scratch for the
// integer offset and x12 holds the computed address.
#define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
    "add        x12, %1, %5                    \n"             \
    "add        %3, %3, %4                     \n"             \
    "ld2        {v4.b, v5.b}[" #n "], [x12]    \n"

// Horizontal linear-filtered column scaler, 8 output pixels per iteration.
//
// For each output pixel the source index is (x >> 16) and the blend fraction
// is the low 16 bits of x; the result is a + (((b - a) * frac) >> 16) where a
// and b are the source bytes at index and index + 1. x advances by dx per
// output pixel.
//
// dst_ptr   - destination row; written 8 bytes at a time.
// src_ptr   - source row.
// dst_width - number of output pixels; the loop runs while the remaining
//             count is > 0 in steps of 8, so callers are expected to pass a
//             multiple of 8.
// x, dx     - starting source position and step (integer part above bit 16).
//
// All operands are declared read-write ("+r") because the assembly modifies
// %0, %2, %3 and %5; GCC requires that input-only operands are never written.
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  // Pointer lvalue so the register can be declared "+r"; the asm reuses it as
  // scratch after the initial vector load.
  int* tmp = dx_offset;
  // 64-bit copies so that address arithmetic in the asm uses full X registers.
  ptrdiff_t x64 = static_cast<ptrdiff_t>(x);
  ptrdiff_t dx64 = static_cast<ptrdiff_t>(dx);
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v1.4s, v1.4s, v0.4s            \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add        v2.4s, v1.4s, v3.4s            \n"
    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "mov        v6.16b, v1.16b                 \n"
    "mov        v7.16b, v2.16b                 \n"
    "uzp1       v6.8h, v6.8h, v7.8h            \n"  // low 16 bits (frac)
    "ushll      v4.8h, v4.8b, #0               \n"  // a, widened to 16 bits
    "ushll      v5.8h, v5.8b, #0               \n"  // b, widened to 16 bits
    "ssubl      v16.4s, v5.4h, v4.4h           \n"  // b - a (pixels 0..3)
    "ssubl2     v17.4s, v5.8h, v4.8h           \n"  // b - a (pixels 4..7)
    "ushll      v7.4s, v6.4h, #0               \n"  // frac (pixels 0..3)
    "ushll2     v6.4s, v6.8h, #0               \n"  // frac (pixels 4..7)
    "mul        v16.4s, v16.4s, v7.4s          \n"
    "mul        v17.4s, v17.4s, v6.4s          \n"
    "shrn       v6.4h, v16.4s, #16             \n"  // ((b - a) * frac) >> 16
    "shrn2      v6.8h, v17.4s, #16             \n"
    "add        v4.8h, v4.8h, v6.8h            \n"  // a + blended delta
    "xtn        v4.8b, v4.8h                   \n"

    MEMACCESS(0)
    "st1        {v4.8b}, [%0], #8              \n"  // store pixels
    "add        v1.4s, v1.4s, v0.4s            \n"
    "add        v2.4s, v2.4s, v0.4s            \n"
    // dst_width is a 32-bit int: use the W view so the flags do not depend on
    // the unspecified upper 32 bits of the register.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    "b.gt       1b                             \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp)               // %5
  :
  : "memory", "cc", "x12", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE
// 16x2 -> 16x1 // 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment