Commit 5f609856 authored by yang.zhang@arm.com

Add ScaleARGBFilterCols_NEON for ARM32/64

NEON implementations of ScaleARGBFilterCols are added for both ARM32 and AArch64.

BUG=319
TESTED=libyuvTest.* on ARM32/64 with Android
R=fbarchard@google.com

Change-Id: Ifea62bc25d846bf16cb51d13b408de7bf58dccd4

Review URL: https://webrtc-codereview.appspot.com/46699004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1361 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 3d1176a3
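For readers new to this kernel: ScaleARGBFilterCols steps a 16.16 fixed-point source coordinate (x, advanced by dx per destination pixel) along the row and bilinearly blends each pair of adjacent ARGB pixels with a 7-bit fraction. The hunks below declare the new kernels, register their Any wrappers, wire them into the ARGB scaler dispatch, and add the ARM32 and AArch64 assembly. A minimal scalar sketch of the computation being vectorized (the sketch's name is illustrative; libyuv's actual C reference is ScaleARGBFilterCols_C):

#include <stdint.h>
typedef uint8_t uint8;

// Illustrative scalar model, not libyuv's ScaleARGBFilterCols_C.
// x and dx are 16.16 fixed-point source position and step.
static void ScaleARGBFilterColsSketch(uint8* dst_argb, const uint8* src_argb,
                                      int dst_width, int x, int dx) {
  int j, c;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;               // integer source pixel index
    int f = (x >> 9) & 0x7f;        // top 7 bits of the 16-bit fraction
    const uint8* a = src_argb + 4 * xi;  // left ARGB pixel
    const uint8* b = a + 4;              // right ARGB pixel
    for (c = 0; c < 4; ++c) {       // blend B, G, R, A independently
      dst_argb[4 * j + c] = (uint8)((a[c] * (0x7f ^ f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}

Since 0x7f ^ f equals 127 - f for f in [0, 127], the two weights sum to 127 while the final shift divides by 128; the NEON code below reproduces exactly this with vmull/umull multiplies, 16-bit adds, and a narrowing shift by 7.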
@@ -67,6 +67,7 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#endif

// The following are available on Mips platforms:
@@ -371,6 +372,12 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx);
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx);
void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
                                  int dst_width, int x, int dx);
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst, int dst_width);
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
......
@@ -36,6 +36,10 @@ CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
     ScaleARGBFilterCols_C, 4, 3)
#endif
#undef CANY
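For reference, the Any wrapper generated by CANY with BPP = 4 and MASK = 3 runs the NEON kernel on the largest multiple-of-4 prefix and finishes the remainder in C. A sketch of the expansion (approximate, not the macro's verbatim text):

// Approximate expansion of CANY(ScaleARGBFilterCols_Any_NEON,
//   ScaleARGBFilterCols_NEON, ScaleARGBFilterCols_C, 4, 3).
void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
                                  int dst_width, int x, int dx) {
  int n = dst_width & ~3;  // multiple-of-4 prefix for the NEON kernel
  if (n > 0) {
    ScaleARGBFilterCols_NEON(dst_argb, src_argb, n, x, dx);
  }
  // Finish the 0-3 leftover pixels in C, with dst and x advanced past
  // what NEON wrote (4 bytes per ARGB pixel, dx per pixel).
  ScaleARGBFilterCols_C(dst_argb + n * 4, src_argb, dst_width & 3,
                        x + n * dx, dx);
}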
// Fixed scale down.
......
@@ -230,6 +230,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
  }
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
    if (IS_ALIGNED(dst_width, 4)) {
      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
    }
  }
#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
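In each of the three dispatch sites this change touches, the Any variant is installed first as the default that tolerates any dst_width; only when dst_width is a multiple of 4, the NEON kernel's per-iteration batch, is the unwrapped ScaleARGBFilterCols_NEON used directly, skipping the C remainder pass. In ScaleARGBBilinearUp and ScaleYUVToARGBBilinearUp below, the NEON path is additionally gated on filtering, since the unfiltered case uses the ScaleARGBCols kernels instead.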
@@ -322,6 +330,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
  }
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
  if (filtering && TestCpuFlag(kCpuHasNEON)) {
    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
    if (IS_ALIGNED(dst_width, 4)) {
      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
    }
  }
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -504,6 +520,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
  }
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
  if (filtering && TestCpuFlag(kCpuHasNEON)) {
    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
    if (IS_ALIGNED(dst_width, 4)) {
      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
    }
  }
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
......
@@ -959,6 +959,76 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Per lane: %5 = x >> 16 is the integer source index, %6 addresses that
// ARGB pixel (4 bytes each), x advances by dx, and vld2.32 pulls the
// adjacent pixels src[x >> 16] and src[(x >> 16) + 1] into lane n of dn1/dn2.
#define LOAD2_DATA32_LANE(dn1, dn2, n)                                        \
    "lsr        %5, %3, #16                    \n"                            \
    "add        %6, %1, %5, lsl #2             \n"                            \
    "add        %3, %3, %4                     \n"                            \
    MEMACCESS(6)                                                              \
    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]  \n"

void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
    ".p2align   2                              \n"
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
    // x, x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q8, q1, q0                     \n"
  "1:                                          \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(d0, d2, 0)
    LOAD2_DATA32_LANE(d0, d2, 1)
    LOAD2_DATA32_LANE(d1, d3, 0)
    LOAD2_DATA32_LANE(d1, d3, 1)
    "vshrn.i32  d22, q8, #9                    \n"
    "vand.16    d22, d22, d30                  \n"
    "vdup.8     d24, d22[0]                    \n"
    "vdup.8     d25, d22[2]                    \n"
    "vdup.8     d26, d22[4]                    \n"
    "vdup.8     d27, d22[6]                    \n"
    "vext.8     d4, d24, d25, #4               \n"
    "vext.8     d5, d26, d27, #4               \n"  // f
    "veor.8     q10, q2, q3                    \n"  // 0x7f ^ f
    "vmull.u8   q11, d0, d20                   \n"
    "vmull.u8   q12, d1, d21                   \n"
    "vmull.u8   q13, d2, d4                    \n"
    "vmull.u8   q14, d3, d5                    \n"
    "vadd.i16   q11, q11, q13                  \n"
    "vadd.i16   q12, q12, q14                  \n"
    "vshrn.i16  d0, q11, #7                    \n"
    "vshrn.i16  d1, q12, #7                    \n"
    MEMACCESS(0)
    "vst1.32    {d0, d1}, [%0]!                \n"  // store pixels
    "vadd.s32   q8, q8, q9                     \n"
    "subs       %2, %2, #4                     \n"  // 4 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#undef LOAD2_DATA32_LANE

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
......
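A quick worked example of the blend arithmetic above (numbers chosen for illustration, not taken from the source): for x = 0x8000, i.e. source position 0.5, the index is x >> 16 = 0 and the fraction is f = (x >> 9) & 0x7f = 64, so the complementary weight is 0x7f ^ 64 = 63. A channel with a = 100 and b = 200 yields (100 * 63 + 200 * 64) >> 7 = 19100 >> 7 = 149. The two weights sum to 127 rather than 128, so a = b = 255 gives (255 * 127) >> 7 = 253; this one-count bias is inherent in the 0x7f ^ f formulation.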
@@ -955,6 +955,76 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Per lane: %5 = x >> 16 is the integer source index, %6 addresses that
// ARGB pixel (4 bytes each), x advances by dx, and ld2 pulls the adjacent
// pixels src[x >> 16] and src[(x >> 16) + 1] into lane n of vn1/vn2.
#define LOAD2_DATA32_LANE(vn1, vn2, n)                                        \
    "lsr        %5, %3, #16                    \n"                            \
    "add        %6, %1, %5, lsl #2             \n"                            \
    "add        %3, %3, %4                     \n"                            \
    MEMACCESS(6)                                                              \
    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]  \n"

void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    "movi       v3.16b, #0x7f                  \n"  // 0x7F
    "movi       v4.8h, #0x7f                   \n"  // 0x7F
    // x, x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v5.4s, v1.4s, v0.4s            \n"
  "1:                                          \n"
    // v0: a
    // v1: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    "shrn       v2.4h, v5.4s, #9               \n"
    "and        v2.8b, v2.8b, v4.8b            \n"
    "dup        v16.8b, v2.b[0]                \n"
    "dup        v17.8b, v2.b[2]                \n"
    "dup        v18.8b, v2.b[4]                \n"
    "dup        v19.8b, v2.b[6]                \n"
    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
    "ins        v2.d[1], v17.d[0]              \n"  // f
    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    "umull      v16.8h, v0.8b, v7.8b           \n"
    "umull2     v17.8h, v0.16b, v7.16b         \n"
    "umull      v18.8h, v1.8b, v2.8b           \n"
    "umull2     v19.8h, v1.16b, v2.16b         \n"
    "add        v16.8h, v16.8h, v18.8h         \n"
    "add        v17.8h, v17.8h, v19.8h         \n"
    "shrn       v0.8b, v16.8h, #7              \n"
    "shrn2      v0.16b, v17.8h, #7             \n"
    MEMACCESS(0)
    "st1        {v0.4s}, [%0], #16             \n"  // store pixels
    "add        v5.4s, v5.4s, v6.4s            \n"
    "subs       %2, %2, #4                     \n"  // 4 processed per loop
    "b.gt       1b                             \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}

#undef LOAD2_DATA32_LANE

#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
......
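The AArch64 version is a one-for-one port of the ARM32 loop: the q/d registers map onto v0-v7 and v16-v19, the vmull.u8 pairs become umull/umull2, vshrn becomes shrn/shrn2, the two vext results are joined with ins to form the 16-byte fraction vector f, and the lane loads use the ld2 single-structure syntax. The arithmetic and the four-pixels-per-iteration structure are unchanged.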