Commit 7aa6f06e authored by frkoenig@google.com

Avoid using Q4-Q7 registers for NEON.

The previous code did not pay attention to which registers are
scratch: Q4-Q7 (D8-D15) are callee-saved under the ARM AAPCS and must
be preserved across calls. Avoid the need to save and restore them by
keeping all NEON code out of the Q4-Q7 range.

Fix ScaleRowDown2Int_NEON by changing how rounding is applied.

ScaleRowDown4_NEON and ScaleRowDown4Int_NEON now process 4 output
pixels per loop.

With no registers left to push/pop for the UV transpose, the
SaveRegisters_NEON and RestoreRegisters_NEON functions are removed.

Fix the CPU flag masking in scale_test.cc (mask with kCpuInitialized
instead of 0) so optimizations can be turned on and off for timing.

Review URL: http://webrtc-codereview.appspot.com/259002

git-svn-id: http://libyuv.googlecode.com/svn/trunk@58 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 799796b2
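For context (not part of the patch): under the ARM AAPCS, D8-D15 — the registers aliased by Q4-Q7 — are callee-saved, so hand-written NEON that touches them must restore them before returning. The sketch below uses a hypothetical ZeroRow16_NEON helper to show the alternative this commit avoids: if Q4-Q7 really had to be used, listing the register in the inline-asm clobber list would make the compiler emit the save/restore, instead of calling hand-rolled SaveRegisters_NEON/RestoreRegisters_NEON helpers.

#include <stdint.h>

// Hypothetical helper, illustration only. width is assumed to be a
// positive multiple of 16.
static void ZeroRow16_NEON(uint8_t* dst, int width) {
  __asm__ volatile (
    "vmov.u8    q4, #0           \n"  // q4 = d8-d9, callee-saved under AAPCS
    "1:                          \n"
    "vst1.u8    {q4}, [%0]!      \n"  // store 16 zero bytes
    "subs       %1, %1, #16      \n"
    "bhi        1b               \n"
    : "+r"(dst),    // %0
      "+r"(width)   // %1
    :
    : "q4", "memory", "cc"  // clobbering q4 forces the compiler to preserve d8-d9
  );
}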
......@@ -44,8 +44,6 @@ typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);
#ifdef __ARM_NEON__
extern "C" {
void RestoreRegisters_NEON(unsigned long long *restore);
void SaveRegisters_NEON(unsigned long long *store);
#define HAS_REVERSE_LINE_NEON
void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
#define HAS_REVERSE_LINE_UV_NEON
......@@ -996,9 +994,7 @@ void TransposeUV(const uint8* src, int src_stride,
rotate_uv_wxh_func TransposeWxH;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
unsigned long long store_reg[8];
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
SaveRegisters_NEON(store_reg);
TransposeWx8 = TransposeUVWx8_NEON;
TransposeWxH = TransposeUVWxH_C;
} else
......@@ -1036,11 +1032,6 @@ void TransposeUV(const uint8* src, int src_stride,
dst_b, dst_stride_b,
width, i);
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
RestoreRegisters_NEON(store_reg);
}
#endif
}
void RotateUV90(const uint8* src, int src_stride,
......
.global RestoreRegisters_NEON
.global ReverseLine_NEON
.global ReverseLineUV_NEON
.global SaveRegisters_NEON
.global TransposeWx8_NEON
.global TransposeUVWx8_NEON
.type RestoreRegisters_NEON, function
.type ReverseLine_NEON, function
.type ReverseLineUV_NEON, function
.type SaveRegisters_NEON, function
.type TransposeWx8_NEON, function
.type TransposeUVWx8_NEON, function
......@@ -261,20 +257,6 @@ Ldone:
vtbl_4x4_transpose:
.byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
@ void SaveRegisters_NEON (unsigned long long store)
@ r0 unsigned long long store
SaveRegisters_NEON:
vst1.i64 {d8, d9, d10, d11}, [r0]!
vst1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
@ void RestoreRegisters_NEON (unsigned long long store)
@ r0 unsigned long long store
RestoreRegisters_NEON:
vld1.i64 {d8, d9, d10, d11}, [r0]!
vld1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
@ void ReverseLineUV_NEON (const uint8* src,
@ uint8* dst_a,
@ uint8* dst_b,
......@@ -380,34 +362,34 @@ Lloop_8x8_di:
vld2.8 {d2, d3}, [r9], r1
vld2.8 {d4, d5}, [r9], r1
vld2.8 {d6, d7}, [r9], r1
vld2.8 {d8, d9}, [r9], r1
vld2.8 {d10, d11}, [r9], r1
vld2.8 {d12, d13}, [r9], r1
vld2.8 {d14, d15}, [r9]
vld2.8 {d16, d17}, [r9], r1
vld2.8 {d18, d19}, [r9], r1
vld2.8 {d20, d21}, [r9], r1
vld2.8 {d22, d23}, [r9]
vtrn.8 q1, q0
vtrn.8 q3, q2
vtrn.8 q5, q4
vtrn.8 q7, q6
vtrn.8 q9, q8
vtrn.8 q11, q10
vtrn.16 q1, q3
vtrn.16 q0, q2
vtrn.16 q5, q7
vtrn.16 q4, q6
vtrn.16 q9, q11
vtrn.16 q8, q10
vtrn.32 q1, q5
vtrn.32 q0, q4
vtrn.32 q3, q7
vtrn.32 q2, q6
vtrn.32 q1, q9
vtrn.32 q0, q8
vtrn.32 q3, q11
vtrn.32 q2, q10
vrev16.8 q0, q0
vrev16.8 q1, q1
vrev16.8 q2, q2
vrev16.8 q3, q3
vrev16.8 q4, q4
vrev16.8 q5, q5
vrev16.8 q6, q6
vrev16.8 q7, q7
vrev16.8 q8, q8
vrev16.8 q9, q9
vrev16.8 q10, q10
vrev16.8 q11, q11
mov r9, r2
......@@ -415,10 +397,10 @@ Lloop_8x8_di:
vst1.8 {d0}, [r9], r3
vst1.8 {d6}, [r9], r3
vst1.8 {d4}, [r9], r3
vst1.8 {d10}, [r9], r3
vst1.8 {d8}, [r9], r3
vst1.8 {d14}, [r9], r3
vst1.8 {d12}, [r9]
vst1.8 {d18}, [r9], r3
vst1.8 {d16}, [r9], r3
vst1.8 {d22}, [r9], r3
vst1.8 {d20}, [r9]
mov r9, r4
......@@ -426,10 +408,10 @@ Lloop_8x8_di:
vst1.8 {d1}, [r9], r5
vst1.8 {d7}, [r9], r5
vst1.8 {d5}, [r9], r5
vst1.8 {d11}, [r9], r5
vst1.8 {d9}, [r9], r5
vst1.8 {d15}, [r9], r5
vst1.8 {d13}, [r9]
vst1.8 {d19}, [r9], r5
vst1.8 {d17}, [r9], r5
vst1.8 {d23}, [r9], r5
vst1.8 {d21}, [r9]
add r0, #8*2 @ src += 8*2
add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a
......@@ -462,45 +444,45 @@ Lblock_4x8_di:
vld1.64 {d7}, [r9]
adr r12, vtbl_4x4_transpose_di
vld1.8 {q7}, [r12]
vld1.8 {q15}, [r12]
vtrn.8 q0, q1
vtrn.8 q2, q3
vtbl.8 d8, {d0, d1}, d14
vtbl.8 d9, {d0, d1}, d15
vtbl.8 d10, {d2, d3}, d14
vtbl.8 d11, {d2, d3}, d15
vtbl.8 d12, {d4, d5}, d14
vtbl.8 d13, {d4, d5}, d15
vtbl.8 d0, {d6, d7}, d14
vtbl.8 d1, {d6, d7}, d15
vtbl.8 d16, {d0, d1}, d30
vtbl.8 d17, {d0, d1}, d31
vtbl.8 d18, {d2, d3}, d30
vtbl.8 d19, {d2, d3}, d31
vtbl.8 d20, {d4, d5}, d30
vtbl.8 d21, {d4, d5}, d31
vtbl.8 d22, {d6, d7}, d30
vtbl.8 d23, {d6, d7}, d31
mov r9, r2
vst1.32 {d8[0]}, [r9], r3
vst1.32 {d8[1]}, [r9], r3
vst1.32 {d9[0]}, [r9], r3
vst1.32 {d9[1]}, [r9], r3
vst1.32 {d16[0]}, [r9], r3
vst1.32 {d16[1]}, [r9], r3
vst1.32 {d17[0]}, [r9], r3
vst1.32 {d17[1]}, [r9], r3
add r9, r2, #4
vst1.32 {d12[0]}, [r9], r3
vst1.32 {d12[1]}, [r9], r3
vst1.32 {d13[0]}, [r9], r3
vst1.32 {d13[1]}, [r9]
vst1.32 {d20[0]}, [r9], r3
vst1.32 {d20[1]}, [r9], r3
vst1.32 {d21[0]}, [r9], r3
vst1.32 {d21[1]}, [r9]
mov r9, r4
vst1.32 {d10[0]}, [r9], r5
vst1.32 {d10[1]}, [r9], r5
vst1.32 {d11[0]}, [r9], r5
vst1.32 {d11[1]}, [r9], r5
vst1.32 {d18[0]}, [r9], r5
vst1.32 {d18[1]}, [r9], r5
vst1.32 {d19[0]}, [r9], r5
vst1.32 {d19[1]}, [r9], r5
add r9, r4, #4
vst1.32 {d0[0]}, [r9], r5
vst1.32 {d0[1]}, [r9], r5
vst1.32 {d1[0]}, [r9], r5
vst1.32 {d1[1]}, [r9]
vst1.32 {d22[0]}, [r9], r5
vst1.32 {d22[1]}, [r9], r5
vst1.32 {d23[0]}, [r9], r5
vst1.32 {d23[1]}, [r9]
add r0, #4*2 @ src += 4 * 2
add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a
......
......@@ -70,9 +70,7 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width) {
__asm__ volatile
(
"mov r4, #2\n" // rounding constant
"add %1, %0\n" // change the stride to row 2 pointer
"vdup.16 q4, r4\n"
"1:\n"
"vld1.u8 {q0,q1}, [%0]!\n" // load row 1 and post increment
"vld1.u8 {q2,q3}, [%1]!\n" // load row 2 and post increment
......@@ -80,10 +78,8 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
"vpaddl.u8 q1, q1\n"
"vpadal.u8 q0, q2\n" // row 2 add adjacent, add row 1 to row 2
"vpadal.u8 q1, q3\n"
"vadd.u16 q0, q4\n" // rounding
"vadd.u16 q1, q4\n"
"vshrn.u16 d0, q0, #2\n" // downshift and pack
"vshrn.u16 d1, q1, #2\n"
"vrshrn.u16 d0, q0, #2\n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2\n"
"vst1.u8 {q0}, [%2]!\n"
"subs %3, %3, #16\n" // 16 processed per loop
"bhi 1b\n"
......@@ -92,32 +88,28 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List
: "q0", "q1", "q2", "q3" // Clobber List
);
}
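A scalar sketch (for reference, assumed equivalent) of what the row function above computes: each output pixel is the rounded average of a 2x2 block. The vpaddl/vpadal pair-adds sum the block, and the rounding narrow shift (vrshrn #2) replaces the removed explicit add-2-then-vshrn sequence.

// Scalar sketch of ScaleRowDown2Int: rounded 2x2 box average.
static void ScaleRowDown2Int_Sketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;  // row 2
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // +2 rounds
    s += 2;
    t += 2;
  }
}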
#define HAS_SCALEROWDOWN4_NEON
// Expecting widths on ARM devices to be smaller, so went with 8x4 blocks
// to get the most coverage. Look back later and evaluate 16x4 blocks with
// handling of leftovers.
static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"mov r4, #4\n"
"1:\n"
"vld1.u8 {d0[0]}, [%0],r4\n" // load up only 2 pixels of data to
"vld1.u8 {d0[1]}, [%0],r4\n" // represent the entire 8x4 block
"vld2.u8 {d0, d1}, [%0]!\n"
"vtrn.u8 d1, d0\n"
"vshrn.u16 d0, q0, #8\n"
"vst1.u32 {d0[1]}, [%1]!\n"
"vst1.u16 {d0[0]}, [%1]!\n"
"subs %2, #2\n" // dst_width -= 2
"subs %2, #4\n"
"bhi 1b\n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "r4", "q0", "q1", "memory", "cc"
: "q0", "q1", "memory", "cc"
);
}
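For reference, a scalar sketch of the new 4-outputs-per-iteration shape of the unfiltered path. As far as the vld2/vtrn/vshrn sequence above works out, each output keeps the first pixel of a group of four — the same phase as the removed strided-load version; the function name is illustrative, not from the patch.

// Scalar sketch of ScaleRowDown4 (unfiltered): keep 1 pixel of every 4,
// producing 4 output pixels per iteration from 16 source pixels.
static void ScaleRowDown4_Sketch(const uint8_t* src_ptr,
                                 uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 4) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[4];
    dst_ptr[x + 2] = src_ptr[8];
    dst_ptr[x + 3] = src_ptr[12];
    src_ptr += 16;  // 16 source pixels consumed per iteration
  }
}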
......@@ -125,46 +117,35 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"add r4, %0, %3\n"
"add r5, r4, %3\n"
"add %3, r5, %3\n"
"1:\n"
"mov r4, %0\n"
"vld1.u8 {d0}, [r4],%3\n" // load up 8x4 block of input data
"vld1.u8 {d1}, [r4],%3\n"
"vld1.u8 {d2}, [r4],%3\n"
"vld1.u8 {d3}, [r4]\n"
// data is loaded up into q0 and q1
// q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
// q1 = a20 a21 a22 a23 b20 b21 b22 b23 a30 a31 a32 a33 b30 b31 b32 b33
// q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
"vpaddl.u8 q0, q0\n"
"vld1.u8 {q0}, [%0]!\n" // load up 16x4 block of input data
"vld1.u8 {q1}, [r4]!\n"
"vld1.u8 {q2}, [r5]!\n"
"vld1.u8 {q3}, [%3]!\n"
// d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
// d1 = a10+a11+a20+a21 a12+a13+a22+a23 b10+b11+b20+b21 b12+b13+b22+b23
"vpaddl.u8 q0, q0\n"
"vpadal.u8 q0, q1\n"
"vpadal.u8 q0, q2\n"
"vpadal.u8 q0, q3\n"
// d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
// d1 = a10+a11+a20+a21+a12+a13+a22+a23 b10+b11+b20+b21+b12+b13+b22+b23
"vpaddl.u16 q0, q0\n"
"vrshrn.u32 d0, q0, #4\n" // divide by 16 w/rounding
// d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a20+a21+a12+a13+a22+a23
// b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b20+b21+b12+b13+b22+b23
"vadd.u32 d0, d1\n"
"vrshr.u32 d0, d0, #4\n" // divide by 16 w/rounding
"vst1.u8 {d0[0]}, [%1]!\n"
"vst1.u8 {d0[4]}, [%1]!\n"
"vmovn.u16 d0, q0\n"
"vst1.u32 {d0[0]}, [%1]!\n"
"add %0, #8\n" // move src pointer to next 8 pixels
"subs %2, #2\n" // dst_width -= 2
"subs %2, #4\n"
"bhi 1b\n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "q0", "q1", "memory", "cc"
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
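A scalar sketch of the filtered path above: the vpaddl/vpadal chain sums a full 4x4 block into each 32-bit lane, and vrshr #4 divides by 16 with rounding, so per output pixel:

// Scalar sketch of ScaleRowDown4Int: rounded 4x4 box average.
static void ScaleRowDown4Int_Sketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_ptr + x * 4;
    uint32_t sum = 0;
    for (int r = 0; r < 4; ++r) {
      for (int c = 0; c < 4; ++c) {
        sum += s[r * src_stride + c];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // divide by 16 with rounding
  }
}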
......@@ -194,7 +175,7 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"vmov.u8 d16, #3\n"
"vmov.u8 d24, #3\n"
"add %3, %0\n"
"1:\n"
"vld4.u8 {d0, d1, d2, d3}, [%0]!\n" // src line 0
......@@ -203,35 +184,35 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q4, d4\n"
"vmovl.u8 q5, d5\n"
"vmovl.u8 q6, d6\n"
"vmovl.u8 q7, d7\n"
"vmovl.u8 q8, d4\n"
"vmovl.u8 q9, d5\n"
"vmovl.u8 q10, d6\n"
"vmovl.u8 q11, d7\n"
// 3 * line_0 + line_1
"vmlal.u8 q4, d0, d16\n"
"vmlal.u8 q5, d1, d16\n"
"vmlal.u8 q6, d2, d16\n"
"vmlal.u8 q7, d3, d16\n"
"vmlal.u8 q8, d0, d24\n"
"vmlal.u8 q9, d1, d24\n"
"vmlal.u8 q10, d2, d24\n"
"vmlal.u8 q11, d3, d24\n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q4, #2\n"
"vqrshrn.u16 d1, q5, #2\n"
"vqrshrn.u16 d2, q6, #2\n"
"vqrshrn.u16 d3, q7, #2\n"
"vqrshrn.u16 d0, q8, #2\n"
"vqrshrn.u16 d1, q9, #2\n"
"vqrshrn.u16 d2, q10, #2\n"
"vqrshrn.u16 d3, q11, #2\n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q4, d1\n"
"vmlal.u8 q4, d0, d16\n"
"vqrshrn.u16 d0, q4, #2\n"
"vmovl.u8 q8, d1\n"
"vmlal.u8 q8, d0, d24\n"
"vqrshrn.u16 d0, q8, #2\n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2\n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q4, d2\n"
"vmlal.u8 q4, d3, d16\n"
"vqrshrn.u16 d2, q4, #2\n"
"vmovl.u8 q8, d2\n"
"vmlal.u8 q8, d3, d24\n"
"vqrshrn.u16 d2, q8, #2\n"
"vst3.u8 {d0, d1, d2}, [%1]!\n"
......@@ -242,7 +223,7 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "d17", "memory", "cc"
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
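A scalar sketch of the 3/4 filtered path above (the "_0" variant), matching the comments in the assembly: the two rows are blended 3:1 with rounding, then every 4 blended pixels map to 3 outputs with weights 3:1, 1:1 and 1:3.

// Scalar sketch of ScaleRowDown34_0_Int: 3:1 vertical blend, then a
// 4 -> 3 horizontal filter per group of source pixels.
static void ScaleRowDown34_0_Int_Sketch(const uint8_t* src_ptr, int src_stride,
                                        uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    int t[4];
    for (int i = 0; i < 4; ++i) {
      t[i] = (3 * s0[i] + s1[i] + 2) >> 2;  // 3 * line_0 + line_1, rounded
    }
    dst_ptr[x + 0] = (uint8_t)((3 * t[0] + t[1] + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((t[1] + t[2] + 1) >> 1);
    dst_ptr[x + 2] = (uint8_t)((t[2] + 3 * t[3] + 2) >> 2);
    s0 += 4;
    s1 += 4;
  }
}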
......@@ -250,11 +231,11 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"vmov.u8 d10, #3\n"
"vmov.u8 d24, #3\n"
"add %3, %0\n"
"1:\n"
"vld4.u8 {d0, d1, d2, d3}, [%0]!\n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]!\n" // src line 1
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2\n"
......@@ -262,7 +243,7 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1\n"
"vmlal.u8 q3, d0, d10\n"
"vmlal.u8 q3, d0, d24\n"
"vqrshrn.u16 d0, q3, #2\n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
......@@ -270,7 +251,7 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2\n"
"vmlal.u8 q3, d3, d10\n"
"vmlal.u8 q3, d3, d24\n"
"vqrshrn.u16 d2, q3, #2\n"
"vst3.u8 {d0, d1, d2}, [%1]!\n"
......@@ -282,7 +263,7 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d10", "memory", "cc"
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
......@@ -325,9 +306,9 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"vld1.u16 {q4}, [%4]\n"
"vld1.u8 {q5}, [%5]\n"
"vld1.u8 {q8}, [%6]\n"
"vld1.u16 {q13}, [%4]\n"
"vld1.u8 {q14}, [%5]\n"
"vld1.u8 {q15}, [%6]\n"
"add r4, %0, %3, lsl #1\n"
"add %3, %0\n"
"1:\n"
......@@ -338,7 +319,7 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]!\n"
"vld4.u8 {d4, d5, d6, d7}, [%3]!\n"
"vld4.u8 {d12, d13, d14, d15}, [r4]!\n"
"vld4.u8 {d16, d17, d18, d19}, [r4]!\n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -346,35 +327,35 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1\n"
"vtrn.u8 d4, d5\n"
"vtrn.u8 d12, d13\n"
"vtrn.u8 d16, d17\n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3\n"
"vtrn.u8 d6, d7\n"
"vtrn.u8 d14, d15\n"
"vtrn.u8 d18, d19\n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0\n"
"vpaddl.u8 q2, q2\n"
"vpaddl.u8 q6, q6\n"
"vpaddl.u8 q8, q8\n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3\n"
"vpaddl.u8 d7, d7\n"
"vpaddl.u8 d15, d15\n"
"vpaddl.u8 d19, d19\n"
// combine source lines
"vadd.u16 q0, q2\n"
"vadd.u16 q0, q6\n"
"vadd.u16 q0, q8\n"
"vadd.u16 d4, d3, d7\n"
"vadd.u16 d4, d15\n"
"vadd.u16 d4, d19\n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q4\n"
"vqrdmulh.s16 q2, q13\n"
"vmovn.u16 d4, q2\n"
// Shuffle 2,3 reg around so that 2 can be added to the
......@@ -385,11 +366,11 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2\n"
"vmovl.u8 q3, d6\n"
"vmovl.u8 q7, d14\n"
"vmovl.u8 q9, d18\n"
// combine source lines
"vadd.u16 q1, q3\n"
"vadd.u16 q1, q7\n"
"vadd.u16 q1, q9\n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
......@@ -405,14 +386,14 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q8\n"
"vqrdmulh.s16 q0, q15\n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4\n"
"vtbl.u8 d3, {d0, d1, d2}, d10\n"
"vtbl.u8 d4, {d0, d1, d2}, d11\n"
"vtbl.u8 d3, {d0, d1, d2}, d28\n"
"vtbl.u8 d4, {d0, d1, d2}, d29\n"
"vst1.u8 {d3}, [%1]!\n"
"vst1.u32 {d4[0]}, [%1]!\n"
......@@ -425,8 +406,8 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
: "r"(mult38_div6), // %4
"r"(shuf38_2), // %5
"r"(mult38_div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q4",
"q5", "q6", "q7", "q8", "memory", "cc"
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
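The vqrdmulh step implements the comment above ("multiply by 65536 / n and take the upper 16 bits"). A minimal arithmetic sketch of that reciprocal trick follows, with an illustrative constant for n = 6; the real tables (mult38_div6, mult38_div9) are defined elsewhere in this file, and vqrdmulh itself is a doubling Q15 multiply, so its constants are stored in a different fixed-point format. The sketch only shows the underlying idea.

// Reciprocal-multiply sketch: divide a small 16-bit sum by 6 without a
// divide instruction. 10923 ~= 65536 / 6, so the product's upper 16 bits
// approximate sum / 6 for the sums produced here (at most 6 * 255).
static inline uint8_t DivideBy6_Sketch(uint16_t sum) {
  return (uint8_t)(((uint32_t)sum * 10923u) >> 16);
}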
......@@ -435,8 +416,8 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"vld1.u16 {q4}, [%4]\n"
"vld1.u8 {q5}, [%5]\n"
"vld1.u16 {q13}, [%4]\n"
"vld1.u8 {q14}, [%5]\n"
"add %3, %0\n"
"1:\n"
......@@ -501,14 +482,14 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q4\n"
"vqrdmulh.s16 q0, q13\n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4\n"
"vtbl.u8 d3, {d0, d1, d2}, d10\n"
"vtbl.u8 d4, {d0, d1, d2}, d11\n"
"vtbl.u8 d3, {d0, d1, d2}, d28\n"
"vtbl.u8 d4, {d0, d1, d2}, d29\n"
"vst1.u8 {d3}, [%1]!\n"
"vst1.u32 {d4[0]}, [%1]!\n"
......@@ -520,7 +501,7 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
"+r"(src_stride) // %3
: "r"(mult38_div6), // %4
"r"(shuf38_2) // %5
: "q0", "q1", "q2", "q3", "q4", "q5", "memory", "cc"
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
......@@ -3090,9 +3071,7 @@ static void ScalePlaneDown2(int src_width, int src_height,
#if defined(HAS_SCALEROWDOWN2_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(dst_width % 16 == 0) && (src_stride % 16 == 0) &&
(dst_stride % 16 == 0) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
(dst_width % 16 == 0)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
} else
#endif
......@@ -3132,8 +3111,7 @@ static void ScalePlaneDown4(int src_width, int src_height,
#if defined(HAS_SCALEROWDOWN4_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(dst_width % 2 == 0) && (src_stride % 8 == 0) &&
IS_ALIGNED(src_ptr, 8)) {
(dst_width % 4 == 0)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
} else
#endif
......@@ -3209,7 +3187,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
uint8* dst_ptr, int dst_width);
#if defined(HAS_SCALEROWDOWN34_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(dst_width % 24 == 0) && (dst_stride % 8 == 0)) {
(dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON;
......@@ -3298,7 +3276,7 @@ static void ScalePlaneDown38(int src_width, int src_height,
uint8* dst_ptr, int dst_width);
#if defined(HAS_SCALEROWDOWN38_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(dst_width % 24 == 0)) {
(dst_width % 12 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON;
......
......@@ -105,7 +105,7 @@ static int TestFilter(int src_width, int src_height,
align_buffer_16(dst_u_opt, dst_uv_plane_size)
align_buffer_16(dst_v_opt, dst_uv_plane_size)
libyuv::MaskCpuFlags(0);
libyuv::MaskCpuFlags(kCpuInitialized);
double c_time = get_time();
for (i = 0; i < runs; ++i)
......@@ -137,8 +137,6 @@ static int TestFilter(int src_width, int src_height,
printf ("filter %d - %8d us c - %8d us opt\n",
f, (int)(c_time*1e6), (int)(opt_time*1e6));
::testing::Test::RecordProperty("C", (int)c_time);
::testing::Test::RecordProperty("Opt", (int)opt_time);
// C version may be a little off from the optimized version. Order of
// operations may introduce rounding somewhere. So do a difference
......