Commit 60ccea47 authored by zhongwei.yao@arm.com

add TransposeWx8_NEON's aarch64 implementation

BUG=319
TESTED=libyuv_unittest
R=fbarchard@chromium.org, fbarchard@google.com

Review URL: https://webrtc-codereview.appspot.com/20259004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1081 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 44c4d0f3
......@@ -55,15 +55,17 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
// The following symbols are temporarily enabled for aarch64, until all
// NEON-optimized functions have been ported to aarch64.
#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__aarch64__) || defined(LIBYUV_NEON))
// #define HAS_MIRRORROW_NEON
// void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
// #define HAS_MIRRORROW_UV_NEON
// void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
// #define HAS_TRANSPOSE_WX8_NEON
// void TransposeWx8_NEON(const uint8* src, int src_stride,
// uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
......
......@@ -18,9 +18,6 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// This ifdef should be removed once the aarch64 implementation of
// TransposeWx8_NEON is done.
#ifdef HAS_TRANSPOSE_WX8_NEON
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
......@@ -32,215 +29,220 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
// Loops operate on blocks of 8. The loop stops when the
// counter reaches or drops below 0. Starting the counter
// at w-8 allows for this.
"sub %5, #8 \n"
"sub %3, %3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], %2 \n"
"ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"vld1.8 {d1}, [%0], %2 \n"
"ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], %2 \n"
"ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"vld1.8 {d3}, [%0], %2 \n"
"ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], %2 \n"
"ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"vld1.8 {d5}, [%0], %2 \n"
"ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], %2 \n"
"ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"vld1.8 {d7}, [%0] \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
"vtrn.8 d5, d4 \n"
"vtrn.8 d7, d6 \n"
"ld1 {v7.8b}, [%0] \n"
"vtrn.16 d1, d3 \n"
"vtrn.16 d0, d2 \n"
"vtrn.16 d5, d7 \n"
"vtrn.16 d4, d6 \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
"trn1 v17.8b, v0.8b, v1.8b \n"
"trn2 v18.8b, v2.8b, v3.8b \n"
"trn1 v19.8b, v2.8b, v3.8b \n"
"trn2 v20.8b, v4.8b, v5.8b \n"
"trn1 v21.8b, v4.8b, v5.8b \n"
"trn2 v22.8b, v6.8b, v7.8b \n"
"trn1 v23.8b, v6.8b, v7.8b \n"
"vtrn.32 d1, d5 \n"
"vtrn.32 d0, d4 \n"
"vtrn.32 d3, d7 \n"
"vtrn.32 d2, d6 \n"
"trn2 v3.4h, v17.4h, v19.4h \n"
"trn1 v1.4h, v17.4h, v19.4h \n"
"trn2 v2.4h, v16.4h, v18.4h \n"
"trn1 v0.4h, v16.4h, v18.4h \n"
"trn2 v7.4h, v21.4h, v23.4h \n"
"trn1 v5.4h, v21.4h, v23.4h \n"
"trn2 v6.4h, v20.4h, v22.4h \n"
"trn1 v4.4h, v20.4h, v22.4h \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"trn2 v21.2s, v1.2s, v5.2s \n"
"trn1 v17.2s, v1.2s, v5.2s \n"
"trn2 v20.2s, v0.2s, v4.2s \n"
"trn1 v16.2s, v0.2s, v4.2s \n"
"trn2 v23.2s, v3.2s, v7.2s \n"
"trn1 v19.2s, v3.2s, v7.2s \n"
"trn2 v22.2s, v2.2s, v6.2s \n"
"trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %3 \n"
"mov %0, %2 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %4 \n"
"st1 {v17.8b}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
"st1 {v16.8b}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %4 \n"
"st1 {v19.8b}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
"st1 {v18.8b}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %4 \n"
"st1 {v21.8b}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
"st1 {v20.8b}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %4 \n"
"st1 {v23.8b}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0] \n"
"st1 {v22.8b}, [%0] \n"
"add %1, #8 \n" // src += 8
"add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
"subs %5, #8 \n" // w -= 8
"bge 1b \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
"subs %3, %3, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %5, #8 \n"
"beq 4f \n"
"adds %3, %3, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %5, #2 \n"
"blt 3f \n"
"cmp %3, #2 \n"
"blt 3f \n"
"cmp %5, #4 \n"
"blt 2f \n"
"cmp %3, #4 \n"
"blt 2f \n"
// 4x8 block
"mov %0, %1 \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], %2 \n"
"ld1 {v0.s}[0], [%0], %5 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], %2 \n"
"ld1 {v0.s}[1], [%0], %5 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], %2 \n"
"ld1 {v0.s}[2], [%0], %5 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], %2 \n"
"ld1 {v0.s}[3], [%0], %5 \n"
MEMACCESS(0)
"vld1.32 {d2[0]}, [%0], %2 \n"
"ld1 {v1.s}[0], [%0], %5 \n"
MEMACCESS(0)
"vld1.32 {d2[1]}, [%0], %2 \n"
"ld1 {v1.s}[1], [%0], %5 \n"
MEMACCESS(0)
"vld1.32 {d3[0]}, [%0], %2 \n"
"ld1 {v1.s}[2], [%0], %5 \n"
MEMACCESS(0)
"vld1.32 {d3[1]}, [%0] \n"
"ld1 {v1.s}[3], [%0] \n"
"mov %0, %3 \n"
"mov %0, %2 \n"
MEMACCESS(6)
"vld1.8 {q3}, [%6] \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%4] \n"
"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
"vtbl.8 d0, {d2, d3}, d6 \n"
"vtbl.8 d1, {d2, d3}, d7 \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
"tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
MEMACCESS(0)
"vst1.32 {d4[0]}, [%0], %4 \n"
"st1 {v3.s}[0], [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d4[1]}, [%0], %4 \n"
"st1 {v3.s}[1], [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d5[0]}, [%0], %4 \n"
"st1 {v3.s}[2], [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d5[1]}, [%0] \n"
"st1 {v3.s}[3], [%0] \n"
"add %0, %3, #4 \n"
"add %0, %2, #4 \n"
MEMACCESS(0)
"vst1.32 {d0[0]}, [%0], %4 \n"
"st1 {v0.s}[0], [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d0[1]}, [%0], %4 \n"
"st1 {v0.s}[1], [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d1[0]}, [%0], %4 \n"
"st1 {v0.s}[2], [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d1[1]}, [%0] \n"
"st1 {v0.s}[3], [%0] \n"
"add %1, #4 \n" // src += 4
"add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
"subs %5, #4 \n" // w -= 4
"beq 4f \n"
"add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %3, %3, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %5, #2 \n"
"blt 3f \n"
"cmp %3, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.16 {d0[0]}, [%0], %2 \n"
"ld1 {v0.h}[0], [%0], %5 \n"
MEMACCESS(0)
"vld1.16 {d1[0]}, [%0], %2 \n"
"ld1 {v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"vld1.16 {d0[1]}, [%0], %2 \n"
"ld1 {v0.h}[1], [%0], %5 \n"
MEMACCESS(0)
"vld1.16 {d1[1]}, [%0], %2 \n"
"ld1 {v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"vld1.16 {d0[2]}, [%0], %2 \n"
"ld1 {v0.h}[2], [%0], %5 \n"
MEMACCESS(0)
"vld1.16 {d1[2]}, [%0], %2 \n"
"ld1 {v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"vld1.16 {d0[3]}, [%0], %2 \n"
"ld1 {v0.h}[3], [%0], %5 \n"
MEMACCESS(0)
"vld1.16 {d1[3]}, [%0] \n"
"ld1 {v1.h}[3], [%0] \n"
"vtrn.8 d0, d1 \n"
"trn2 v2.8b, v0.8b, v1.8b \n"
"trn1 v3.8b, v0.8b, v1.8b \n"
"mov %0, %3 \n"
"mov %0, %2 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
"st1 {v3.8b}, [%0], %6 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0] \n"
"st1 {v2.8b}, [%0] \n"
"add %1, #2 \n" // src += 2
"add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
"subs %5, #2 \n" // w -= 2
"beq 4f \n"
"add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %3, %3, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
"3: \n"
MEMACCESS(1)
"vld1.8 {d0[0]}, [%1], %2 \n"
"ld1 {v0.b}[0], [%1], %5 \n"
MEMACCESS(1)
"vld1.8 {d0[1]}, [%1], %2 \n"
"ld1 {v0.b}[1], [%1], %5 \n"
MEMACCESS(1)
"vld1.8 {d0[2]}, [%1], %2 \n"
"ld1 {v0.b}[2], [%1], %5 \n"
MEMACCESS(1)
"vld1.8 {d0[3]}, [%1], %2 \n"
"ld1 {v0.b}[3], [%1], %5 \n"
MEMACCESS(1)
"vld1.8 {d0[4]}, [%1], %2 \n"
"ld1 {v0.b}[4], [%1], %5 \n"
MEMACCESS(1)
"vld1.8 {d0[5]}, [%1], %2 \n"
"ld1 {v0.b}[5], [%1], %5 \n"
MEMACCESS(1)
"vld1.8 {d0[6]}, [%1], %2 \n"
"ld1 {v0.b}[6], [%1], %5 \n"
MEMACCESS(1)
"vld1.8 {d0[7]}, [%1] \n"
"ld1 {v0.b}[7], [%1] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
MEMACCESS(2)
"st1 {v0.8b}, [%2] \n"
"4: \n"
"4: \n"
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst), // %3
"+r"(dst_stride), // %4
"+r"(width) // %5
: "r"(&kVTbl4x4Transpose) // %6
: "memory", "cc", "q0", "q1", "q2", "q3"
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"((ptrdiff_t)src_stride), // %5
"r"((ptrdiff_t)dst_stride) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
}
#endif //HAS_TRANSPOSE_WX8_NEON
static uint8 kVTbl4x4TransposeDi[32] =
{ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment