Commit 1f461f73 authored by Frank Barchard

remove align directives

R=harryjin@google.com
BUG=none

Review URL: https://webrtc-codereview.appspot.com/54809004.
parent 6e7ef3fd
...@@ -27,7 +27,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -27,7 +27,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q9, #0 \n" "vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n" "vmov.u8 q11, #0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" "vld1.8 {q0}, [%0]! \n"
......
...@@ -26,7 +26,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -26,7 +26,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"eor v17.16b, v17.16b, v17.16b \n" "eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n" "eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" "ld1 {v0.16b}, [%0], #16 \n"
......
...@@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 2 \n" LABELALIGN
"1: \n" "1: \n"
"movq (%0),%%xmm0 \n" "movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n" "movq (%0,%3),%%xmm1 \n"
...@@ -114,7 +114,7 @@ void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, ...@@ -114,7 +114,7 @@ void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 2 \n" LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n" "movdqu (%0,%3),%%xmm1 \n"
...@@ -256,7 +256,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -256,7 +256,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 2 \n" LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n" "movdqu (%0,%4),%%xmm1 \n"
......
...@@ -35,7 +35,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ...@@ -35,7 +35,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %5, #8 \n" "sub %5, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
...@@ -256,7 +255,6 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, ...@@ -256,7 +255,6 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %7, #8 \n" "sub %7, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
......
...@@ -141,101 +141,6 @@ static uvec8 kShuffleMaskARGBToRAW_0 = { ...@@ -141,101 +141,6 @@ static uvec8 kShuffleMaskARGBToRAW_0 = {
}; };
#endif // HAS_RGB24TOARGBROW_SSSE3 #endif // HAS_RGB24TOARGBROW_SSSE3
#if defined(TESTING) && defined(__x86_64__)
// Test-only routine: compiled only when both TESTING and __x86_64__ are
// defined.  The asm body is a series of instruction groups, each preceded by
// ".p2align 5" (alignment to a 2^5 = 32 byte boundary), followed by a small
// copy loop.
// NOTE(review): the groups look like a scratch area for comparing the
// encodings of mov / lea / add on legacy vs. r8-r15 registers rather than
// code meant to be executed -- confirm before calling this function.
// NOTE(review): the groups write to general-purpose registers (including
// esp and ebp) that do not appear in the clobber list below.
//
// Parameters as used by the loop at label 1:
//   src_y    - operand %0, source pointer, advanced by 0x8 per iteration.
//   dst_argb - operand %1, destination pointer, advanced by 0x20 per iteration.
//   pix      - operand %2, counter, decremented by 8 per iteration.
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
// Group: 32-bit register-to-itself moves, legacy registers.
".p2align 5 \n"
"mov %%eax,%%eax \n"
"mov %%ebx,%%ebx \n"
"mov %%ecx,%%ecx \n"
"mov %%edx,%%edx \n"
"mov %%esi,%%esi \n"
"mov %%edi,%%edi \n"
"mov %%ebp,%%ebp \n"
"mov %%esp,%%esp \n"
// Group: 32-bit register-to-itself moves, r8d-r15d.
".p2align 5 \n"
"mov %%r8d,%%r8d \n"
"mov %%r9d,%%r9d \n"
"mov %%r10d,%%r10d \n"
"mov %%r11d,%%r11d \n"
"mov %%r12d,%%r12d \n"
"mov %%r13d,%%r13d \n"
"mov %%r14d,%%r14d \n"
"mov %%r15d,%%r15d \n"
// Group: lea with a 64-bit base and no displacement into the matching
// 32-bit register, legacy registers.
".p2align 5 \n"
"lea (%%rax),%%eax \n"
"lea (%%rbx),%%ebx \n"
"lea (%%rcx),%%ecx \n"
"lea (%%rdx),%%edx \n"
"lea (%%rsi),%%esi \n"
"lea (%%rdi),%%edi \n"
"lea (%%rbp),%%ebp \n"
"lea (%%rsp),%%esp \n"
// Group: same lea form, r8-r15.
".p2align 5 \n"
"lea (%%r8),%%r8d \n"
"lea (%%r9),%%r9d \n"
"lea (%%r10),%%r10d \n"
"lea (%%r11),%%r11d \n"
"lea (%%r12),%%r12d \n"
"lea (%%r13),%%r13d \n"
"lea (%%r14),%%r14d \n"
"lea (%%r15),%%r15d \n"
// Group: lea with a 0x10 displacement, legacy registers.
".p2align 5 \n"
"lea 0x10(%%rax),%%eax \n"
"lea 0x10(%%rbx),%%ebx \n"
"lea 0x10(%%rcx),%%ecx \n"
"lea 0x10(%%rdx),%%edx \n"
"lea 0x10(%%rsi),%%esi \n"
"lea 0x10(%%rdi),%%edi \n"
"lea 0x10(%%rbp),%%ebp \n"
"lea 0x10(%%rsp),%%esp \n"
// Group: lea with a 0x10 displacement, r8-r15.
".p2align 5 \n"
"lea 0x10(%%r8),%%r8d \n"
"lea 0x10(%%r9),%%r9d \n"
"lea 0x10(%%r10),%%r10d \n"
"lea 0x10(%%r11),%%r11d \n"
"lea 0x10(%%r12),%%r12d \n"
"lea 0x10(%%r13),%%r13d \n"
"lea 0x10(%%r14),%%r14d \n"
"lea 0x10(%%r15),%%r15d \n"
// Group: add with a 0x10 source operand, legacy registers.
// NOTE(review): the operand is written "0x10" without the "$" prefix that
// the "sub $0x8,%2" below uses for an immediate, so in AT&T syntax it is a
// memory operand at absolute address 0x10 -- confirm whether "$0x10" was
// intended.
".p2align 5 \n"
"add 0x10,%%eax \n"
"add 0x10,%%ebx \n"
"add 0x10,%%ecx \n"
"add 0x10,%%edx \n"
"add 0x10,%%esi \n"
"add 0x10,%%edi \n"
"add 0x10,%%ebp \n"
"add 0x10,%%esp \n"
// Group: same add form, r8d-r15d.
".p2align 5 \n"
"add 0x10,%%r8d \n"
"add 0x10,%%r9d \n"
"add 0x10,%%r10d \n"
"add 0x10,%%r11d \n"
"add 0x10,%%r12d \n"
"add 0x10,%%r13d \n"
"add 0x10,%%r14d \n"
"add 0x10,%%r15d \n"
// Copy loop, aligned to a 4 byte boundary.  MEMACCESS and MEMLEA are macros
// defined outside this block; presumably they expand to the "(%N)" and
// "disp(%N)" addressing forms -- verify against their definitions.
".p2align 2 \n"
"1: \n"
// Load 64 bits from the operand-0 address into xmm0.
"movq " MEMACCESS(0) ",%%xmm0 \n"
// Advance operand 0 (src_y) by 0x8.
"lea " MEMLEA(0x8,0) ",%0 \n"
// Unaligned 128-bit store of xmm0 to the operand-1 address.
"movdqu %%xmm0," MEMACCESS(1) " \n"
// Advance operand 1 (dst_argb) by 0x20.
// NOTE(review): only 16 bytes are stored per iteration while the pointer
// moves by 32 -- confirm this is intentional.
"lea " MEMLEA(0x20,1) ",%1 \n"
// Decrement the counter by 8 and loop while the result is greater than zero.
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
// NOTE(review): xmm1 and xmm5 are listed as clobbered but are not referenced
// in the asm above.
: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
#endif // TESTING
#ifdef HAS_J400TOARGBROW_SSE2 #ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
......
...@@ -389,7 +389,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -389,7 +389,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez $t4, 2f \n" "blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual " andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n" "1: \n"
"addiu $t4, $t4, -1 \n" "addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
...@@ -457,7 +456,6 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { ...@@ -457,7 +456,6 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
"blez $t4, 2f \n" "blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width " addu %[src], %[src], %[width] \n" // src += width
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0| "lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4| "lw $t1, -12(%[src]) \n" // |7|6|5|4|
...@@ -512,7 +510,6 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -512,7 +510,6 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez %[x], 2f \n" "blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n" " addu %[src_uv], %[src_uv], $t4 \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
...@@ -673,7 +670,6 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, ...@@ -673,7 +670,6 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
"lui $s6, 0xff00 \n" "lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
".p2align 2 \n"
"1: \n" "1: \n"
I422ToTransientMipsRGB I422ToTransientMipsRGB
// Arranging into argb format // Arranging into argb format
...@@ -735,7 +731,6 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, ...@@ -735,7 +731,6 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
"lui $s6, 0xff00 \n" "lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00| "ori $s6, 0xff00 \n" // |ff|00|ff|00|
".p2align 2 \n"
"1: \n" "1: \n"
I422ToTransientMipsRGB I422ToTransientMipsRGB
// Arranging into abgr format // Arranging into abgr format
...@@ -797,7 +792,6 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, ...@@ -797,7 +792,6 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
"lui $s6, 0xff \n" "lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff| "ori $s6, 0xff \n" // |00|ff|00|ff|
".p2align 2 \n"
"1: \n" "1: \n"
I422ToTransientMipsRGB I422ToTransientMipsRGB
// Arranging into bgra format // Arranging into bgra format
...@@ -857,7 +851,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -857,7 +851,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
"replv.ph $t0, %[y0_fraction] \n" "replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n" "replv.ph $t1, %[source_y_fraction] \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t2, 0(%[src_ptr]) \n" "lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n" "lw $t3, 0(%[src_ptr1]) \n"
......
...@@ -174,7 +174,6 @@ void I444ToARGBRow_NEON(const uint8* src_y, ...@@ -174,7 +174,6 @@ void I444ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV444 READYUV444
YUV422TORGB YUV422TORGB
...@@ -204,7 +203,6 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -204,7 +203,6 @@ void I422ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -234,7 +232,6 @@ void I411ToARGBRow_NEON(const uint8* src_y, ...@@ -234,7 +232,6 @@ void I411ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV411 READYUV411
YUV422TORGB YUV422TORGB
...@@ -264,7 +261,6 @@ void I422ToBGRARow_NEON(const uint8* src_y, ...@@ -264,7 +261,6 @@ void I422ToBGRARow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -295,7 +291,6 @@ void I422ToABGRRow_NEON(const uint8* src_y, ...@@ -295,7 +291,6 @@ void I422ToABGRRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -326,7 +321,6 @@ void I422ToRGBARow_NEON(const uint8* src_y, ...@@ -326,7 +321,6 @@ void I422ToRGBARow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -356,7 +350,6 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ...@@ -356,7 +350,6 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -385,7 +378,6 @@ void I422ToRAWRow_NEON(const uint8* src_y, ...@@ -385,7 +378,6 @@ void I422ToRAWRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -427,7 +419,6 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ...@@ -427,7 +419,6 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -472,7 +463,6 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -472,7 +463,6 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -513,7 +503,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -513,7 +503,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic. "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
...@@ -542,7 +531,6 @@ void I400ToARGBRow_NEON(const uint8* src_y, ...@@ -542,7 +531,6 @@ void I400ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV400 READYUV400
YUV422TORGB YUV422TORGB
...@@ -568,7 +556,6 @@ void J400ToARGBRow_NEON(const uint8* src_y, ...@@ -568,7 +556,6 @@ void J400ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n" "vld1.8 {d20}, [%0]! \n"
...@@ -592,7 +579,6 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -592,7 +579,6 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB YUV422TORGB
...@@ -620,7 +606,6 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -620,7 +606,6 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB YUV422TORGB
...@@ -648,7 +633,6 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -648,7 +633,6 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB YUV422TORGB
...@@ -676,7 +660,6 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, ...@@ -676,7 +660,6 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB YUV422TORGB
...@@ -703,7 +686,6 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -703,7 +686,6 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUY2 READYUY2
YUV422TORGB YUV422TORGB
...@@ -729,7 +711,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -729,7 +711,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READUYVY READUYVY
YUV422TORGB YUV422TORGB
...@@ -754,7 +735,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -754,7 +735,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
...@@ -777,7 +757,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -777,7 +757,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) { int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q0}, [%0]! \n" // load U
...@@ -800,7 +779,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -800,7 +779,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
...@@ -855,7 +833,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -855,7 +833,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2 \n" "add %0, %0, %2 \n"
"sub %0, #16 \n" "sub %0, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "vld1.8 {q0}, [%0], r3 \n" // src -= 16
...@@ -882,7 +859,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -882,7 +859,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"add %0, %0, %3, lsl #1 \n" "add %0, %0, %3, lsl #1 \n"
"sub %0, #16 \n" "sub %0, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
...@@ -909,7 +885,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -909,7 +885,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2, lsl #2 \n" "add %0, %0, %2, lsl #2 \n"
"sub %0, #16 \n" "sub %0, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "vld1.8 {q0}, [%0], r3 \n" // src -= 16
...@@ -931,7 +906,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -931,7 +906,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #255 \n" // Alpha "vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
...@@ -950,7 +924,6 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -950,7 +924,6 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #255 \n" // Alpha "vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
...@@ -982,7 +955,6 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { ...@@ -982,7 +955,6 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
...@@ -1030,7 +1002,6 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, ...@@ -1030,7 +1002,6 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
int pix) { int pix) {
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
...@@ -1061,7 +1032,6 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, ...@@ -1061,7 +1032,6 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
int pix) { int pix) {
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
...@@ -1080,7 +1050,6 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, ...@@ -1080,7 +1050,6 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
...@@ -1098,7 +1067,6 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ...@@ -1098,7 +1067,6 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
...@@ -1117,7 +1085,6 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ...@@ -1117,7 +1085,6 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
...@@ -1135,7 +1102,6 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ...@@ -1135,7 +1102,6 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
...@@ -1154,7 +1120,6 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ...@@ -1154,7 +1120,6 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) { int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
...@@ -1176,7 +1141,6 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ...@@ -1176,7 +1141,6 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
int pix) { int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
...@@ -1199,7 +1163,6 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1199,7 +1163,6 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // stride + src_yuy2 "add %1, %0, %1 \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
...@@ -1227,7 +1190,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1227,7 +1190,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // stride + src_uyvy "add %1, %0, %1 \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
...@@ -1279,7 +1241,6 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1279,7 +1241,6 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_yuy2, int width) { uint8* dst_yuy2, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
...@@ -1306,7 +1267,6 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1306,7 +1267,6 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_uyvy, int width) { uint8* dst_uyvy, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
...@@ -1330,7 +1290,6 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1330,7 +1290,6 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
...@@ -1350,7 +1309,6 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { ...@@ -1350,7 +1309,6 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) { const uint32 dither4, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"vdup.32 d2, %2 \n" // dither4 "vdup.32 d2, %2 \n" // dither4
"1: \n" "1: \n"
MEMACCESS(1) MEMACCESS(1)
...@@ -1374,7 +1332,6 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, ...@@ -1374,7 +1332,6 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) { int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
...@@ -1395,7 +1352,6 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, ...@@ -1395,7 +1352,6 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
int pix) { int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic. "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
...@@ -1418,7 +1374,6 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1418,7 +1374,6 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
...@@ -1444,7 +1399,6 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1444,7 +1399,6 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
...@@ -1474,7 +1428,6 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1474,7 +1428,6 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u8 d27, #18 \n" // VB -0.1406 coefficient "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
"vmov.u8 d28, #94 \n" // VG -0.7344 coefficient "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
...@@ -1516,7 +1469,6 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1516,7 +1469,6 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
...@@ -1566,7 +1518,6 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1566,7 +1518,6 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
...@@ -1644,7 +1595,6 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1644,7 +1595,6 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
...@@ -1694,7 +1644,6 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1694,7 +1644,6 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
...@@ -1743,7 +1692,6 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ...@@ -1743,7 +1692,6 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
...@@ -1792,7 +1740,6 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ...@@ -1792,7 +1740,6 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
...@@ -1841,7 +1788,6 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ...@@ -1841,7 +1788,6 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
...@@ -1890,7 +1836,6 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ...@@ -1890,7 +1836,6 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
...@@ -1939,7 +1884,6 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, ...@@ -1939,7 +1884,6 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
...@@ -1989,7 +1933,6 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, ...@@ -1989,7 +1933,6 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
...@@ -2059,7 +2002,6 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -2059,7 +2002,6 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
...@@ -2129,7 +2071,6 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, ...@@ -2129,7 +2071,6 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
...@@ -2194,7 +2135,6 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { ...@@ -2194,7 +2135,6 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
...@@ -2222,7 +2162,6 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { ...@@ -2222,7 +2162,6 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
...@@ -2250,7 +2189,6 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { ...@@ -2250,7 +2189,6 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
...@@ -2278,7 +2216,6 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -2278,7 +2216,6 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
...@@ -2305,7 +2242,6 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -2305,7 +2242,6 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
...@@ -2332,7 +2268,6 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { ...@@ -2332,7 +2268,6 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
...@@ -2359,7 +2294,6 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { ...@@ -2359,7 +2294,6 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
...@@ -2386,7 +2320,6 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ...@@ -2386,7 +2320,6 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
...@@ -2605,7 +2538,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ...@@ -2605,7 +2538,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"vdup.u16 q10, %4 \n" // interval add "vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
...@@ -2648,7 +2580,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -2648,7 +2580,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"vshr.u16 q0, q0, #1 \n" // scale / 2. "vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
...@@ -2684,7 +2615,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2684,7 +2615,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
...@@ -2721,7 +2651,6 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2721,7 +2651,6 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d28, #24 \n" // BB coefficient
"vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d29, #98 \n" // BG coefficient
"vmov.u8 d30, #50 \n" // BR coefficient "vmov.u8 d30, #50 \n" // BR coefficient
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
...@@ -2760,7 +2689,6 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -2760,7 +2689,6 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
...@@ -2820,7 +2748,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2820,7 +2748,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
...@@ -2854,7 +2781,6 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2854,7 +2781,6 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
...@@ -2881,7 +2807,6 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2881,7 +2807,6 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
...@@ -2913,7 +2838,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2913,7 +2838,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // alpha "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
...@@ -2940,7 +2864,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2940,7 +2864,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) { uint8* dst_y, int width) {
asm volatile ( asm volatile (
// 16 pixel loop. // 16 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
...@@ -2970,7 +2893,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2970,7 +2893,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // alpha "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
...@@ -2997,7 +2919,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2997,7 +2919,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) { const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d0}, [%0],%5 \n" // top
...@@ -3041,7 +2962,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3041,7 +2962,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) { uint8* dst_sobely, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d0}, [%0],%4 \n" // left
......
...@@ -31,7 +31,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -31,7 +31,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n" "beqz $t9, 2f \n"
" nop \n" " nop \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
...@@ -90,7 +89,6 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -90,7 +89,6 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bltz $t9, 2f \n" "bltz $t9, 2f \n"
" nop \n" " nop \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
...@@ -188,7 +186,6 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -188,7 +186,6 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n" "beqz $t9, 2f \n"
" nop \n" " nop \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
...@@ -248,7 +245,6 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -248,7 +245,6 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"srl $t9, %[dst_width], 1 \n" "srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n" "andi $t8, %[dst_width], 1 \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4| "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
...@@ -319,7 +315,6 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -319,7 +315,6 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
...@@ -368,7 +363,6 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -368,7 +363,6 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set noreorder \n" ".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003 "repl.ph $t3, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
...@@ -425,7 +419,6 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -425,7 +419,6 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set noreorder \n" ".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003 "repl.ph $t2, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
...@@ -477,7 +470,6 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -477,7 +470,6 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
...@@ -528,7 +520,6 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -528,7 +520,6 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
...@@ -586,7 +577,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, ...@@ -586,7 +577,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
......
...@@ -26,7 +26,6 @@ extern "C" { ...@@ -26,7 +26,6 @@ extern "C" {
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
MEMACCESS(0) MEMACCESS(0)
...@@ -47,7 +46,6 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -47,7 +46,6 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
...@@ -73,7 +71,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -73,7 +71,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile ( asm volatile (
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
...@@ -101,7 +98,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -101,7 +98,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
...@@ -123,7 +119,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -123,7 +119,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q0}, [%0]! \n" // load up 16x4
...@@ -162,7 +157,6 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -162,7 +157,6 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
...@@ -185,7 +179,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -185,7 +179,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
...@@ -245,7 +238,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -245,7 +238,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
...@@ -300,7 +292,6 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -300,7 +292,6 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
MEMACCESS(3) MEMACCESS(3)
"vld1.8 {q3}, [%3] \n" "vld1.8 {q3}, [%3] \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
...@@ -334,7 +325,6 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -334,7 +325,6 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
MEMACCESS(7) MEMACCESS(7)
"vld1.8 {q15}, [%7] \n" "vld1.8 {q15}, [%7] \n"
"add %3, %0 \n" "add %3, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
// d0 = 00 40 01 41 02 42 03 43 // d0 = 00 40 01 41 02 42 03 43
...@@ -450,7 +440,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -450,7 +440,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
MEMACCESS(5) MEMACCESS(5)
"vld1.8 {q14}, [%5] \n" "vld1.8 {q14}, [%5] \n"
"add %3, %0 \n" "add %3, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
// d0 = 00 40 01 41 02 42 03 43 // d0 = 00 40 01 41 02 42 03 43
...@@ -545,7 +534,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -545,7 +534,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) { uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL; const uint8* src_tmp = NULL;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
"mov r12, %5 \n" "mov r12, %5 \n"
...@@ -590,7 +578,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, ...@@ -590,7 +578,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int* tmp = dx_offset; int* tmp = dx_offset;
const uint8* src_tmp = src_ptr; const uint8* src_tmp = src_ptr;
asm volatile ( asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x "vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx "vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3 "vld1.32 {q2}, [%5] \n" // 0 1 2 3
...@@ -749,7 +736,6 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -749,7 +736,6 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
MEMACCESS(0) MEMACCESS(0)
...@@ -773,7 +759,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -773,7 +759,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
...@@ -804,7 +789,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -804,7 +789,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile ( asm volatile (
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
...@@ -845,7 +829,6 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -845,7 +829,6 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) { int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile ( asm volatile (
"mov r12, %3, lsl #2 \n" "mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n" "vld1.32 {d0[0]}, [%0], r12 \n"
...@@ -875,7 +858,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -875,7 +858,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
asm volatile ( asm volatile (
"mov r12, %4, lsl #2 \n" "mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
...@@ -930,7 +912,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, ...@@ -930,7 +912,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int tmp = 0; int tmp = 0;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
LOAD1_DATA32_LANE(d0, 0) LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d0, 1)
...@@ -974,7 +955,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, ...@@ -974,7 +955,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int* tmp = dx_offset; int* tmp = dx_offset;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
asm volatile ( asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x "vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx "vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3 "vld1.32 {q2}, [%5] \n" // 0 1 2 3
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment