Commit c2295807 authored by fbarchard@google.com

Reduce alignment for loops from 16 bytes to 4 bytes.

Reduces outer loop overhead without hurting inner loop time.
BUG=none
TESTED=try bots
R=fbarchard@chromium.org, mflodman@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/4659004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@880 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent dbe48143
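
Background on the directives changed below: GNU as's ".p2align N" pads to a 2^N-byte boundary, so ".p2align 4" means 16-byte alignment and ".p2align 2" means 4-byte alignment, while the MASM and yasm blocks use "align N", which takes the byte count directly (hence "align 16" becomes "align 4"). The padding bytes are emitted once, before the loop label, so shrinking them trims per-call entry overhead without touching the loop body. A minimal sketch of the pattern, assuming GCC-style inline asm on x86-64; SumBytes_SSE2 is a hypothetical helper for illustration, not part of this commit:

#include <stdint.h>

// Hypothetical helper, not from libyuv: sums all bytes of a buffer,
// 16 bytes per iteration, using the 4-byte loop alignment adopted here.
// Assumes x86-64 with SSE2 and that count is a positive multiple of 16.
static uint32_t SumBytes_SSE2(const uint8_t* src, int count) {
  uint32_t sum;
  asm volatile (
    "pxor      %%xmm0,%%xmm0          \n"  // running sums = 0
    "pxor      %%xmm5,%%xmm5          \n"  // zero operand for psadbw
    ".p2align  2                      \n"  // 2^2 = 4-byte boundary (was 2^4 = 16)
    "1:                               \n"
    "movdqu    (%0),%%xmm1            \n"  // load 16 source bytes
    "lea       0x10(%0),%0            \n"
    "psadbw    %%xmm5,%%xmm1          \n"  // two 64-bit partial byte sums
    "paddd     %%xmm1,%%xmm0          \n"
    "sub       $0x10,%1               \n"
    "jg        1b                     \n"
    "pshufd    $0xee,%%xmm0,%%xmm1    \n"  // fold high 64 bits into low
    "paddd     %%xmm1,%%xmm0          \n"
    "movd      %%xmm0,%2              \n"
    : "+r"(src), "+r"(count), "=r"(sum)
    :
    : "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
  return sum;
}

With ".p2align 2" the assembler emits at most 3 padding bytes ahead of label 1 instead of up to 15, so less straight-line code runs before the loop is entered; the loop body itself is unchanged, which matches the commit message's claim of lower outer-loop overhead with unchanged inner-loop time.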
@@ -31,7 +31,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
asm volatile ( // NOLINT
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
@@ -107,7 +107,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
......
@@ -27,7 +27,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0
pxor xmm5, xmm5
align 16
align 4
wloop:
movdqa xmm1, [eax]
lea eax, [eax + 16]
@@ -70,7 +70,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
align 16
align 4
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
@@ -145,7 +145,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
align 16
align 4
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
@@ -195,7 +195,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
align 16
align 4
wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
......
@@ -91,7 +91,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
// Read in the data from the source pointer.
// First round of bit swap.
align 16
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
@@ -190,7 +190,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 16
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
@@ -304,7 +304,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
@@ -523,7 +523,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
@@ -664,7 +664,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n"
......
@@ -31,7 +31,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"mov r9, %0 \n"
@@ -198,7 +198,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"mov r9, %0 \n"
......
@@ -28,7 +28,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
psrlw m2, m2, 8
%endif
ALIGN 16
align 4
.convertloop:
mov%2 m0, [src_yuy2q]
mov%2 m1, [src_yuy2q + mmsize]
@@ -74,7 +74,7 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
psrlw m4, m4, 8
sub dst_vq, dst_uq
ALIGN 16
align 4
.convertloop:
mov%1 m0, [src_uvq]
mov%1 m1, [src_uvq + mmsize]
@@ -113,7 +113,7 @@ SplitUVRow a,
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
sub src_vq, src_uq
ALIGN 16
align 4
.convertloop:
mov%1 m0, [src_uq]
mov%1 m1, [src_vq]
......
@@ -30,6 +30,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -88,6 +89,7 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bltz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -185,6 +187,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -244,6 +247,7 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
@@ -314,6 +318,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -360,7 +365,9 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
"repl.ph $t3, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -416,6 +423,8 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -466,6 +475,8 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -515,6 +526,8 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
@@ -571,6 +584,8 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
......
@@ -103,7 +103,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -133,7 +133,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -172,7 +172,7 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -214,7 +214,7 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
align 16
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -244,7 +244,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -284,7 +284,7 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -328,7 +328,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrld xmm5, 24
pslld xmm5, 16
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -363,7 +363,7 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -426,7 +426,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf1
movdqa xmm5, kShuf2
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -480,7 +480,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
align 16
align 4
wloop:
movdqa xmm0, [eax] // pixels 0..7
movdqa xmm1, [eax + esi]
@@ -539,7 +539,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
align 16
align 4
wloop:
movdqa xmm0, [eax] // pixels 0..7
movdqa xmm1, [eax + esi]
@@ -596,7 +596,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf38a
movdqa xmm5, kShuf38b
align 16
align 4
xloop:
movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
@@ -632,7 +632,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kScaleAc33
pxor xmm5, xmm5
align 16
align 4
xloop:
movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
movdqa xmm6, [eax + esi]
@@ -698,7 +698,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kShufAb2
movdqa xmm5, kScaleAb2
align 16
align 4
xloop:
movdqa xmm0, [eax] // average 2 rows into xmm0
pavgb xmm0, [eax + esi]
@@ -746,7 +746,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pxor xmm4, xmm4
dec ebx
align 16
align 4
xloop:
// first row
movdqa xmm0, [esi]
@@ -760,7 +760,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
je ydone
// sum remaining rows
align 16
align 4
yloop:
movdqa xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
@@ -772,7 +772,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1
jg yloop
align 16
align 4
ydone:
movdqa [edi], xmm0
movdqa [edi + 16], xmm1
@@ -861,7 +861,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd ebx, xmm0
mov [edi], bl
align 16
align 4
xloop99:
pop edi
@@ -881,7 +881,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov eax, [esp + 8] // src_ptr
mov ecx, [esp + 12] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
@@ -910,7 +910,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -937,7 +937,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -968,7 +968,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1008,7 +1008,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
align 16
align 4
wloop:
movd xmm0, [eax]
movd xmm1, [eax + ebx]
@@ -1049,7 +1049,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
align 16
align 4
wloop:
movq xmm0, qword ptr [eax] // row0 4 pairs
movhps xmm0, qword ptr [eax + ebx]
@@ -1238,7 +1238,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0
align 16
align 4
xloop99:
pop edi
@@ -1257,7 +1257,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
mov eax, [esp + 8] // src_argb
mov ecx, [esp + 12] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
......