Commit c5aac16a authored by fbarchard@google.com's avatar fbarchard@google.com

Remove loop alignment for the benefit of modern CPUs that don't require alignment.

BUG=none
TESTED=local libyuv unittest passes
R=brucedawson@google.com, tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/32159004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1180 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent fd89cd79
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1178 Version: 1180
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -200,7 +200,6 @@ extern "C" { ...@@ -200,7 +200,6 @@ extern "C" {
#define HAS_MERGEUVROW_AVX2 #define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2 #define HAS_MIRRORROW_AVX2
#define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2
#define HAS_I422TOARGBROW_AVX2
// Effects: // Effects:
#define HAS_ARGBADDROW_AVX2 #define HAS_ARGBADDROW_AVX2
...@@ -216,6 +215,7 @@ extern "C" { ...@@ -216,6 +215,7 @@ extern "C" {
#define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2 #define HAS_ARGBTOYROW_AVX2
#define HAS_I422TOARGBROW_AVX2
#define HAS_I422TORGBAROW_AVX2 #define HAS_I422TORGBAROW_AVX2
#define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOABGRROW_AVX2
#define HAS_INTERPOLATEROW_AVX2 #define HAS_INTERPOLATEROW_AVX2
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1178 #define LIBYUV_VERSION 1180
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -27,7 +27,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { ...@@ -27,7 +27,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0 pxor xmm0, xmm0
pxor xmm5, xmm5 pxor xmm5, xmm5
align 4
wloop: wloop:
movdqu xmm1, [eax] movdqu xmm1, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -70,7 +69,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { ...@@ -70,7 +69,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax sub edx, eax
align 4
wloop: wloop:
vmovdqu ymm1, [eax] vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx] vmovdqu ymm2, [eax + edx]
...@@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { ...@@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pxor xmm7, xmm7 // constant 0 for unpck pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33 movdqa xmm6, kHash16x33
align 4
wloop: wloop:
movdqu xmm1, [eax] // src[0-15] movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { ...@@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33 movdqa xmm6, kHash16x33
align 4
wloop: wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3] vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16 pmulld xmm0, xmm6 // hash *= 33 ^ 16
......
This diff is collapsed.
...@@ -103,7 +103,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -103,7 +103,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 12] // dst_ptr mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -133,7 +132,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -133,7 +132,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 psrlw xmm5, 8
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -172,7 +170,6 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -172,7 +170,6 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 psrlw xmm5, 8
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -216,7 +213,6 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -216,7 +213,6 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrld xmm5, 24 psrld xmm5, 24
pslld xmm5, 16 pslld xmm5, 16
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -251,7 +247,6 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -251,7 +247,6 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8 psrlw xmm7, 8
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -314,7 +309,6 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -314,7 +309,6 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf1 movdqa xmm4, kShuf1
movdqa xmm5, kShuf2 movdqa xmm5, kShuf2
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -368,7 +362,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ...@@ -368,7 +362,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11 movdqa xmm6, kMadd11
movdqa xmm7, kRound34 movdqa xmm7, kRound34
align 4
wloop: wloop:
movdqu xmm0, [eax] // pixels 0..7 movdqu xmm0, [eax] // pixels 0..7
movdqu xmm1, [eax + esi] movdqu xmm1, [eax + esi]
...@@ -427,7 +420,6 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ...@@ -427,7 +420,6 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11 movdqa xmm6, kMadd11
movdqa xmm7, kRound34 movdqa xmm7, kRound34
align 4
wloop: wloop:
movdqu xmm0, [eax] // pixels 0..7 movdqu xmm0, [eax] // pixels 0..7
movdqu xmm1, [eax + esi] movdqu xmm1, [eax + esi]
...@@ -484,7 +476,6 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -484,7 +476,6 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf38a movdqa xmm4, kShuf38a
movdqa xmm5, kShuf38b movdqa xmm5, kShuf38b
align 4
xloop: xloop:
movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
...@@ -520,7 +511,6 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ...@@ -520,7 +511,6 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kScaleAc33 movdqa xmm4, kScaleAc33
pxor xmm5, xmm5 pxor xmm5, xmm5
align 4
xloop: xloop:
movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
movdqu xmm6, [eax + esi] movdqu xmm6, [eax + esi]
...@@ -586,7 +576,6 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ...@@ -586,7 +576,6 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kShufAb2 movdqa xmm4, kShufAb2
movdqa xmm5, kScaleAb2 movdqa xmm5, kScaleAb2
align 4
xloop: xloop:
movdqu xmm0, [eax] // average 2 rows into xmm0 movdqu xmm0, [eax] // average 2 rows into xmm0
movdqu xmm1, [eax + esi] movdqu xmm1, [eax + esi]
...@@ -635,7 +624,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -635,7 +624,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pxor xmm4, xmm4 pxor xmm4, xmm4
dec ebx dec ebx
align 4
xloop: xloop:
// first row // first row
movdqu xmm0, [esi] movdqu xmm0, [esi]
...@@ -649,7 +637,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -649,7 +637,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
je ydone je ydone
// sum remaining rows // sum remaining rows
align 4
yloop: yloop:
movdqu xmm2, [eax] // read 16 pixels movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row lea eax, [eax + edx] // advance to next row
...@@ -661,7 +648,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -661,7 +648,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1 sub ebp, 1
jg yloop jg yloop
align 4
ydone: ydone:
movdqu [edi], xmm0 movdqu [edi], xmm0
movdqu [edi + 16], xmm1 movdqu [edi + 16], xmm1
...@@ -716,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -716,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
pextrw edx, xmm2, 3 // get x1 integer. preroll pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop. // 2 Pixel loop.
align 4
xloop2: xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions. movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx paddd xmm2, xmm3 // x += dx
...@@ -739,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -739,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 2 // 2 pixels sub ecx, 2 // 2 pixels
jge xloop2 jge xloop2
align 4
xloop29: xloop29:
add ecx, 2 - 1 add ecx, 2 - 1
...@@ -757,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -757,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd ebx, xmm0 movd ebx, xmm0
mov [edi], bl mov [edi], bl
align 4
xloop99: xloop99:
pop edi pop edi
...@@ -777,7 +760,6 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -777,7 +760,6 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov eax, [esp + 8] // src_ptr mov eax, [esp + 8] // src_ptr
mov ecx, [esp + 12] // dst_width mov ecx, [esp + 12] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -806,7 +788,6 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ...@@ -806,7 +788,6 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -833,7 +814,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ...@@ -833,7 +814,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -864,7 +844,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ...@@ -864,7 +844,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
mov edx, [esp + 4 + 12] // dst_argb mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // dst_width mov ecx, [esp + 4 + 16] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -904,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -904,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea ebx, [ebx * 4] lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2] lea edi, [ebx + ebx * 2]
align 4
wloop: wloop:
movd xmm0, [eax] movd xmm0, [eax]
movd xmm1, [eax + ebx] movd xmm1, [eax + ebx]
...@@ -945,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ...@@ -945,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
lea ebx, [ebx * 4] lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2] lea edi, [ebx + ebx * 2]
align 4
wloop: wloop:
movq xmm0, qword ptr [eax] // row0 4 pairs movq xmm0, qword ptr [eax] // row0 4 pairs
movhps xmm0, qword ptr [eax + ebx] movhps xmm0, qword ptr [eax + ebx]
...@@ -1006,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1006,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
jl xloop49 jl xloop49
// 4 Pixel loop. // 4 Pixel loop.
align 4
xloop4: xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels
...@@ -1026,7 +1002,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1026,7 +1002,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
sub ecx, 4 // 4 pixels sub ecx, 4 // 4 pixels
jge xloop4 jge xloop4
align 4
xloop49: xloop49:
test ecx, 2 test ecx, 2
je xloop29 je xloop29
...@@ -1047,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1047,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// 1 Pixels. // 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0 movd dword ptr [edi], xmm0
align 4
xloop99: xloop99:
pop esi pop esi
...@@ -1097,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, ...@@ -1097,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. preroll pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop. // 2 Pixel loop.
align 4
xloop2: xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions. movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx paddd xmm2, xmm3 // x += dx
...@@ -1117,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, ...@@ -1117,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
sub ecx, 2 // 2 pixels sub ecx, 2 // 2 pixels
jge xloop2 jge xloop2
align 4
xloop29: xloop29:
add ecx, 2 - 1 add ecx, 2 - 1
...@@ -1134,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, ...@@ -1134,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0 movd [edi], xmm0
align 4
xloop99: xloop99:
pop edi pop edi
...@@ -1153,7 +1124,6 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1153,7 +1124,6 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
mov eax, [esp + 8] // src_argb mov eax, [esp + 8] // src_argb
mov ecx, [esp + 12] // dst_width mov ecx, [esp + 12] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment