Commit 455ae94c authored by fbarchard@google.com

Make rotate SIMD allow unaligned pointers.

BUG=365
TESTED=libyuv_unittest
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/22899004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1102 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 044f914c
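
The patch swaps the aligned SSE2 loads and stores (movdqa) in the rotate and mirror kernels for their unaligned forms (movdqu), and drops the matching pointer/stride alignment requirements from the dispatch code. As a minimal sketch of what that distinction means at the intrinsics level (illustration only, not code from this patch; the helper name and buffers are hypothetical):

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// _mm_load_si128 (movdqa) faults if the address is not 16-byte aligned;
// _mm_loadu_si128 (movdqu) accepts any address, which is what lets these
// kernels run on arbitrarily aligned rows.
static void Copy16Unaligned(const uint8_t* src, uint8_t* dst) {
  __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), v);
}
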
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1101
+Version: 1102
 License: BSD
 License File: LICENSE
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1101
+#define LIBYUV_VERSION 1102
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -210,31 +210,31 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
  convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
-   movdqa xmm0, [eax]
-   movdqa xmm1, [eax + edi]
+   movdqu xmm0, [eax]
+   movdqu xmm1, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm0 // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa xmm1, xmm7
-   movdqa xmm2, [eax]
-   movdqa xmm3, [eax + edi]
+   movdqu xmm2, [eax]
+   movdqu xmm3, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa xmm3, xmm7
-   movdqa xmm4, [eax]
-   movdqa xmm5, [eax + edi]
+   movdqu xmm4, [eax]
+   movdqu xmm5, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa xmm5, xmm7
-   movdqa xmm6, [eax]
-   movdqa xmm7, [eax + edi]
+   movdqu xmm6, [eax]
+   movdqu xmm7, [eax + edi]
    lea eax, [eax + 2 * edi]
-   movdqa [esp], xmm5 // backup xmm5
+   movdqu [esp], xmm5 // backup xmm5
    neg edi
    movdqa xmm5, xmm6 // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
@@ -255,8 +255,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa xmm6, xmm5
-   movdqa xmm5, [esp] // restore xmm5
-   movdqa [esp], xmm6 // backup xmm6
+   movdqu xmm5, [esp] // restore xmm5
+   movdqu [esp], xmm6 // backup xmm6
    movdqa xmm6, xmm5 // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
@@ -267,7 +267,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa xmm4, xmm6
-   movdqa xmm6, [esp] // restore xmm6
+   movdqu xmm6, [esp] // restore xmm6
    movlpd qword ptr [edx], xmm0
    movhpd qword ptr [ebx], xmm0
    movlpd qword ptr [edx + esi], xmm4
@@ -427,31 +427,31 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    "mov 0x2c(%ecx),%ecx \n"
    "1: \n"
-   "movdqa (%eax),%xmm0 \n"
-   "movdqa (%eax,%edi,1),%xmm1 \n"
+   "movdqu (%eax),%xmm0 \n"
+   "movdqu (%eax,%edi,1),%xmm1 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm0,%xmm7 \n"
    "punpcklbw %xmm1,%xmm0 \n"
    "punpckhbw %xmm1,%xmm7 \n"
    "movdqa %xmm7,%xmm1 \n"
-   "movdqa (%eax),%xmm2 \n"
-   "movdqa (%eax,%edi,1),%xmm3 \n"
+   "movdqu (%eax),%xmm2 \n"
+   "movdqu (%eax,%edi,1),%xmm3 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm2,%xmm7 \n"
    "punpcklbw %xmm3,%xmm2 \n"
    "punpckhbw %xmm3,%xmm7 \n"
    "movdqa %xmm7,%xmm3 \n"
-   "movdqa (%eax),%xmm4 \n"
-   "movdqa (%eax,%edi,1),%xmm5 \n"
+   "movdqu (%eax),%xmm4 \n"
+   "movdqu (%eax,%edi,1),%xmm5 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm4,%xmm7 \n"
    "punpcklbw %xmm5,%xmm4 \n"
    "punpckhbw %xmm5,%xmm7 \n"
    "movdqa %xmm7,%xmm5 \n"
-   "movdqa (%eax),%xmm6 \n"
-   "movdqa (%eax,%edi,1),%xmm7 \n"
+   "movdqu (%eax),%xmm6 \n"
+   "movdqu (%eax,%edi,1),%xmm7 \n"
    "lea (%eax,%edi,2),%eax \n"
-   "movdqa %xmm5,(%esp) \n"
+   "movdqu %xmm5,(%esp) \n"
    "neg %edi \n"
    "movdqa %xmm6,%xmm5 \n"
    "punpcklbw %xmm7,%xmm6 \n"
@@ -471,8 +471,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    "punpcklwd %xmm6,%xmm4 \n"
    "punpckhwd %xmm6,%xmm5 \n"
    "movdqa %xmm5,%xmm6 \n"
-   "movdqa (%esp),%xmm5 \n"
-   "movdqa %xmm6,(%esp) \n"
+   "movdqu (%esp),%xmm5 \n"
+   "movdqu %xmm6,(%esp) \n"
    "movdqa %xmm5,%xmm6 \n"
    "punpcklwd %xmm7,%xmm5 \n"
    "punpckhwd %xmm7,%xmm6 \n"
@@ -481,7 +481,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    "punpckldq %xmm4,%xmm0 \n"
    "punpckhdq %xmm4,%xmm6 \n"
    "movdqa %xmm6,%xmm4 \n"
-   "movdqa (%esp),%xmm6 \n"
+   "movdqu (%esp),%xmm6 \n"
    "movlpd %xmm0,(%edx) \n"
    "movhpd %xmm0,(%ebx) \n"
    "movlpd %xmm4,(%edx,%esi,1) \n"
@@ -541,38 +541,38 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
   // First round of bit swap.
    ".p2align 2 \n"
    "1: \n"
-   "movdqa (%0),%%xmm0 \n"
-   "movdqa (%0,%3),%%xmm1 \n"
+   "movdqu (%0),%%xmm0 \n"
+   "movdqu (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
-   "movdqa (%0),%%xmm2 \n"
+   "movdqu (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm9 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "palignr $0x8,%%xmm9,%%xmm9 \n"
-   "movdqa (%0,%3),%%xmm3 \n"
+   "movdqu (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm2,%%xmm10 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm10 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movdqa %%xmm10,%%xmm11 \n"
-   "movdqa (%0),%%xmm4 \n"
+   "movdqu (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
-   "movdqa (%0,%3),%%xmm5 \n"
+   "movdqu (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm4,%%xmm12 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm12 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movdqa %%xmm12,%%xmm13 \n"
-   "movdqa (%0),%%xmm6 \n"
+   "movdqu (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
-   "movdqa (%0,%3),%%xmm7 \n"
+   "movdqu (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm6,%%xmm14 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
@@ -682,29 +682,29 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   // First round of bit swap.
    ".p2align 2 \n"
    "1: \n"
-   "movdqa (%0),%%xmm0 \n"
-   "movdqa (%0,%4),%%xmm1 \n"
+   "movdqu (%0),%%xmm0 \n"
+   "movdqu (%0,%4),%%xmm1 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm1 \n"
-   "movdqa (%0),%%xmm2 \n"
-   "movdqa (%0,%4),%%xmm3 \n"
+   "movdqu (%0),%%xmm2 \n"
+   "movdqu (%0,%4),%%xmm3 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm2,%%xmm8 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm3 \n"
-   "movdqa (%0),%%xmm4 \n"
-   "movdqa (%0,%4),%%xmm5 \n"
+   "movdqu (%0),%%xmm4 \n"
+   "movdqu (%0,%4),%%xmm5 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm5 \n"
-   "movdqa (%0),%%xmm6 \n"
-   "movdqa (%0,%4),%%xmm7 \n"
+   "movdqu (%0),%%xmm6 \n"
+   "movdqu (%0,%4),%%xmm7 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm6,%%xmm8 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
@@ -834,9 +834,7 @@ void TransposePlane(const uint8* src, int src_stride,
   }
 #endif
 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
     TransposeWx8 = TransposeWx8_FAST_SSSE3;
   }
 #endif
@@ -904,16 +902,12 @@ void RotatePlane180(const uint8* src, int src_stride,
   }
 #endif
 #if defined(HAS_MIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
     MirrorRow = MirrorRow_SSE2;
   }
 #endif
 #if defined(HAS_MIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
     MirrorRow = MirrorRow_SSSE3;
   }
 #endif
@@ -922,6 +916,7 @@ void RotatePlane180(const uint8* src, int src_stride,
     MirrorRow = MirrorRow_AVX2;
   }
 #endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
@@ -940,9 +935,7 @@ void RotatePlane180(const uint8* src, int src_stride,
   }
 #endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_SSE2;
   }
 #endif
@@ -1032,9 +1025,7 @@ void TransposeUV(const uint8* src, int src_stride,
     TransposeUVWx8 = TransposeUVWx8_NEON;
   }
 #elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
     TransposeUVWx8 = TransposeUVWx8_SSE2;
   }
 #elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
@@ -1106,8 +1097,7 @@ void RotateUV180(const uint8* src, int src_stride,
     MirrorRowUV = MirrorUVRow_NEON;
   }
 #elif defined(HAS_MIRRORROW_UV_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
     MirrorRowUV = MirrorUVRow_SSSE3;
   }
 #elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
...
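
The rotate.cc hunks above all make the same dispatch change: the SIMD row function is now selected on the CPU feature bit and a width multiple alone, with the pointer and stride alignment terms removed. Below is a self-contained sketch of that selection pattern; the declarations stand in for the real libyuv symbols referenced in the diff, and the kCpuHasSSE2 value is a placeholder.

#include <stdint.h>

// libyuv-style alignment predicate (illustrative copy, not the header's).
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))

typedef void (*MirrorRowFn)(const uint8_t* src, uint8_t* dst, int width);

// Stand-ins for the real libyuv symbols used in the hunks above.
int TestCpuFlag(int flag);
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
enum { kCpuHasSSE2 = 0x10 };  // placeholder value, not the library's

static MirrorRowFn PickMirrorRow(int width) {
  MirrorRowFn fn = MirrorRow_C;  // portable fallback is always valid
  // After this patch, src/dst pointers and strides no longer need to be
  // 16-byte aligned because the kernel uses movdqu; only the width
  // multiple and the CPU feature still gate the fast path.
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
    fn = MirrorRow_SSE2;
  }
  return fn;
}
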
@@ -2970,10 +2970,10 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    LABELALIGN
    "1: \n"
-   MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
+   MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
-   "movdqa %%xmm0," MEMACCESS(1) " \n"
+   "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
   : "+r"(src), // %0
@@ -3039,7 +3039,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
-   "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+   "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    "sub $8,%3 \n"
@@ -3077,11 +3077,11 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
    "movdqa %3,%%xmm5 \n"
    LABELALIGN
    "1: \n"
-   "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+   "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "sub $0x4,%2 \n"
-   "movdqa %%xmm0," MEMACCESS(1) " \n"
+   "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
   : "+r"(src), // %0
...
@@ -3288,10 +3288,10 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
    align 4
  convertloop:
-   movdqa xmm0, [eax + ecx]
+   movdqu xmm0, [eax + ecx]
    pshufb xmm0, xmm5
    sub ecx, 16
-   movdqa [edx], xmm0
+   movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
@@ -3381,7 +3381,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
    align 4
  convertloop:
-   movdqa xmm0, [eax]
+   movdqu xmm0, [eax]
    lea eax, [eax - 16]
    pshufb xmm0, xmm1
    sub ecx, 8
@@ -3413,11 +3413,11 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
    align 4
  convertloop:
-   movdqa xmm0, [eax]
+   movdqu xmm0, [eax]
    lea eax, [eax - 16]
    pshufb xmm0, xmm5
    sub ecx, 4
-   movdqa [edx], xmm0
+   movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
...
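
The MirrorRow_SSSE3 and ARGBMirrorRow_SSSE3 hunks above only swap the loads and stores to their unaligned forms; the mirroring itself is a pshufb with a reversing shuffle mask (byte-reverse for MirrorRow, pixel-reverse for ARGBMirrorRow). A standalone intrinsics sketch of the byte-reverse case (mask constant and helper name are illustrative, not copied from the library):

#include <tmmintrin.h>  // SSSE3 (pshufb)
#include <stdint.h>

// Reverse 16 bytes in one shuffle: result byte i takes source byte mask[i],
// and the mask counts down from 15 to 0.
static void Mirror16Bytes(const uint8_t* src, uint8_t* dst) {
  const __m128i kReverse = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                         7, 6, 5, 4, 3, 2, 1, 0);
  __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
  v = _mm_shuffle_epi8(v, kReverse);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), v);
}
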