Commit b720049a authored by fbarchard@google.com's avatar fbarchard@google.com

Make row functions used for planarfunctions and convert use movdqu to relax…

Make row functions used for planarfunctions and convert use movdqu to relax alignment constraint.  Step 1 - make functions unaligned.
BUG=365
TESTED=libyuv_unittest passes
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/26709004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1111 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 147bbede
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1109
Version: 1111
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1109
#define LIBYUV_VERSION 1111
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -47,9 +47,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
}
#endif
......@@ -101,9 +99,7 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2;
}
#endif
......@@ -254,9 +250,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
#endif
......@@ -307,16 +301,10 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
}
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
......@@ -385,16 +373,10 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUV422Row = UYVYToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
}
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
......@@ -504,9 +486,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
}
#if defined(HAS_ARGBMIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSSE3;
}
#endif
......@@ -824,12 +804,9 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
......@@ -894,12 +871,9 @@ int I422ToABGR(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
I422ToABGRRow = I422ToABGRRow_SSSE3;
}
}
}
#endif
for (y = 0; y < height; ++y) {
......@@ -956,12 +930,9 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
I422ToRGBARow = I422ToRGBARow_SSSE3;
}
}
}
#endif
for (y = 0; y < height; ++y) {
......@@ -1084,9 +1055,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0;
}
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
SetRow = SetRow_NEON;
}
#endif
......@@ -1150,8 +1119,7 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
return 0;
}
......@@ -1202,9 +1170,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBATTENUATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
......@@ -1317,9 +1283,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#elif defined(HAS_ARGBGRAYROW_NEON)
......@@ -1355,8 +1319,7 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#elif defined(HAS_ARGBGRAYROW_NEON)
......@@ -1388,8 +1351,7 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBSEPIAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_SSSE3;
}
#elif defined(HAS_ARGBSEPIAROW_NEON)
......@@ -1430,8 +1392,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
}
#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
......@@ -1573,8 +1534,7 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBQUANTIZEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
}
#elif defined(HAS_ARGBQUANTIZEROW_NEON)
......@@ -1748,9 +1708,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHADEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_SSE2;
}
#elif defined(HAS_ARGBSHADEROW_NEON)
......@@ -1882,13 +1840,9 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBShuffleRow = ARGBShuffleRow_SSSE3;
}
}
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
......@@ -1942,8 +1896,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
// ARGBToBayer used to select G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
......@@ -1951,8 +1904,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
......@@ -2043,8 +1995,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
#endif
......@@ -2065,8 +2016,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
#endif
......@@ -2088,8 +2038,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
#endif
......@@ -2213,10 +2162,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
......@@ -2259,10 +2205,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
}
#endif
......
......@@ -221,7 +221,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
......@@ -252,8 +252,8 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
......@@ -318,17 +318,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
"movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgb24), // %0
......@@ -359,17 +359,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
"movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
......@@ -418,8 +418,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
BUNDLEALIGN
MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
......@@ -475,8 +475,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
BUNDLEALIGN
MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
......@@ -519,8 +519,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
BUNDLEALIGN
MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
......@@ -631,7 +631,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
......@@ -672,7 +672,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n"
......@@ -712,7 +712,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm3,%%xmm0 \n"
"pand %%xmm4,%%xmm1 \n"
......@@ -744,10 +744,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
......@@ -760,7 +760,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
......@@ -820,10 +820,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
......@@ -837,7 +837,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
......@@ -912,10 +912,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
......@@ -979,10 +979,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
......@@ -1187,10 +1187,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
......@@ -1202,11 +1202,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm2 \n"
......@@ -1219,7 +1219,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"paddb %%xmm5,%%xmm0 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
BUNDLEALIGN
MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
......@@ -1317,10 +1317,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
......@@ -1430,10 +1430,10 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
......@@ -1446,7 +1446,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0
......@@ -1513,10 +1513,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
......@@ -1640,10 +1640,10 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
......@@ -1656,7 +1656,7 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0
......@@ -1714,10 +1714,10 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
......@@ -1730,7 +1730,7 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0
......@@ -1797,10 +1797,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
......@@ -1933,10 +1933,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
......@@ -2199,8 +2199,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
......@@ -2354,8 +2354,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
......@@ -2393,8 +2393,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
......@@ -2430,8 +2430,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
......@@ -2464,8 +2464,8 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
......@@ -2686,8 +2686,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
"movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
"movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
"lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
......@@ -2725,8 +2725,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
"movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
"lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
......@@ -2765,8 +2765,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
"movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
"movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
"lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
......@@ -2939,8 +2939,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
......@@ -2970,7 +2970,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"lea " MEMLEA(-0x10,0) ",%0 \n"
LABELALIGN
"1: \n"
MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
......@@ -3104,8 +3104,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
......@@ -3115,8 +3115,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
"movdqu %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
......@@ -3183,7 +3183,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
......@@ -3405,16 +3405,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"punpcklbw %%xmm2,%%xmm2 \n"
"punpckhwd %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
"movdqa " MEMACCESS(1) ",%%xmm4 \n"
"movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"movdqu " MEMACCESS(1) ",%%xmm4 \n"
"movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
"movdqa %%xmm2," MEMACCESS(1) " \n"
"movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm2," MEMACCESS(1) " \n"
"movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
......@@ -3498,13 +3498,13 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
......@@ -3527,11 +3527,11 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
BUNDLEALIGN
MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
......@@ -3572,8 +3572,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
......@@ -3722,14 +3722,14 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
......@@ -3751,11 +3751,11 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
BUNDLEALIGN
MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
......@@ -3796,8 +3796,8 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
......@@ -4014,7 +4014,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 41b \n"
......@@ -4132,16 +4132,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 4 pixel loop.
LABELALIGN
"40: \n"
"movdqa " MEMACCESS(0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqa " MEMACCESS(1) ",%%xmm2 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqa " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
......@@ -4151,7 +4151,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 40b \n"
"jmp 49f \n"
......@@ -4178,7 +4178,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 41b \n"
......@@ -4237,17 +4237,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
// 4 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"pshufhw $0xff,%%xmm1,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"movdqa " MEMACCESS(0) ",%%xmm2 \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm4,%%xmm2 \n"
......@@ -4256,7 +4256,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
......@@ -4389,16 +4389,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n"
"paddw %%xmm5,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x18,%%xmm2 \n"
"psrld $0x18,%%xmm3 \n"
......@@ -4411,8 +4411,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"punpcklwd %%xmm3,%%xmm0 \n"
"punpckhwd %%xmm3,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
......@@ -4455,30 +4455,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm6 \n"
"phaddw %%xmm6,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm5 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm5 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm5 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm5 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm5 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm5 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"psrld $0x18,%%xmm6 \n"
"psrld $0x18,%%xmm1 \n"
"packuswb %%xmm1,%%xmm6 \n"
......@@ -4488,8 +4488,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"punpcklwd %%xmm5,%%xmm0 \n"
"punpckhwd %%xmm5,%%xmm1 \n"
"sub $0x8,%1 \n"
"movdqa %%xmm0," MEMACCESS(0) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
......@@ -4520,12 +4520,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm7 \n"
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddsw %%xmm7,%%xmm0 \n"
......@@ -4535,13 +4535,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm6,%%xmm6 \n"
"punpcklbw %%xmm6,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm1 \n"
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm6 \n"
......@@ -4554,8 +4554,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"punpcklwd %%xmm1,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm6 \n"
"sub $0x8,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
......@@ -4593,14 +4593,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
// 4 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"pmullw %%xmm3,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm7 \n"
"movdqu " MEMACCESS(0) ",%%xmm7 \n"
"pmullw %%xmm3,%%xmm1 \n"
"pand %%xmm6,%%xmm7 \n"
"paddw %%xmm4,%%xmm0 \n"
......@@ -4608,7 +4608,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n"
"sub $0x4,%1 \n"
"movdqa %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
......@@ -4637,7 +4637,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// 4 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
......@@ -4648,7 +4648,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
......@@ -4901,8 +4901,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
......@@ -4919,10 +4919,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"por %%xmm5,%%xmm3 \n"
"por %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqa %%xmm1," MEMACCESS(2) " \n"
"movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
"movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
"movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
"movdqu %%xmm1," MEMACCESS(2) " \n"
"movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
"movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
"movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
......@@ -4953,12 +4953,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
......@@ -4992,8 +4992,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"paddusb %%xmm1,%%xmm2 \n"
......@@ -5010,10 +5010,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"punpcklwd %%xmm0,%%xmm7 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"sub $0x10,%3 \n"
"movdqa %%xmm6," MEMACCESS(2) " \n"
"movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
"movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
"movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
"movdqu %%xmm6," MEMACCESS(2) " \n"
"movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
"movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
"movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
......@@ -5060,22 +5060,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"punpcklwd %%xmm1,%%xmm4 \n"
"punpckhwd %%xmm1,%%xmm5 \n"
"paddd %%xmm2,%%xmm0 \n"
"movdqa " MEMACCESS(2) ",%%xmm2 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
"movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
"paddd %%xmm0,%%xmm3 \n"
"paddd %%xmm4,%%xmm0 \n"
"movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
"movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
"paddd %%xmm0,%%xmm4 \n"
"paddd %%xmm5,%%xmm0 \n"
"movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
"movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"paddd %%xmm0,%%xmm5 \n"
"movdqa %%xmm2," MEMACCESS(1) " \n"
"movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
"movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
"movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
"movdqu %%xmm2," MEMACCESS(1) " \n"
"movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
"movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
......@@ -5140,10 +5140,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 4 pixel small loop \n"
LABELALIGN
"4: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
BUNDLEALIGN
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
......@@ -5174,10 +5174,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 4 pixel loop \n"
LABELALIGN
"40: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
BUNDLEALIGN
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
......@@ -5221,7 +5221,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 1 pixel loop \n"
LABELALIGN
"10: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
"lea " MEMLEA(0x10,0) ",%0 \n"
"psubd " MEMACCESS(1) ",%%xmm0 \n"
......@@ -5822,8 +5822,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
"pshufd $0x0,%%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
......@@ -5852,8 +5852,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"psrld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x8,%%xmm0 \n"
"psrld $0x8,%%xmm1 \n"
......@@ -5882,17 +5882,17 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
"movdqa " MEMACCESS(3) ",%%xmm5 \n"
"movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
......@@ -5909,7 +5909,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
"movdqa " MEMACCESS(3) ",%%xmm5 \n"
"movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
......
......@@ -326,8 +326,8 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
punpckhwd xmm1, xmm1
por xmm0, xmm5
por xmm1, xmm5
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -386,17 +386,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
movdqa [edx + 32], xmm2
movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
movdqa [edx], xmm0
movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
movdqa [edx + 16], xmm1
movdqu [edx + 16], xmm1
por xmm3, xmm5
sub ecx, 16
movdqa [edx + 48], xmm3
movdqu [edx + 48], xmm3
lea edx, [edx + 64]
jg convertloop
ret
......@@ -426,17 +426,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
movdqa [edx + 32], xmm2
movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
movdqa [edx], xmm0
movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
movdqa [edx + 16], xmm1
movdqu [edx + 16], xmm1
por xmm3, xmm5
sub ecx, 16
movdqa [edx + 48], xmm3
movdqu [edx + 48], xmm3
lea edx, [edx + 64]
jg convertloop
ret
......@@ -491,8 +491,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
......@@ -545,8 +545,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
......@@ -585,8 +585,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
......@@ -688,7 +688,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
align 4
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
pslld xmm0, 8 // R
......@@ -728,7 +728,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
align 4
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
movdqa xmm3, xmm0 // R
......@@ -766,7 +766,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
align 4
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0
pand xmm0, xmm3 // low nibble
pand xmm1, xmm4 // high nibble
......@@ -795,10 +795,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
......@@ -811,7 +811,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
......@@ -830,10 +830,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
......@@ -847,7 +847,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
......@@ -1016,10 +1016,10 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
......@@ -1032,7 +1032,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
......@@ -1084,10 +1084,10 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
......@@ -1100,7 +1100,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
......@@ -1152,10 +1152,10 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
......@@ -1168,7 +1168,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
......@@ -1228,10 +1228,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
......@@ -1294,10 +1294,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
......@@ -1567,10 +1567,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
align 4
convertloop:
/* convert to U and V */
movdqa xmm0, [eax] // U
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax] // U
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
......@@ -1582,12 +1582,12 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
movdqa xmm0, [eax] // V
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax] // V
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm6
pmaddubsw xmm1, xmm6
pmaddubsw xmm2, xmm6
......@@ -1599,7 +1599,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm2
paddb xmm0, xmm5
lea eax, [eax + 64]
movdqa [edx + edi], xmm0
movdqu [edx + edi], xmm0
lea edx, [edx + 16]
jg convertloop
......@@ -1683,10 +1683,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
......@@ -1803,10 +1803,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
......@@ -1939,10 +1939,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
......@@ -2075,10 +2075,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
......@@ -2423,8 +2423,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -2633,8 +2633,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -2678,8 +2678,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -2718,8 +2718,8 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -2756,8 +2756,8 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -3004,8 +3004,8 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
movdqa [edx], xmm5
movdqa [edx + 16], xmm0
movdqu [edx], xmm5
movdqu [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -3086,8 +3086,8 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm2
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
movdqa [edx], xmm2
movdqa [edx + 16], xmm1
movdqu [edx], xmm2
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -3168,8 +3168,8 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // RGBA first 4 pixels
punpckhwd xmm0, xmm1 // RGBA next 4 pixels
movdqa [edx], xmm5
movdqa [edx + 16], xmm0
movdqu [edx], xmm5
movdqu [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -3260,8 +3260,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
por xmm0, xmm4
por xmm1, xmm4
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -3468,8 +3468,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
movdqa xmm3, xmm1
......@@ -3479,8 +3479,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
psrlw xmm2, 8 // odd bytes
psrlw xmm3, 8
packuswb xmm2, xmm3
movdqa [edx], xmm0
movdqa [edx + edi], xmm2
movdqu [edx], xmm0
movdqu [edx + edi], xmm2
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
......@@ -3581,14 +3581,14 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
align 4
convertloop:
movdqa xmm0, [eax] // read 16 U's
movdqa xmm1, [eax + edx] // and 16 V's
movdqu xmm0, [eax] // read 16 U's
movdqu xmm1, [eax + edx] // and 16 V's
lea eax, [eax + 16]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1 // first 8 UV pairs
punpckhbw xmm2, xmm1 // next 8 UV pairs
movdqa [edi], xmm0
movdqa [edi + 16], xmm2
movdqu [edi], xmm0
movdqu [edi + 16], xmm2
lea edi, [edi + 32]
sub ecx, 16
jg convertloop
......@@ -3763,19 +3763,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
align 4
convertloop:
movdqa xmm2, [eax]
movdqa xmm3, [eax + 16]
movdqu xmm2, [eax]
movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
movdqa xmm4, [edx]
movdqa xmm5, [edx + 16]
movdqu xmm4, [edx]
movdqu xmm5, [edx + 16]
pand xmm2, xmm0
pand xmm3, xmm0
pand xmm4, xmm1
pand xmm5, xmm1
por xmm2, xmm4
por xmm3, xmm5
movdqa [edx], xmm2
movdqa [edx + 16], xmm3
movdqu [edx], xmm2
movdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -3835,16 +3835,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
punpcklbw xmm2, xmm2
punpckhwd xmm3, xmm2
punpcklwd xmm2, xmm2
movdqa xmm4, [edx]
movdqa xmm5, [edx + 16]
movdqu xmm4, [edx]
movdqu xmm5, [edx + 16]
pand xmm2, xmm0
pand xmm3, xmm0
pand xmm4, xmm1
pand xmm5, xmm1
por xmm2, xmm4
por xmm3, xmm5
movdqa [edx], xmm2
movdqa [edx + 16], xmm3
movdqu [edx], xmm2
movdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -4173,14 +4173,14 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
......@@ -4204,10 +4204,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
......@@ -4246,8 +4246,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
......@@ -4385,14 +4385,14 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
......@@ -4416,10 +4416,10 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
......@@ -4458,8 +4458,8 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
......@@ -4666,7 +4666,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jge convertloop4
......@@ -4782,16 +4782,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 4 pixel loop.
convertloop4:
movdqa xmm3, [eax] // src argb
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movdqa xmm2, [esi] // _r_b
movdqu xmm2, [esi] // _r_b
pshufb xmm3, kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
movdqa xmm1, [esi] // _a_g
movdqu xmm1, [esi] // _a_g
lea esi, [esi + 16]
psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255
......@@ -4801,7 +4801,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jge convertloop4
jmp convertloop4b
......@@ -4827,7 +4827,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jge convertuloop4
......@@ -4883,17 +4883,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
align 4
convertloop:
movdqa xmm0, [eax] // read 4 pixels
movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm0 // first 2
pshufhw xmm2, xmm0, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm0, xmm2 // rgb * a
movdqa xmm1, [eax] // read 4 pixels
movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm1 // next 2 pixels
pshufhw xmm2, xmm1, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm1, xmm2 // rgb * a
movdqa xmm2, [eax] // alphas
movdqu xmm2, [eax] // alphas
lea eax, [eax + 16]
psrlw xmm0, 8
pand xmm2, xmm4
......@@ -4902,7 +4902,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pand xmm0, xmm5 // keep original alphas
por xmm0, xmm2
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
......@@ -5177,16 +5177,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
align 4
convertloop:
movdqa xmm0, [eax] // G
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax] // G
movdqu xmm1, [eax + 16]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm0, xmm1
paddw xmm0, xmm5 // Add .5 for rounding.
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 G bytes
movdqa xmm2, [eax] // A
movdqa xmm3, [eax + 16]
movdqu xmm2, [eax] // A
movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrld xmm2, 24
psrld xmm3, 24
......@@ -5199,8 +5199,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
punpcklwd xmm0, xmm3 // GGGA first 4
punpckhwd xmm1, xmm3 // GGGA next 4
sub ecx, 8
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
jg convertloop
ret
......@@ -5237,30 +5237,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
align 4
convertloop:
movdqa xmm0, [eax] // B
movdqa xmm6, [eax + 16]
movdqu xmm0, [eax] // B
movdqu xmm6, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 B values
movdqa xmm5, [eax] // G
movdqa xmm1, [eax + 16]
movdqu xmm5, [eax] // G
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 G values
punpcklbw xmm0, xmm5 // 8 BG values
movdqa xmm5, [eax] // R
movdqa xmm1, [eax + 16]
movdqu xmm5, [eax] // R
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 R values
movdqa xmm6, [eax] // A
movdqa xmm1, [eax + 16]
movdqu xmm6, [eax] // A
movdqu xmm1, [eax + 16]
psrld xmm6, 24
psrld xmm1, 24
packuswb xmm6, xmm1
......@@ -5270,8 +5270,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
punpcklwd xmm0, xmm5 // BGRA first 4
punpckhwd xmm1, xmm5 // BGRA next 4
sub ecx, 8
movdqa [eax], xmm0
movdqa [eax + 16], xmm1
movdqu [eax], xmm0
movdqu [eax + 16], xmm1
lea eax, [eax + 32]
jg convertloop
ret
......@@ -5300,12 +5300,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
align 4
convertloop:
movdqa xmm0, [eax] // B
movdqa xmm7, [eax + 16]
movdqu xmm0, [eax] // B
movdqu xmm7, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm7, xmm2
movdqa xmm6, [eax] // G
movdqa xmm1, [eax + 16]
movdqu xmm6, [eax] // G
movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
phaddsw xmm0, xmm7 // B
......@@ -5315,13 +5315,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
packuswb xmm0, xmm0 // 8 B values
packuswb xmm6, xmm6 // 8 G values
punpcklbw xmm0, xmm6 // 8 BG values
movdqa xmm1, [eax] // R
movdqa xmm7, [eax + 16]
movdqu xmm1, [eax] // R
movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
phaddsw xmm1, xmm7 // R
movdqa xmm6, [eax] // A
movdqa xmm7, [eax + 16]
movdqu xmm6, [eax] // A
movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
phaddsw xmm6, xmm7 // A
......@@ -5334,8 +5334,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
punpcklwd xmm0, xmm1 // BGRA first 4
punpckhwd xmm6, xmm1 // BGRA next 4
sub ecx, 8
movdqa [edx], xmm0
movdqa [edx + 16], xmm6
movdqu [edx], xmm0
movdqu [edx + 16], xmm6
lea eax, [eax + 32]
lea edx, [edx + 32]
jg convertloop
......@@ -5368,14 +5368,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
align 4
convertloop:
movdqa xmm0, [eax] // read 4 pixels
movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
pmulhuw xmm0, xmm2 // pixel * scale >> 16
movdqa xmm1, [eax] // read 4 pixels
movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
pmullw xmm0, xmm3 // * interval_size
movdqa xmm7, [eax] // read 4 pixels
movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
pand xmm7, xmm6 // mask alpha
paddw xmm0, xmm4 // + interval_size / 2
......@@ -5383,7 +5383,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
packuswb xmm0, xmm1
por xmm0, xmm7
sub ecx, 4
movdqa [eax], xmm0
movdqu [eax], xmm0
lea eax, [eax + 16]
jg convertloop
ret
......@@ -5407,7 +5407,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
align 4
convertloop:
movdqa xmm0, [eax] // read 4 pixels
movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
......@@ -5418,7 +5418,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
......@@ -5775,8 +5775,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
align 4
convertloop:
movdqa xmm0, [eax] // read 16 pixels src_sobelx
movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
movdqa xmm2, xmm0 // GG
......@@ -5793,10 +5793,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
por xmm3, xmm5 // GGGA
por xmm0, xmm5
sub ecx, 16
movdqa [edx], xmm1
movdqa [edx + 16], xmm2
movdqa [edx + 32], xmm3
movdqa [edx + 48], xmm0
movdqu [edx], xmm1
movdqu [edx + 16], xmm2
movdqu [edx + 32], xmm3
movdqu [edx + 48], xmm0
lea edx, [edx + 64]
jg convertloop
......@@ -5821,12 +5821,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
align 4
convertloop:
movdqa xmm0, [eax] // read 16 pixels src_sobelx
movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
......@@ -5856,8 +5856,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
align 4
convertloop:
movdqa xmm0, [eax] // read 16 pixels src_sobelx
movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
paddusb xmm2, xmm1 // sobel = sobelx + sobely
......@@ -5874,10 +5874,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
punpcklwd xmm7, xmm0 // Next 4
punpckhwd xmm1, xmm0 // Last 4
sub ecx, 16
movdqa [edx], xmm6
movdqa [edx + 16], xmm4
movdqa [edx + 32], xmm7
movdqa [edx + 48], xmm1
movdqu [edx], xmm6
movdqu [edx + 16], xmm4
movdqu [edx + 32], xmm7
movdqu [edx + 48], xmm1
lea edx, [edx + 64]
jg convertloop
......@@ -5933,10 +5933,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
align 4
s4:
// top left
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
......@@ -5976,10 +5976,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
align 4
l4:
// top left
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
......@@ -6028,7 +6028,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 1 pixel loop
align 4
l1:
movdqa xmm0, [eax]
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
lea eax, [eax + 16]
psubd xmm0, [esi]
......@@ -6084,26 +6084,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpckhwd xmm5, xmm1
paddd xmm0, xmm2
movdqa xmm2, [esi] // previous row above.
movdqu xmm2, [esi] // previous row above.
paddd xmm2, xmm0
paddd xmm0, xmm3
movdqa xmm3, [esi + 16]
movdqu xmm3, [esi + 16]
paddd xmm3, xmm0
paddd xmm0, xmm4
movdqa xmm4, [esi + 32]
movdqu xmm4, [esi + 32]
paddd xmm4, xmm0
paddd xmm0, xmm5
movdqa xmm5, [esi + 48]
movdqu xmm5, [esi + 48]
lea esi, [esi + 64]
paddd xmm5, xmm0
movdqa [edx], xmm2
movdqa [edx + 16], xmm3
movdqa [edx + 32], xmm4
movdqa [edx + 48], xmm5
movdqu [edx], xmm2
movdqu [edx + 16], xmm3
movdqu [edx + 32], xmm4
movdqu [edx + 48], xmm5
lea edx, [edx + 64]
sub ecx, 4
......@@ -6552,8 +6552,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pshufb xmm0, xmm5
pshufb xmm1, xmm5
......@@ -6580,8 +6580,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrld xmm0, 8 // Move green to bottom.
psrld xmm1, 8
......@@ -6605,19 +6605,19 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqa xmm5, [ecx]
movdqu xmm5, [ecx]
mov ecx, [esp + 16] // pix
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pshufb xmm0, xmm5
pshufb xmm1, xmm5
sub ecx, 8
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
jg wloop
ret
......@@ -6631,7 +6631,7 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqa xmm5, [ecx]
movdqu xmm5, [ecx]
mov ecx, [esp + 16] // pix
align 4
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment