Commit 91f240c5 authored by fbarchard@google.com

Move sub before branch for loops.

Remove CopyRow_x86
Add CopyRow_Any versions for AVX, SSE2 and Neon.
BUG=269
TESTED=local build
R=harryjin@google.com, tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/26209004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1175 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 813bf9f9
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1174
Version: 1175
License: BSD
License File: LICENSE
......
......@@ -111,7 +111,6 @@ extern "C" {
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
......@@ -877,10 +876,12 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);
void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_16_C(const uint16* src, uint16* dst, int count);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1174
#define LIBYUV_VERSION 1175
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -29,7 +29,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"lea " MEMLEA(0x10, 0) ",%0 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
......@@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
......@@ -124,13 +124,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"pmulld %%xmm5,%%xmm1 \n"
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
"sub $0x10,%1 \n"
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"sub $0x10,%1 \n"
"jg 1b \n"
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
......
......@@ -33,7 +33,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
lea eax, [eax + 16]
movdqu xmm2, [edx]
lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
......@@ -45,6 +44,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
......@@ -75,7 +75,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
......@@ -85,6 +84,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
......@@ -170,7 +170,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
......@@ -178,6 +177,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
......@@ -209,13 +209,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
......
......@@ -188,19 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
int width, int height) {
int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -209,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
......@@ -419,24 +414,14 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
dst_stride_v = -dst_stride_v;
}
// CopyRow for rows of just Y in Q420 copied to Y plane of I420.
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -444,12 +429,16 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
}
#endif
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
......
......@@ -41,19 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
if (src_y == dst_y && src_stride_y == dst_stride_y) {
return;
}
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -62,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
......@@ -93,11 +88,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_COPYROW_16_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_16_X86;
}
#endif
#if defined(HAS_COPYROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2;
......
......@@ -918,24 +918,14 @@ void RotatePlane180(const uint8* src, int src_stride,
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -943,6 +933,11 @@ void RotatePlane180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
......
......@@ -125,24 +125,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -150,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
......
......@@ -621,8 +621,6 @@ NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
#endif
#undef NANY
#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
int n = width & ~MASK; \
......@@ -659,6 +657,27 @@ MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3)
#endif
#undef MANY
// Generates a row-copy wrapper named NAMEANY that accepts any width.
//   n = width & ~MASK : leading part, handed to COPY_SIMD (skipped when n is 0).
//   r = width & MASK  : trailing remainder, handed to COPY_C.
// COPY_C is called unconditionally, so it is also invoked with r == 0.
// BPP scales n into the pointer offset at which the remainder starts.
// MASK is substituted without parentheses; pass a plain literal.
#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
int n = width & ~MASK; \
int r = width & MASK; \
if (n > 0) { \
COPY_SIMD(src_y, dst_y, n); \
} \
COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \
}
// AVX variant: CopyRow_AVX handles multiples of 64, CopyRow_C the last width & 63.
#ifdef HAS_COPYROW_AVX
MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63)
#endif
// SSE2 variant: CopyRow_SSE2 handles multiples of 32, CopyRow_C the last width & 31.
#ifdef HAS_COPYROW_SSE2
MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31)
#endif
// NEON variant: CopyRow_NEON handles multiples of 32, CopyRow_C the last width & 31.
#ifdef HAS_COPYROW_NEON
MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
#endif
// Drop the helper macro once all wrappers have been generated.
#undef MANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -296,9 +296,9 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"pshufb %%xmm4,%%xmm3 \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
......@@ -337,9 +337,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"pshufb %%xmm4,%%xmm3 \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
......@@ -725,9 +725,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
......@@ -765,9 +765,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
......@@ -837,10 +837,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
......@@ -910,10 +910,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
......@@ -961,7 +961,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"psraw $0x8,%%xmm2 \n"
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
......@@ -980,6 +979,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"lea " MEMLEA(0x40,0) ",%0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
......@@ -1038,10 +1038,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
......@@ -1080,9 +1080,9 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
......@@ -1145,10 +1145,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
......@@ -1186,9 +1186,9 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
......@@ -1223,9 +1223,9 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
......@@ -1288,10 +1288,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
......@@ -1357,10 +1357,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
......@@ -2186,9 +2186,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"1: \n"
MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -2215,9 +2215,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpermq $0x4e,%%ymm0,%%ymm0 \n"
"sub $0x20,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
......@@ -2249,9 +2249,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1)",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -2285,10 +2285,10 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $8,%3 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
......@@ -2322,9 +2322,9 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -2346,13 +2346,13 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = {
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"vmovdqa %3,%%ymm5 \n"
"vmovdqu %3,%%ymm5 \n"
LABELALIGN
"1: \n"
VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
"sub $0x20,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
......@@ -2574,21 +2574,6 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_AVX
#ifdef HAS_COPYROW_X86
// Copies a row from src to dst four bytes at a time with `rep movsl`.
// The count is divided by 4 before the copy, so only (width / 4) * 4 bytes
// are written; any remainder of width modulo 4 is not copied.
// width is cast to size_t, so it is assumed to be non-negative
// (TODO confirm against callers).
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile (
// Convert the byte count into a count of 4-byte units.
"shr $0x2,%2 \n"
// String move repeated by the count register; MEMMOVESTRING presumably
// supplies the explicit source/destination operands (defined elsewhere).
"rep movsl " MEMMOVESTRING(0,1) " \n"
// "S", "D" and "c" pin the operands to the si, di and cx registers.
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
:
: "memory", "cc"
);
}
#endif // HAS_COPYROW_X86
#ifdef HAS_COPYROW_ERMS
// Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
......@@ -2894,9 +2879,9 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
......@@ -3006,9 +2991,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"sub $0x20,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
......@@ -3119,9 +3104,9 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"sub $0x20,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
......@@ -3263,9 +3248,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
......@@ -3295,9 +3280,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jge 41b \n"
"49: \n"
......@@ -3326,9 +3311,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
......@@ -3398,9 +3383,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
......@@ -3428,9 +3413,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
"49: \n"
......@@ -3457,9 +3442,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
......@@ -3505,9 +3490,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"packuswb %%xmm1,%%xmm0 \n"
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -3558,9 +3543,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -3603,9 +3588,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpor %%ymm6,%%ymm0,%%ymm0 \n"
"sub $0x8,%2 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
......@@ -3651,9 +3636,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pmulhuw %%xmm2,%%xmm1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -3723,9 +3708,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"sub $0x8,%2 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
......@@ -3776,10 +3761,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm3,%%xmm0 \n"
"punpckhwd %%xmm3,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -3853,10 +3838,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm5,%%xmm0 \n"
"punpckhwd %%xmm5,%%xmm1 \n"
"sub $0x8,%1 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%1 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
......@@ -3919,11 +3904,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"movdqa %%xmm0,%%xmm6 \n"
"punpcklwd %%xmm1,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm6 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -3972,9 +3957,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"paddw %%xmm4,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n"
"sub $0x4,%1 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x4,%1 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
......@@ -4011,9 +3996,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -4050,9 +4035,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
......@@ -4119,9 +4104,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
......@@ -4179,9 +4164,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"psubusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
......@@ -4264,9 +4249,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"psubw %%xmm0,%%xmm1 \n"
"pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"sub $0x8,%4 \n"
MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
"lea " MEMLEA(0x8,0) ",%0 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
......@@ -4322,9 +4307,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"psubw %%xmm0,%%xmm1 \n"
"pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"sub $0x8,%3 \n"
MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
"lea " MEMLEA(0x8,0) ",%0 \n"
"sub $0x8,%3 \n"
"jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
......@@ -4375,12 +4360,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"punpckhwd %%xmm0,%%xmm0 \n"
"por %%xmm5,%%xmm3 \n"
"por %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqu %%xmm1," MEMACCESS(2) " \n"
"movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
"movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
"movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......@@ -4414,9 +4399,9 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......@@ -4466,12 +4451,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"movdqa %%xmm1,%%xmm7 \n"
"punpcklwd %%xmm0,%%xmm7 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"sub $0x10,%3 \n"
"movdqu %%xmm6," MEMACCESS(2) " \n"
"movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
"movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
"movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......@@ -4757,9 +4742,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
"punpckldq %%xmm6,%%xmm0 \n"
"addps %%xmm4,%%xmm3 \n"
"sub $0x4,%4 \n"
"movq %%xmm0," MEMACCESS2(0x08,2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
......@@ -4775,9 +4760,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"addps %%xmm7,%%xmm2 \n"
"movd %%xmm0,%k1 \n"
MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
"sub $0x1,%4 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x04,2) ",%2 \n"
"sub $0x1,%4 \n"
"jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
......@@ -4836,9 +4821,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
......@@ -4849,9 +4834,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 25b \n"
"jmp 99f \n"
......@@ -4861,9 +4846,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 50b \n"
"jmp 99f \n"
......@@ -4874,9 +4859,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm0)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 75b \n"
"jmp 99f \n"
......@@ -4884,9 +4869,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
LABELALIGN
"100: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 100b \n"
"99: \n"
......@@ -4952,9 +4937,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
......@@ -4965,9 +4950,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 25b \n"
"jmp 99f \n"
......@@ -4977,9 +4962,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 50b \n"
"jmp 99f \n"
......@@ -4990,9 +4975,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 75b \n"
"jmp 99f \n"
......@@ -5000,9 +4985,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
LABELALIGN
"100: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 100b \n"
"99: \n"
......@@ -5037,9 +5022,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"punpckldq %%xmm1,%%xmm0 \n"
"sub $0x8,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
......@@ -5070,9 +5055,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"pand %%xmm5,%%xmm1 \n"
"packssdw %%xmm1,%%xmm0 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x8,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
......@@ -5099,10 +5084,10 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -5129,10 +5114,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
"sub $0x10,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
......@@ -5196,9 +5181,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x1b,%%xmm1,%%xmm1 \n"
"pshuflw $0x1b,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%3 \n"
"jg 123b \n"
"jmp 99f \n"
......@@ -5214,9 +5199,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x39,%%xmm1,%%xmm1 \n"
"pshuflw $0x39,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%3 \n"
"jg 321b \n"
"jmp 99f \n"
......@@ -5232,9 +5217,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x93,%%xmm1,%%xmm1 \n"
"pshuflw $0x93,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%3 \n"
"jg 2103b \n"
"jmp 99f \n"
......@@ -5250,9 +5235,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0xc6,%%xmm1,%%xmm1 \n"
"pshuflw $0xc6,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%3 \n"
"jg 3012b \n"
"99: \n"
......@@ -5394,9 +5379,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
"cvttps2dq %%xmm4,%%xmm4 \n"
"packuswb %%xmm4,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"sub $0x2,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x2,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -5435,9 +5420,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
"vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
"sub $0x2,%2 \n"
"vmovq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x2,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
......@@ -5597,9 +5582,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"mov %b0," MEMACCESS2(0xe,3) " \n"
"movzb " MEMACCESS2(0xf,2) ",%0 \n"
"mov %b0," MEMACCESS2(0xf,3) " \n"
"sub $0x4,%4 \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"lea " MEMLEA(0x10,3) ",%3 \n"
"sub $0x4,%4 \n"
"jg 1b \n"
: "+d"(pixel_temp), // %0
"+a"(table_temp), // %1
......
......@@ -313,9 +313,9 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
pshufb xmm3, xmm4
movdqu [edx + 16], xmm1
por xmm3, xmm5
sub ecx, 16
movdqu [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
jg convertloop
ret
}
......@@ -353,9 +353,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
pshufb xmm3, xmm4
movdqu [edx + 16], xmm1
por xmm3, xmm5
sub ecx, 16
movdqu [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
jg convertloop
ret
}
......@@ -728,9 +728,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -764,9 +764,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -782,7 +782,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToY
vbroadcastf128 ymm5, kAddY16
vmovdqa ymm6, kPermdARGBToY_AVX
vmovdqu ymm6, kPermdARGBToY_AVX
align 4
convertloop:
......@@ -802,9 +802,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
vpaddb ymm0, ymm0, ymm5
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg convertloop
vzeroupper
ret
......@@ -822,7 +822,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToYJ
vbroadcastf128 ymm5, kAddYJ64
vmovdqa ymm6, kPermdARGBToY_AVX
vmovdqu ymm6, kPermdARGBToY_AVX
align 4
convertloop:
......@@ -843,9 +843,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg convertloop
vzeroupper
......@@ -880,9 +880,9 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -914,9 +914,9 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -948,9 +948,9 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -1015,10 +1015,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
......@@ -1087,10 +1087,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
packsswb xmm0, xmm1
// step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
......@@ -1152,10 +1152,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
sub ecx, 32
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
pop edi
......@@ -1197,7 +1197,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
psraw xmm2, 8
packsswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0
movdqu xmm0, [eax] // V
......@@ -1217,6 +1216,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
lea eax, [eax + 64]
movdqu [edx + edi], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
pop edi
......@@ -1272,10 +1272,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
......@@ -1342,10 +1342,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
......@@ -1413,10 +1413,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
......@@ -1484,10 +1484,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
......@@ -2043,9 +2043,9 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
por xmm3, xmm2 // BG
por xmm1, xmm3 // BGR
packssdw xmm0, xmm1
sub ecx, 8
movdqu [edx], xmm0 // store 8 pixels of RGB565
lea edx, [edx + 16]
sub ecx, 8
jg convertloop
pop edi
......@@ -2411,9 +2411,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
convertloop:
movdqu xmm0, [eax - 16 + ecx]
pshufb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -2434,9 +2434,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu ymm0, [eax - 32 + ecx]
vpshufb ymm0, ymm0, ymm5
vpermq ymm0, ymm0, 0x4e // swap high and low halfs
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg convertloop
vzeroupper
ret
......@@ -2462,9 +2462,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
pshuflw xmm0, xmm0, 0x1b // swap words
pshufhw xmm0, xmm0, 0x1b
pshufd xmm0, xmm0, 0x4e // swap qwords
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -2495,10 +2495,10 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
movdqu xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg convertloop
pop edi
......@@ -2527,9 +2527,9 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
movdqu xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm5
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg convertloop
ret
}
......@@ -2548,14 +2548,14 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
vmovdqa ymm5, kARGBShuffleMirror_AVX2
vmovdqu ymm5, kARGBShuffleMirror_AVX2
align 4
convertloop:
vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
sub ecx, 8
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
vzeroupper
ret
......@@ -2773,25 +2773,6 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
}
}
#ifdef HAS_COPYROW_X86
// Copies a row of bytes from src to dst, one 32-bit dword at a time, using
// 'rep movsd'.
// Multiple of 4.
// Only count / 4 dwords are transferred (count is shifted right by 2), so any
// trailing count % 4 bytes are not copied.
// The function is declared naked: there is no compiler-generated prologue or
// epilogue, so arguments are read directly relative to esp and the function
// issues its own 'ret'.
// Uses eax, edx and ecx as scratch; esi and edi are restored before returning.
// NOTE(review): assumes the direction flag is clear on entry so the copy runs
// forward - confirm against the calling convention in use.
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
__asm {
// Stash esi/edi in eax/edx instead of pushing them, so esp is unchanged and
// the argument offsets below stay at +4/+8/+12.
mov eax, esi
mov edx, edi
mov esi, [esp + 4] // src
mov edi, [esp + 8] // dst
mov ecx, [esp + 12] // count
// Convert the byte count into a dword count for movsd.
shr ecx, 2
// Copy ecx dwords from [esi] to [edi].
rep movsd
// Put back the caller's edi/esi saved on entry.
mov edi, edx
mov esi, eax
ret
}
}
#endif // HAS_COPYROW_X86
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
__declspec(naked) __declspec(align(16))
......@@ -2998,9 +2979,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg convertloop
vzeroupper
ret
......@@ -3109,9 +3090,9 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg convertloop
vzeroupper
ret
......@@ -3223,9 +3204,9 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -3328,9 +3309,9 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
......@@ -3466,9 +3447,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
jge alignloop1
alignloop1b:
......@@ -3497,9 +3478,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jge convertloop4
convertloop4b:
......@@ -3528,9 +3509,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
jge convertloop1
convertloop1b:
......@@ -3598,9 +3579,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
jge alignloop1
alignloop1b:
......@@ -3627,9 +3608,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jge convertloop4
convertloop4b:
......@@ -3656,9 +3637,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
jge convertloop1
convertloop1b:
......@@ -3701,9 +3682,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
packuswb xmm0, xmm1
pand xmm0, xmm5 // keep original alphas
por xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg convertloop
ret
......@@ -3750,9 +3731,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
psrlw xmm1, 8
packuswb xmm0, xmm1
por xmm0, xmm2 // copy original alpha
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg convertloop
ret
......@@ -3790,9 +3771,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // unmutated.
vpor ymm0, ymm0, ymm6 // copy original alpha
sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
sub ecx, 8
jg convertloop
vzeroupper
......@@ -3839,9 +3820,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 16]
packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg convertloop
pop edi
pop esi
......@@ -3883,9 +3864,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
vpackuswb ymm0, ymm0, ymm1 // unmutated.
sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
sub ecx, 8
jg convertloop
vzeroupper
......@@ -3945,9 +3926,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
vpackuswb ymm0, ymm0, ymm1 // unmutated.
sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
sub ecx, 8
jg convertloop
pop edi
......@@ -3993,10 +3974,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm1, xmm0
punpcklwd xmm0, xmm3 // GGGA first 4
punpckhwd xmm1, xmm3 // GGGA next 4
sub ecx, 8
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
ret
}
......@@ -4064,10 +4045,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
movdqa xmm1, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm5 // BGRA first 4
punpckhwd xmm1, xmm5 // BGRA next 4
sub ecx, 8
movdqu [eax], xmm0
movdqu [eax + 16], xmm1
lea eax, [eax + 32]
sub ecx, 8
jg convertloop
ret
}
......@@ -4128,11 +4109,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movdqa xmm6, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm1 // BGRA first 4
punpckhwd xmm6, xmm1 // BGRA next 4
sub ecx, 8
movdqu [edx], xmm0
movdqu [edx + 16], xmm6
lea eax, [eax + 32]
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
ret
}
......@@ -4176,9 +4157,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
sub ecx, 4
movdqu [eax], xmm0
lea eax, [eax + 16]
sub ecx, 4
jg convertloop
ret
}
......@@ -4210,9 +4191,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg convertloop
ret
......@@ -4248,9 +4229,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg convertloop
pop esi
......@@ -4282,9 +4263,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jge convertloop4
convertloop49:
......@@ -4297,9 +4278,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
movd xmm1, [esi] // read 1 pixels from src_argb1
lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
jge convertloop1
convertloop19:
......@@ -4328,9 +4309,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb0 - src_argb1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg convertloop
pop esi
......@@ -4482,9 +4463,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
psubw xmm1, xmm0
pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8]
sub ecx, 8
jg convertloop
pop edi
......@@ -4536,9 +4517,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
psubw xmm1, xmm0
pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8]
sub ecx, 8
jg convertloop
pop esi
......@@ -4585,12 +4566,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
punpckhwd xmm0, xmm0 // Last 4
por xmm3, xmm5 // GGGA
por xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm1
movdqu [edx + 16], xmm2
movdqu [edx + 32], xmm3
movdqu [edx + 48], xmm0
lea edx, [edx + 64]
sub ecx, 16
jg convertloop
pop esi
......@@ -4618,9 +4599,9 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
pop esi
......@@ -4666,12 +4647,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
movdqa xmm7, xmm1 // YSXA
punpcklwd xmm7, xmm0 // Next 4
punpckhwd xmm1, xmm0 // Last 4
sub ecx, 16
movdqu [edx], xmm6
movdqu [edx + 16], xmm4
movdqu [edx + 32], xmm7
movdqu [edx + 48], xmm1
lea edx, [edx + 64]
sub ecx, 16
jg convertloop
pop esi
......@@ -4983,9 +4964,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
movd xmm0, [eax + edi] // read pixel 3
punpckldq xmm6, xmm0 // combine pixel 2 and 3
addps xmm3, xmm4 // x, y += dx, dy next 2
sub ecx, 4
movq qword ptr 8[edx], xmm6
lea edx, [edx + 16]
sub ecx, 4
jge l4
l4b:
......@@ -5001,9 +4982,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
addps xmm2, xmm7 // x, y += dx, dy
movd esi, xmm0
movd xmm0, [eax + esi] // copy a pixel
sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
jge l1
l1b:
pop edi
......@@ -5059,9 +5040,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpsrlw ymm0, ymm0, 7
vpsrlw ymm1, ymm1, 7
vpackuswb ymm0, ymm0, ymm1 // unmutates
sub ecx, 32
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
jg xloop
jmp xloop99
......@@ -5072,9 +5053,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vmovdqu ymm1, [esi + edx]
vpavgb ymm0, ymm0, ymm1
vpavgb ymm0, ymm0, ymm1
sub ecx, 32
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
jg xloop25
jmp xloop99
......@@ -5083,9 +5064,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
xloop50:
vmovdqu ymm0, [esi]
vpavgb ymm0, ymm0, [esi + edx]
sub ecx, 32
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
jg xloop50
jmp xloop99
......@@ -5096,9 +5077,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vmovdqu ymm0, [esi + edx]
vpavgb ymm0, ymm0, ymm1
vpavgb ymm0, ymm0, ymm1
sub ecx, 32
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
jg xloop75
jmp xloop99
......@@ -5161,9 +5142,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
psrlw xmm0, 7
psrlw xmm1, 7
packuswb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop
jmp xloop99
......@@ -5174,9 +5155,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop25
jmp xloop99
......@@ -5186,9 +5167,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop50
jmp xloop99
......@@ -5199,9 +5180,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop75
jmp xloop99
......@@ -5209,9 +5190,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
align 4
xloop100:
movdqu xmm0, [esi]
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop100
xloop99:
......@@ -5273,9 +5254,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
paddw xmm0, xmm2 // sum rows
paddw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop
jmp xloop99
......@@ -5286,9 +5267,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop25
jmp xloop99
......@@ -5298,9 +5279,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop50
jmp xloop99
......@@ -5311,9 +5292,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop75
jmp xloop99
......@@ -5321,9 +5302,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
align 4
xloop100:
movdqu xmm0, [esi]
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop100
xloop99:
......@@ -5352,9 +5333,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
pshufb xmm0, xmm5
pshufb xmm1, xmm5
punpckldq xmm0, xmm1
sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg wloop
ret
}
......@@ -5383,9 +5364,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
pand xmm1, xmm5
packssdw xmm0, xmm1
packuswb xmm0, xmm1
sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg wloop
ret
}
......@@ -5409,10 +5390,10 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 32]
pshufb xmm0, xmm5
pshufb xmm1, xmm5
sub ecx, 8
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg wloop
ret
}
......@@ -5436,10 +5417,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 64]
vpshufb ymm0, ymm0, ymm5
vpshufb ymm1, ymm1, ymm5
sub ecx, 16
vmovdqu [edx], ymm0
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 16
jg wloop
vzeroupper
......@@ -5502,9 +5483,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 01Bh
pshuflw xmm1, xmm1, 01Bh
packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg shuf_0123
jmp shuf99
......@@ -5520,9 +5501,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 039h
pshuflw xmm1, xmm1, 039h
packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg shuf_0321
jmp shuf99
......@@ -5538,9 +5519,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 093h
pshuflw xmm1, xmm1, 093h
packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg shuf_2103
jmp shuf99
......@@ -5556,9 +5537,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 0C6h
pshuflw xmm1, xmm1, 0C6h
packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg shuf_3012
shuf99:
......@@ -5700,9 +5681,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
cvttps2dq xmm4, xmm4
packuswb xmm0, xmm4
packuswb xmm0, xmm0
sub ecx, 2
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 2
jg convertloop
pop esi
ret
......@@ -5740,9 +5721,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
sub ecx, 2
vmovq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 2
jg convertloop
vzeroupper
ret
......@@ -5905,9 +5886,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movzx edx, byte ptr [eax + 15] // copy alpha.
mov byte ptr [edi + 15], dl
sub ecx, 4
lea eax, [eax + 16]
lea edi, [edi + 16]
sub ecx, 4
jg convertloop
pop edi
......
......@@ -534,11 +534,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm0,%%xmm1 \n"
"pmulhuw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"sub $0x6,%2 \n"
"movd %%xmm1," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm1 \n"
"movd %%xmm1," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -602,11 +602,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm7,%%xmm6 \n"
"pmulhuw %%xmm4,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"sub $0x6,%2 \n"
"movd %%xmm6," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm6 \n"
"movd %%xmm6," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -765,10 +765,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"sub $0x20,%2 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(dst_ptr), // %0
......@@ -792,9 +792,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -820,9 +820,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -852,9 +852,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -890,9 +890,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"punpckldq %%xmm3,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
......@@ -941,9 +941,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
......@@ -997,9 +997,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"pextrw $0x3,%%xmm2,%k1 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"punpcklqdq %%xmm1,%%xmm0 \n"
"sub $0x4,%4 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
......@@ -1046,10 +1046,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
"movdqa %%xmm0,%%xmm1 \n"
"punpckldq %%xmm0,%%xmm0 \n"
"punpckhdq %%xmm1,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
......
......@@ -111,9 +111,9 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
ret
......@@ -149,9 +149,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
ret
......@@ -192,9 +192,9 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
pop esi
......@@ -226,9 +226,9 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
packuswb xmm0, xmm1
psrlw xmm0, 8
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg wloop
ret
......@@ -285,9 +285,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm0, xmm2
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg wloop
pop edi
......@@ -398,9 +398,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx + 24]
sub ecx, 24
jg wloop
pop esi
......@@ -460,9 +460,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx+24]
sub ecx, 24
jg wloop
pop esi
......@@ -493,11 +493,11 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
pshufb xmm1, xmm5
paddusb xmm0, xmm1
sub ecx, 12
movq qword ptr [edx], xmm0 // write 12 pixels
movhlps xmm1, xmm0
movd [edx + 8], xmm1
lea edx, [edx + 12]
sub ecx, 12
jg xloop
ret
......@@ -558,11 +558,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
packuswb xmm6, xmm6
sub ecx, 6
movd [edx], xmm6 // write 6 pixels
psrlq xmm6, 16
movd [edx + 2], xmm6
lea edx, [edx + 6]
sub ecx, 6
jg xloop
pop esi
......@@ -604,11 +604,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
packuswb xmm1, xmm1
sub ecx, 6
movd [edx], xmm1 // write 6 pixels
psrlq xmm1, 16
movd [edx + 2], xmm1
lea edx, [edx + 6]
sub ecx, 6
jg xloop
pop esi
......@@ -784,10 +784,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1
sub ecx, 32
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 32
jg wloop
ret
......@@ -812,9 +812,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
shufps xmm0, xmm1, 0xdd
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
ret
......@@ -842,9 +842,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
ret
......@@ -877,9 +877,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop esi
......@@ -914,9 +914,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea eax, [eax + ebx * 4]
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop edi
......@@ -963,9 +963,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop edi
......@@ -1021,9 +1021,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
punpckldq xmm1, xmm4 // x2 x3
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
sub ecx, 4 // 4 pixels
movdqu [edi], xmm0
lea edi, [edi + 16]
sub ecx, 4 // 4 pixels
jge xloop4
align 4
......@@ -1160,10 +1160,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
movdqa xmm1, xmm0
punpckldq xmm0, xmm0
punpckhdq xmm1, xmm1
sub ecx, 8
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg wloop
ret
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment