Commit 91f240c5 authored by fbarchard@google.com

Move sub to just before the branch in loops.

Remove CopyRow_X86.
Add CopyRow_Any versions for AVX, SSE2 and NEON.
BUG=269
TESTED=local build
R=harryjin@google.com, tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/26209004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1175 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 813bf9f9
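
In loop terms, the reordering looks like the sketch below (pattern taken from the ScaleRowDown2_SSE2 hunk later in this diff). Neither movdqu nor lea writes the flags, so moving the sub does not change behavior; it simply leaves the counter decrement adjacent to the conditional branch that consumes its flags, and on CPUs that can fuse sub/jcc the pair can issue as one macro-op.

// Before: sub executes early, several instructions ahead of the branch.
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

// After: sub is the last flag-setting instruction before jg.
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop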
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1174
Version: 1175
License: BSD
License File: LICENSE
......
......@@ -111,7 +111,6 @@ extern "C" {
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
......@@ -877,10 +876,12 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);
void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_16_C(const uint16* src, uint16* dst, int count);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1174
#define LIBYUV_VERSION 1175
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -29,7 +29,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"lea " MEMLEA(0x10, 0) ",%0 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
......@@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
......@@ -124,13 +124,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"pmulld %%xmm5,%%xmm1 \n"
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
"sub $0x10,%1 \n"
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"sub $0x10,%1 \n"
"jg 1b \n"
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
......
......@@ -33,7 +33,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
lea eax, [eax + 16]
movdqu xmm2, [edx]
lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
......@@ -45,6 +44,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
......@@ -75,7 +75,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
......@@ -85,6 +84,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
......@@ -170,7 +170,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
......@@ -178,6 +177,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
......@@ -209,13 +209,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
......
......@@ -188,19 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
int width, int height) {
int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -209,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
......@@ -419,24 +414,14 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
dst_stride_v = -dst_stride_v;
}
// CopyRow for rows of just Y in Q420 copied to Y plane of I420.
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -444,12 +429,16 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
}
#endif
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
......
......@@ -41,19 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
if (src_y == dst_y && src_stride_y == dst_stride_y) {
return;
}
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -62,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
......@@ -93,11 +88,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_COPYROW_16_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_16_X86;
}
#endif
#if defined(HAS_COPYROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2;
......
......@@ -918,24 +918,14 @@ void RotatePlane180(const uint8* src, int src_stride,
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -943,6 +933,11 @@ void RotatePlane180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
......
......@@ -125,24 +125,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
CopyRow = CopyRow_AVX;
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
......@@ -150,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
......
......@@ -621,8 +621,6 @@ NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
#endif
#undef NANY
#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
int n = width & ~MASK; \
......@@ -659,6 +657,27 @@ MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3)
#endif
#undef MANY
#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
int n = width & ~MASK; \
int r = width & MASK; \
if (n > 0) { \
COPY_SIMD(src_y, dst_y, n); \
} \
COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \
}
#ifdef HAS_COPYROW_AVX
MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63)
#endif
#ifdef HAS_COPYROW_SSE2
MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31)
#endif
#ifdef HAS_COPYROW_NEON
MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
#endif
#undef MANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
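For reference, a hand-expanded sketch of what MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31) above generates: the SSE2 kernel copies the largest multiple of 32 bytes and CopyRow_C handles the 0..31 byte remainder, which is what lets callers select CopyRow_Any_SSE2 when the width is not a multiple of 32.

void CopyRow_Any_SSE2(const uint8* src_y, uint8* dst_y, int width) {
  int n = width & ~31;  // largest multiple of 32 that fits in width
  int r = width & 31;   // remaining 0..31 bytes
  if (n > 0) {
    CopyRow_SSE2(src_y, dst_y, n);  // bulk of the row with SSE2
  }
  CopyRow_C(src_y + n * 1, dst_y + n * 1, r);  // tail in C (BPP == 1)
}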
......@@ -534,11 +534,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm0,%%xmm1 \n"
"pmulhuw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"sub $0x6,%2 \n"
"movd %%xmm1," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm1 \n"
"movd %%xmm1," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -602,11 +602,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm7,%%xmm6 \n"
"pmulhuw %%xmm4,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"sub $0x6,%2 \n"
"movd %%xmm6," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm6 \n"
"movd %%xmm6," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -765,10 +765,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"sub $0x20,%2 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(dst_ptr), // %0
......@@ -792,9 +792,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -820,9 +820,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -852,9 +852,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -890,9 +890,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"punpckldq %%xmm3,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
......@@ -941,9 +941,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
......@@ -997,9 +997,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"pextrw $0x3,%%xmm2,%k1 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"punpcklqdq %%xmm1,%%xmm0 \n"
"sub $0x4,%4 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
......@@ -1046,10 +1046,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
"movdqa %%xmm0,%%xmm1 \n"
"punpckldq %%xmm0,%%xmm0 \n"
"punpckhdq %%xmm1,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
......
......@@ -111,9 +111,9 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
ret
......@@ -149,9 +149,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
ret
......@@ -192,9 +192,9 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
pop esi
......@@ -226,9 +226,9 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
packuswb xmm0, xmm1
psrlw xmm0, 8
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg wloop
ret
......@@ -285,9 +285,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm0, xmm2
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg wloop
pop edi
......@@ -398,9 +398,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx + 24]
sub ecx, 24
jg wloop
pop esi
......@@ -460,9 +460,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx+24]
sub ecx, 24
jg wloop
pop esi
......@@ -493,11 +493,11 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
pshufb xmm1, xmm5
paddusb xmm0, xmm1
sub ecx, 12
movq qword ptr [edx], xmm0 // write 12 pixels
movhlps xmm1, xmm0
movd [edx + 8], xmm1
lea edx, [edx + 12]
sub ecx, 12
jg xloop
ret
......@@ -558,11 +558,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
packuswb xmm6, xmm6
sub ecx, 6
movd [edx], xmm6 // write 6 pixels
psrlq xmm6, 16
movd [edx + 2], xmm6
lea edx, [edx + 6]
sub ecx, 6
jg xloop
pop esi
......@@ -604,11 +604,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
packuswb xmm1, xmm1
sub ecx, 6
movd [edx], xmm1 // write 6 pixels
psrlq xmm1, 16
movd [edx + 2], xmm1
lea edx, [edx + 6]
sub ecx, 6
jg xloop
pop esi
......@@ -784,10 +784,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1
sub ecx, 32
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 32
jg wloop
ret
......@@ -812,9 +812,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
shufps xmm0, xmm1, 0xdd
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
ret
......@@ -842,9 +842,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
ret
......@@ -877,9 +877,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop esi
......@@ -914,9 +914,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea eax, [eax + ebx * 4]
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop edi
......@@ -963,9 +963,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop edi
......@@ -1021,9 +1021,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
punpckldq xmm1, xmm4 // x2 x3
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
sub ecx, 4 // 4 pixels
movdqu [edi], xmm0
lea edi, [edi + 16]
sub ecx, 4 // 4 pixels
jge xloop4
align 4
......@@ -1160,10 +1160,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
movdqa xmm1, xmm0
punpckldq xmm0, xmm0
punpckhdq xmm1, xmm1
sub ecx, 8
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg wloop
ret
......