Commit 91f240c5 authored by fbarchard@google.com

Move sub before branch for loops.

Remove CopyRow_X86.
Add CopyRow_Any versions for AVX, SSE2 and NEON.
BUG=269
TESTED=local build
R=harryjin@google.com, tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/26209004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1175 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 813bf9f9
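
The loop change is mechanical: in each loop touched below, the sub that decrements the pixel counter moves from the middle of the loop body to the instruction immediately before the conditional branch, so the flags the branch tests come straight from the counter update instead of having to survive the intervening instructions, and on many recent x86 cores the sub/jg pair can macro-fuse. A minimal sketch of the resulting loop shape, in the inline-asm style of the files below (illustration only, not libyuv code; it assumes an x86 target with SSE2 and a count that is a positive multiple of 16):

#include <stdint.h>

// Sketch: copy 'count' bytes, 16 at a time, with the counter update ("sub")
// placed right before the branch ("jg"), exactly as in the hunks below.
static void CopyRow16_sketch(const uint8_t* src, uint8_t* dst, int count) {
  __asm__ volatile (
    "1:                            \n"
    "movdqu (%0),%%xmm0            \n"
    "lea    0x10(%0),%0            \n"
    "movdqu %%xmm0,(%1)            \n"
    "lea    0x10(%1),%1            \n"
    "sub    $0x10,%2               \n"  // counter update last ...
    "jg     1b                     \n"  // ... so its flags feed the branch.
    : "+r"(src), "+r"(dst), "+r"(count)
    :
    : "memory", "cc", "xmm0");
}
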
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1174
+Version: 1175
 License: BSD
 License File: LICENSE
...
@@ -111,7 +111,6 @@ extern "C" {
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
-#define HAS_COPYROW_X86
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I411TOARGBROW_SSSE3
 #define HAS_I422TOARGB1555ROW_SSSE3
@@ -877,10 +876,12 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
 void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
 void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
 void CopyRow_16_C(const uint16* src, uint16* dst, int count);
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1174
+#define LIBYUV_VERSION 1175
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -29,7 +29,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     "lea " MEMLEA(0x10, 0) ",%0 \n"
     "movdqu " MEMACCESS(1) ",%%xmm2 \n"
     "lea " MEMLEA(0x10, 1) ",%1 \n"
-    "sub $0x10,%2 \n"
     "movdqa %%xmm1,%%xmm3 \n"
     "psubusb %%xmm2,%%xmm1 \n"
     "psubusb %%xmm3,%%xmm2 \n"
@@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     "pmaddwd %%xmm2,%%xmm2 \n"
     "paddd %%xmm1,%%xmm0 \n"
     "paddd %%xmm2,%%xmm0 \n"
+    "sub $0x10,%2 \n"
     "jg 1b \n"
     "pshufd $0xee,%%xmm0,%%xmm1 \n"
@@ -124,13 +124,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     "pmulld %%xmm5,%%xmm1 \n"
     "paddd %%xmm4,%%xmm3 \n"
     "paddd %%xmm2,%%xmm1 \n"
-    "sub $0x10,%1 \n"
     "paddd %%xmm3,%%xmm1 \n"
     "pshufd $0xe,%%xmm1,%%xmm2 \n"
     "paddd %%xmm2,%%xmm1 \n"
     "pshufd $0x1,%%xmm1,%%xmm2 \n"
     "paddd %%xmm2,%%xmm1 \n"
     "paddd %%xmm1,%%xmm0 \n"
+    "sub $0x10,%1 \n"
     "jg 1b \n"
     "movd %%xmm0,%3 \n"
   : "+r"(src), // %0
...
@@ -33,7 +33,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     lea eax, [eax + 16]
     movdqu xmm2, [edx]
     lea edx, [edx + 16]
-    sub ecx, 16
     movdqa xmm3, xmm1 // abs trick
     psubusb xmm1, xmm2
     psubusb xmm2, xmm3
@@ -45,6 +44,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     pmaddwd xmm2, xmm2
     paddd xmm0, xmm1
     paddd xmm0, xmm2
+    sub ecx, 16
     jg wloop
     pshufd xmm1, xmm0, 0xee
@@ -75,7 +75,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
     vmovdqu ymm1, [eax]
     vmovdqu ymm2, [eax + edx]
     lea eax, [eax + 32]
-    sub ecx, 32
     vpsubusb ymm3, ymm1, ymm2 // abs difference trick
     vpsubusb ymm2, ymm2, ymm1
     vpor ymm1, ymm2, ymm3
@@ -85,6 +84,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
     vpmaddwd ymm1, ymm1, ymm1
     vpaddd ymm0, ymm0, ymm1
     vpaddd ymm0, ymm0, ymm2
+    sub ecx, 32
     jg wloop
     vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
@@ -170,7 +170,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     pmulld(0xcd) // pmulld xmm1, xmm5
     paddd xmm3, xmm4 // add 16 results
     paddd xmm1, xmm2
-    sub ecx, 16
     paddd xmm1, xmm3
     pshufd xmm2, xmm1, 0x0e // upper 2 dwords
@@ -178,6 +177,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     pshufd xmm2, xmm1, 0x01
     paddd xmm1, xmm2
     paddd xmm0, xmm1
+    sub ecx, 16
     jg wloop
     movd eax, xmm0 // return hash
@@ -209,13 +209,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
     pmulld xmm1, kHashMul3
     paddd xmm3, xmm4 // add 16 results
     paddd xmm1, xmm2
-    sub ecx, 16
     paddd xmm1, xmm3
     pshufd xmm2, xmm1, 0x0e // upper 2 dwords
     paddd xmm1, xmm2
     pshufd xmm2, xmm1, 0x01
     paddd xmm1, xmm2
     paddd xmm0, xmm1
+    sub ecx, 16
     jg wloop
     movd eax, xmm0 // return hash
...
@@ -188,19 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
                        int width, int height) {
   int y;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
-  }
-#endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
 #if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_AVX;
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -209,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
   }
 #endif
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
 #if defined(HAS_COPYROW_MIPS)
@@ -419,24 +414,14 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
     dst_stride_v = -dst_stride_v;
   }
   // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
-  }
-#endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
 #if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_AVX;
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -444,12 +429,16 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
     CopyRow = CopyRow_ERMS;
   }
 #endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
   if (TestCpuFlag(kCpuHasMIPS)) {
     CopyRow = CopyRow_MIPS;
   }
 #endif
 #if defined(HAS_YUY2TOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
...
@@ -41,19 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
   if (src_y == dst_y && src_stride_y == dst_stride_y) {
     return;
   }
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
-  }
-#endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
 #if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_AVX;
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -62,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
 #if defined(HAS_COPYROW_MIPS)
@@ -93,11 +88,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
     height = 1;
     src_stride_y = dst_stride_y = 0;
   }
-#if defined(HAS_COPYROW_16_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_16_X86;
-  }
-#endif
 #if defined(HAS_COPYROW_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_16_SSE2;
...
@@ -918,24 +918,14 @@ void RotatePlane180(const uint8* src, int src_stride,
     MirrorRow = MirrorRow_MIPS_DSPR2;
   }
 #endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
-  }
-#endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
 #if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_AVX;
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -943,6 +933,11 @@ void RotatePlane180(const uint8* src, int src_stride,
     CopyRow = CopyRow_ERMS;
   }
 #endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
   if (TestCpuFlag(kCpuHasMIPS)) {
     CopyRow = CopyRow_MIPS;
...
@@ -125,24 +125,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
     }
   }
 #endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    CopyRow = CopyRow_X86;
-  }
-#endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
 #if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
-    CopyRow = CopyRow_AVX;
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -150,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
     CopyRow = CopyRow_ERMS;
   }
 #endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
   if (TestCpuFlag(kCpuHasMIPS)) {
     CopyRow = CopyRow_MIPS;
...
@@ -621,8 +621,6 @@ NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
 #endif
 #undef NANY
 #define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \
     void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
       int n = width & ~MASK; \
@@ -659,6 +657,27 @@ MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3)
 #endif
 #undef MANY
+#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \
+    void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
+      int n = width & ~MASK; \
+      int r = width & MASK; \
+      if (n > 0) { \
+        COPY_SIMD(src_y, dst_y, n); \
+      } \
+      COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \
+    }
+#ifdef HAS_COPYROW_AVX
+MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
+#endif
+#undef MANY
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
...
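
For reference, here is roughly what one of the new MANY instantiations above expands to, written out as plain C with a stand-in for the SIMD kernel (a sketch only; the _sketch names are illustrative and not libyuv API). The Any wrapper runs the SIMD kernel on the largest multiple of MASK + 1 bytes and lets the C kernel finish the remainder, so a width of 100 with MASK 31 gives n = 96 bytes for the SIMD path and r = 4 bytes for the C path:

#include <stdint.h>
#include <string.h>

// Stand-ins for the SIMD and C row copies; memcpy is enough to show the split.
static void CopyRowSimd_sketch(const uint8_t* src, uint8_t* dst, int count) {
  memcpy(dst, src, count);  // a real SSE2 kernel moves 32 bytes per iteration
}
static void CopyRowC_sketch(const uint8_t* src, uint8_t* dst, int count) {
  memcpy(dst, src, count);
}

// Roughly what MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31) expands to.
static void CopyRowAny_sketch(const uint8_t* src, uint8_t* dst, int width) {
  int n = width & ~31;  // e.g. width 100 -> 96 bytes for the SIMD kernel
  int r = width & 31;   //                    4 bytes left for the C kernel
  if (n > 0) {
    CopyRowSimd_sketch(src, dst, n);
  }
  CopyRowC_sketch(src + n, dst + n, r);  // BPP is 1 for CopyRow
}
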
@@ -534,11 +534,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     "paddusw %%xmm0,%%xmm1 \n"
     "pmulhuw %%xmm5,%%xmm1 \n"
     "packuswb %%xmm1,%%xmm1 \n"
-    "sub $0x6,%2 \n"
     "movd %%xmm1," MEMACCESS(1) " \n"
     "psrlq $0x10,%%xmm1 \n"
     "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
     "lea " MEMLEA(0x6,1) ",%1 \n"
+    "sub $0x6,%2 \n"
     "jg 1b \n"
   : "+r"(src_ptr), // %0
     "+r"(dst_ptr), // %1
@@ -602,11 +602,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     "paddusw %%xmm7,%%xmm6 \n"
     "pmulhuw %%xmm4,%%xmm6 \n"
     "packuswb %%xmm6,%%xmm6 \n"
-    "sub $0x6,%2 \n"
     "movd %%xmm6," MEMACCESS(1) " \n"
     "psrlq $0x10,%%xmm6 \n"
     "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
     "lea " MEMLEA(0x6,1) ",%1 \n"
+    "sub $0x6,%2 \n"
     "jg 1b \n"
   : "+r"(src_ptr), // %0
     "+r"(dst_ptr), // %1
@@ -765,10 +765,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     "movdqa %%xmm0,%%xmm1 \n"
     "punpcklbw %%xmm0,%%xmm0 \n"
     "punpckhbw %%xmm1,%%xmm1 \n"
-    "sub $0x20,%2 \n"
     "movdqu %%xmm0," MEMACCESS(0) " \n"
     "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
     "lea " MEMLEA(0x20,0) ",%0 \n"
+    "sub $0x20,%2 \n"
     "jg 1b \n"
   : "+r"(dst_ptr), // %0
@@ -792,9 +792,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
     "lea " MEMLEA(0x20,0) ",%0 \n"
     "shufps $0xdd,%%xmm1,%%xmm0 \n"
-    "sub $0x4,%2 \n"
     "movdqu %%xmm0," MEMACCESS(1) " \n"
     "lea " MEMLEA(0x10,1) ",%1 \n"
+    "sub $0x4,%2 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_argb), // %1
@@ -820,9 +820,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
     "shufps $0x88,%%xmm1,%%xmm0 \n"
     "shufps $0xdd,%%xmm1,%%xmm2 \n"
     "pavgb %%xmm2,%%xmm0 \n"
-    "sub $0x4,%2 \n"
     "movdqu %%xmm0," MEMACCESS(1) " \n"
     "lea " MEMLEA(0x10,1) ",%1 \n"
+    "sub $0x4,%2 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_argb), // %1
@@ -852,9 +852,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     "shufps $0x88,%%xmm1,%%xmm0 \n"
     "shufps $0xdd,%%xmm1,%%xmm2 \n"
     "pavgb %%xmm2,%%xmm0 \n"
-    "sub $0x4,%2 \n"
     "movdqu %%xmm0," MEMACCESS(1) " \n"
     "lea " MEMLEA(0x10,1) ",%1 \n"
+    "sub $0x4,%2 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_argb), // %1
@@ -890,9 +890,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
     "punpckldq %%xmm3,%%xmm2 \n"
     "punpcklqdq %%xmm2,%%xmm0 \n"
-    "sub $0x4,%3 \n"
     "movdqu %%xmm0," MEMACCESS(2) " \n"
     "lea " MEMLEA(0x10,2) ",%2 \n"
+    "sub $0x4,%3 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(src_stepx_x4), // %1
@@ -941,9 +941,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     "shufps $0x88,%%xmm1,%%xmm0 \n"
     "shufps $0xdd,%%xmm1,%%xmm2 \n"
     "pavgb %%xmm2,%%xmm0 \n"
-    "sub $0x4,%3 \n"
     "movdqu %%xmm0," MEMACCESS(2) " \n"
     "lea " MEMLEA(0x10,2) ",%2 \n"
+    "sub $0x4,%3 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(src_stepx_x4), // %1
@@ -997,9 +997,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     "pextrw $0x3,%%xmm2,%k1 \n"
     "punpckldq %%xmm4,%%xmm1 \n"
     "punpcklqdq %%xmm1,%%xmm0 \n"
-    "sub $0x4,%4 \n"
     "movdqu %%xmm0," MEMACCESS(2) " \n"
     "lea " MEMLEA(0x10,2) ",%2 \n"
+    "sub $0x4,%4 \n"
     "jge 40b \n"
     "49: \n"
@@ -1046,10 +1046,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
     "movdqa %%xmm0,%%xmm1 \n"
     "punpckldq %%xmm0,%%xmm0 \n"
     "punpckhdq %%xmm1,%%xmm1 \n"
-    "sub $0x8,%2 \n"
     "movdqu %%xmm0," MEMACCESS(0) " \n"
     "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
     "lea " MEMLEA(0x20,0) ",%0 \n"
+    "sub $0x8,%2 \n"
     "jg 1b \n"
   : "+r"(dst_argb), // %0
...
@@ -111,9 +111,9 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     psrlw xmm0, 8 // isolate odd pixels.
     psrlw xmm1, 8
     packuswb xmm0, xmm1
-    sub ecx, 16
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop
     ret
@@ -149,9 +149,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pavgw xmm1, xmm3
     packuswb xmm0, xmm1
-    sub ecx, 16
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop
     ret
@@ -192,9 +192,9 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pavgw xmm1, xmm3
     packuswb xmm0, xmm1
-    sub ecx, 16
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop
     pop esi
@@ -226,9 +226,9 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     packuswb xmm0, xmm1
     psrlw xmm0, 8
     packuswb xmm0, xmm0
-    sub ecx, 8
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 8
     jg wloop
     ret
@@ -285,9 +285,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pavgw xmm0, xmm2
     packuswb xmm0, xmm0
-    sub ecx, 8
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 8
     jg wloop
     pop edi
@@ -398,9 +398,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    sub ecx, 24
     movq qword ptr [edx + 16], xmm0
     lea edx, [edx + 24]
+    sub ecx, 24
     jg wloop
     pop esi
@@ -460,9 +460,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    sub ecx, 24
     movq qword ptr [edx + 16], xmm0
     lea edx, [edx+24]
+    sub ecx, 24
     jg wloop
     pop esi
@@ -493,11 +493,11 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     pshufb xmm1, xmm5
     paddusb xmm0, xmm1
-    sub ecx, 12
     movq qword ptr [edx], xmm0 // write 12 pixels
     movhlps xmm1, xmm0
     movd [edx + 8], xmm1
     lea edx, [edx + 12]
+    sub ecx, 12
     jg xloop
     ret
@@ -558,11 +558,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
     packuswb xmm6, xmm6
-    sub ecx, 6
     movd [edx], xmm6 // write 6 pixels
     psrlq xmm6, 16
     movd [edx + 2], xmm6
     lea edx, [edx + 6]
+    sub ecx, 6
     jg xloop
     pop esi
@@ -604,11 +604,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
     packuswb xmm1, xmm1
-    sub ecx, 6
     movd [edx], xmm1 // write 6 pixels
     psrlq xmm1, 16
     movd [edx + 2], xmm1
     lea edx, [edx + 6]
+    sub ecx, 6
     jg xloop
     pop esi
@@ -784,10 +784,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     movdqa xmm1, xmm0
     punpcklbw xmm0, xmm0
     punpckhbw xmm1, xmm1
-    sub ecx, 32
     movdqu [edx], xmm0
     movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
+    sub ecx, 32
     jg wloop
     ret
@@ -812,9 +812,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     shufps xmm0, xmm1, 0xdd
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
     ret
@@ -842,9 +842,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
     ret
@@ -877,9 +877,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
     pop esi
@@ -914,9 +914,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     lea eax, [eax + ebx * 4]
     punpckldq xmm2, xmm3
     punpcklqdq xmm0, xmm2
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
     pop edi
@@ -963,9 +963,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
     pop edi
@@ -1021,9 +1021,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     pextrw edx, xmm2, 3 // get x1 integer. next iteration.
     punpckldq xmm1, xmm4 // x2 x3
     punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
-    sub ecx, 4 // 4 pixels
     movdqu [edi], xmm0
     lea edi, [edi + 16]
+    sub ecx, 4 // 4 pixels
     jge xloop4
     align 4
@@ -1160,10 +1160,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
     movdqa xmm1, xmm0
     punpckldq xmm0, xmm0
     punpckhdq xmm1, xmm1
-    sub ecx, 8
     movdqu [edx], xmm0
     movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
+    sub ecx, 8
     jg wloop
     ret
...