Commit 9b4c00b9 authored by fbarchard@google.com

Move vzeroupper to row functions to simplify caller and allow mix of avx2 and…

Move vzeroupper to row functions to simplify caller and allow mix of avx2 and sse2.  Impact reduced by row coalescing.
BUG=none
TEST=all tests pass with sde
Review URL: https://webrtc-codereview.appspot.com/1269009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@641 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 91c50c3a
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 639 Version: 641
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 639 #define LIBYUV_VERSION 641
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -103,9 +103,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, ...@@ -103,9 +103,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
} }
#endif #endif
#if defined(HAS_SUMSQUAREERROR_AVX2) #if defined(HAS_SUMSQUAREERROR_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
clear = true;
// Note only used for multiples of 32 so count is not checked. // Note only used for multiples of 32 so count is not checked.
SumSquareError = SumSquareError_AVX2; SumSquareError = SumSquareError_AVX2;
} }
...@@ -133,12 +131,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, ...@@ -133,12 +131,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
if (remainder) { if (remainder) {
sse += SumSquareError_C(src_a, src_b, remainder); sse += SumSquareError_C(src_a, src_b, remainder);
} }
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return sse; return sse;
} }
...@@ -164,9 +156,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, ...@@ -164,9 +156,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
} }
#endif #endif
#if defined(HAS_SUMSQUAREERROR_AVX2) #if defined(HAS_SUMSQUAREERROR_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
SumSquareError = SumSquareError_AVX2; SumSquareError = SumSquareError_AVX2;
} }
#endif #endif
...@@ -176,12 +166,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, ...@@ -176,12 +166,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
src_a += stride_a; src_a += stride_a;
src_b += stride_b; src_b += stride_b;
} }
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return sse; return sse;
} }
......
...@@ -94,6 +94,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { ...@@ -94,6 +94,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpermq ymm1, ymm0, 0x02 // high + low lane. vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm0, ymm0, ymm1 vpaddd ymm0, ymm0, ymm1
vmovd eax, xmm0 vmovd eax, xmm0
vzeroupper
ret ret
} }
} }
......
...@@ -99,9 +99,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y, ...@@ -99,9 +99,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_HALFROW_AVX2) #if defined(HAS_HALFROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
clear = true;
HalfRow = HalfRow_AVX2; HalfRow = HalfRow_AVX2;
} }
#endif #endif
...@@ -136,11 +134,6 @@ int I422ToI420(const uint8* src_y, int src_stride_y, ...@@ -136,11 +134,6 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
if (height & 1) { if (height & 1) {
HalfRow(src_v, 0, dst_v, halfwidth); HalfRow(src_v, 0, dst_v, halfwidth);
} }
#if defined(HAS_HALFROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -583,9 +576,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, ...@@ -583,9 +576,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_AVX2) #if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
...@@ -623,11 +614,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, ...@@ -623,11 +614,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow(src_y, dst_y, width); CopyRow(src_y, dst_y, width);
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
} }
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -667,9 +653,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -667,9 +653,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_AVX2) #if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
...@@ -704,12 +688,6 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -704,12 +688,6 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width); YUY2ToYRow(src_yuy2, dst_y, width);
} }
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -749,9 +727,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -749,9 +727,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
} }
#endif #endif
#if defined(HAS_UYVYTOYROW_AVX2) #if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
UYVYToUVRow = UYVYToUVRow_Any_AVX2; UYVYToUVRow = UYVYToUVRow_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2; UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
...@@ -786,12 +762,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -786,12 +762,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width); UYVYToYRow(src_uyvy, dst_y, width);
} }
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -834,9 +804,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, ...@@ -834,9 +804,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
...@@ -873,12 +841,6 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, ...@@ -873,12 +841,6 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb, dst_y, width);
} }
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
......
...@@ -483,9 +483,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, ...@@ -483,9 +483,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_MERGEUVROW_AVX2) #if defined(HAS_MERGEUVROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
clear = true;
MergeUVRow_ = MergeUVRow_Any_AVX2; MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) { if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2; MergeUVRow_ = MergeUVRow_AVX2;
...@@ -509,12 +507,6 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, ...@@ -509,12 +507,6 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
src_v += src_stride_v; src_v += src_stride_v;
dst_uv += dst_stride_uv; dst_uv += dst_stride_uv;
} }
#if defined(HAS_MERGEUVROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
......
...@@ -218,9 +218,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, ...@@ -218,9 +218,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2;
...@@ -250,11 +248,6 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, ...@@ -250,11 +248,6 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -690,9 +683,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, ...@@ -690,9 +683,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2;
...@@ -713,11 +704,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, ...@@ -713,11 +704,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_y += dst_stride_y; dst_y += dst_stride_y;
} }
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
......
...@@ -197,9 +197,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, ...@@ -197,9 +197,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_MIRRORROW_AVX2) #if defined(HAS_MIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
MirrorRow = MirrorRow_AVX2; MirrorRow = MirrorRow_AVX2;
} }
#endif #endif
...@@ -210,11 +208,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, ...@@ -210,11 +208,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
src_y += src_stride_y; src_y += src_stride_y;
dst_y += dst_stride_y; dst_y += dst_stride_y;
} }
#if defined(HAS_MIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
} }
// Convert YUY2 to I422. // Convert YUY2 to I422.
...@@ -264,9 +257,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -264,9 +257,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_AVX2) #if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
...@@ -296,12 +287,6 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -296,12 +287,6 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -352,9 +337,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -352,9 +337,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
} }
#endif #endif
#if defined(HAS_UYVYTOYROW_AVX2) #if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2; UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
...@@ -384,12 +367,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -384,12 +367,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -473,9 +450,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, ...@@ -473,9 +450,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBMIRRORROW_AVX2) #if defined(HAS_ARGBMIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
clear = true;
ARGBMirrorRow = ARGBMirrorRow_AVX2; ARGBMirrorRow = ARGBMirrorRow_AVX2;
} }
#endif #endif
...@@ -491,12 +466,6 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, ...@@ -491,12 +466,6 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -601,9 +570,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, ...@@ -601,9 +570,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
} }
#endif #endif
#if defined(HAS_ARGBMULTIPLYROW_AVX2) #if defined(HAS_ARGBMULTIPLYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_AVX2; ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
...@@ -626,12 +593,6 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, ...@@ -626,12 +593,6 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -674,9 +635,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, ...@@ -674,9 +635,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
} }
#endif #endif
#if defined(HAS_ARGBADDROW_AVX2) #if defined(HAS_ARGBADDROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAddRow = ARGBAddRow_Any_AVX2; ARGBAddRow = ARGBAddRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_AVX2; ARGBAddRow = ARGBAddRow_AVX2;
...@@ -699,12 +658,6 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, ...@@ -699,12 +658,6 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBADDROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -747,9 +700,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, ...@@ -747,9 +700,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
} }
#endif #endif
#if defined(HAS_ARGBSUBTRACTROW_AVX2) #if defined(HAS_ARGBSUBTRACTROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_AVX2; ARGBSubtractRow = ARGBSubtractRow_AVX2;
...@@ -772,12 +723,6 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, ...@@ -772,12 +723,6 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -1223,9 +1168,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1223,9 +1168,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBATTENUATEROW_AVX2) #if defined(HAS_ARGBATTENUATEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2; ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
...@@ -1246,13 +1189,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1246,13 +1189,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBATTENUATEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -1289,9 +1225,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1289,9 +1225,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBUNATTENUATEROW_AVX2) #if defined(HAS_ARGBUNATTENUATEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
...@@ -1305,13 +1239,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1305,13 +1239,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -1791,9 +1718,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, ...@@ -1791,9 +1718,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
} }
#endif #endif
#if defined(HAS_ARGBSHUFFLEROW_AVX2) #if defined(HAS_ARGBSHUFFLEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
clear = true;
ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBShuffleRow = ARGBShuffleRow_AVX2; ARGBShuffleRow = ARGBShuffleRow_AVX2;
...@@ -1814,11 +1739,6 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, ...@@ -1814,11 +1739,6 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_bgra += src_stride_bgra; src_bgra += src_stride_bgra;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
......
...@@ -882,9 +882,7 @@ void RotatePlane180(const uint8* src, int src_stride, ...@@ -882,9 +882,7 @@ void RotatePlane180(const uint8* src, int src_stride,
} }
#endif #endif
#if defined(HAS_MIRRORROW_AVX2) #if defined(HAS_MIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
MirrorRow = MirrorRow_AVX2; MirrorRow = MirrorRow_AVX2;
} }
#endif #endif
...@@ -942,11 +940,6 @@ void RotatePlane180(const uint8* src, int src_stride, ...@@ -942,11 +940,6 @@ void RotatePlane180(const uint8* src, int src_stride,
src_bot -= src_stride; src_bot -= src_stride;
dst_bot -= dst_stride; dst_bot -= dst_stride;
} }
#if defined(HAS_MIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
} }
static void TransposeUVWx8_C(const uint8* src, int src_stride, static void TransposeUVWx8_C(const uint8* src, int src_stride,
......
...@@ -101,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride, ...@@ -101,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
} }
#endif #endif
#if defined(HAS_ARGBMIRRORROW_AVX2) #if defined(HAS_ARGBMIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
clear = true;
ARGBMirrorRow = ARGBMirrorRow_AVX2; ARGBMirrorRow = ARGBMirrorRow_AVX2;
} }
#endif #endif
...@@ -159,11 +157,6 @@ void ARGBRotate180(const uint8* src, int src_stride, ...@@ -159,11 +157,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
src_bot -= src_stride; src_bot -= src_stride;
dst_bot -= dst_stride; dst_bot -= dst_stride;
} }
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
} }
LIBYUV_API LIBYUV_API
......
...@@ -763,6 +763,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -763,6 +763,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
...@@ -1277,6 +1278,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, ...@@ -1277,6 +1278,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
pop edi pop edi
pop esi pop esi
vzeroupper
ret ret
} }
} }
...@@ -3133,6 +3135,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -3133,6 +3135,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
...@@ -3254,6 +3257,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -3254,6 +3257,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
...@@ -3367,6 +3371,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -3367,6 +3371,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
...@@ -3462,6 +3467,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -3462,6 +3467,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
...@@ -3599,6 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, ...@@ -3599,6 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
...@@ -3643,6 +3650,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, ...@@ -3643,6 +3650,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
pop edi pop edi
pop esi pop esi
vzeroupper
ret ret
} }
} }
...@@ -3682,6 +3690,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, ...@@ -3682,6 +3690,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
...@@ -3708,6 +3717,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, ...@@ -3708,6 +3717,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
ret ret
vzeroupper
} }
} }
...@@ -3751,6 +3761,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, ...@@ -3751,6 +3761,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
pop edi pop edi
pop esi pop esi
vzeroupper
ret ret
} }
} }
...@@ -3790,6 +3801,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, ...@@ -3790,6 +3801,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
...@@ -4633,6 +4645,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -4633,6 +4645,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
lea eax, [eax + 32] lea eax, [eax + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
...@@ -4727,6 +4740,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -4727,6 +4740,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 32] lea eax, [eax + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
...@@ -4790,6 +4804,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -4790,6 +4804,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
pop edi pop edi
pop esi pop esi
vzeroupper
ret ret
} }
} }
...@@ -5240,6 +5255,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -5240,6 +5255,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop jg convertloop
pop esi pop esi
vzeroupper
ret ret
} }
} }
...@@ -5270,6 +5286,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -5270,6 +5286,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop jg convertloop
pop esi pop esi
vzeroupper
ret ret
} }
} }
...@@ -5299,6 +5316,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -5299,6 +5316,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop jg convertloop
pop esi pop esi
vzeroupper
ret ret
} }
} }
...@@ -6053,7 +6071,9 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, ...@@ -6053,7 +6071,9 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
vmovdqu [eax + edi], ymm0 vmovdqu [eax + edi], ymm0
lea eax, [eax + 32] lea eax, [eax + 32]
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
...@@ -6162,6 +6182,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -6162,6 +6182,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vmovdqu [edx + 32], ymm1 vmovdqu [edx + 32], ymm1
lea edx, [edx + 64] lea edx, [edx + 64]
jg wloop jg wloop
vzeroupper
ret ret
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment