Commit 9b4c00b9 authored by fbarchard@google.com

Move vzeroupper to row functions to simplify caller and allow mix of avx2 and sse2.  Impact reduced by row coalescing.
BUG=none
TEST=all tests pass with sde
Review URL: https://webrtc-codereview.appspot.com/1269009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@641 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 91c50c3a
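
For readers skimming the hunks below: the change replaces a caller-side cleanup pattern with a row-function-side one. The condensed sketch that follows is illustrative only; it paraphrases the ComputeSumSquareError hunks further down and reuses the helpers visible in the diff (TestCpuFlag, SumSquareError_AVX2, the MSVC __asm block). It is not verbatim libyuv source.

// Before this commit: each caller remembered whether an AVX2 row function was
// selected and issued vzeroupper itself once the row loop finished.
#if defined(HAS_SUMSQUAREERROR_AVX2)
  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2)) {
    clear = true;
    SumSquareError = SumSquareError_AVX2;
  }
#endif
  // ... per-row loop calling SumSquareError(...) ...
#if defined(HAS_SUMSQUAREERROR_AVX2)
  if (clear) {
    __asm vzeroupper;  // leave the dirty upper-YMM state before SSE2 code runs
  }
#endif

// After this commit: the AVX2 row function ends with vzeroupper itself, so the
// caller is reduced to selecting the function pointer and running the loop.
#if defined(HAS_SUMSQUAREERROR_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    SumSquareError = SumSquareError_AVX2;
  }
#endif
  // ... per-row loop; SumSquareError_AVX2 now ends with vzeroupper / ret ...

The "Impact reduced by row coalescing" note most likely refers to libyuv's practice of treating a plane whose stride equals its width as one long row, so the extra per-call vzeroupper is paid far fewer times than once per image row.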
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 639
Version: 641
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 639
#define LIBYUV_VERSION 641
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -103,9 +103,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2)) {
clear = true;
// Note only used for multiples of 32 so count is not checked.
SumSquareError = SumSquareError_AVX2;
}
@@ -133,12 +131,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
if (remainder) {
sse += SumSquareError_C(src_a, src_b, remainder);
}
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return sse;
}
@@ -164,9 +156,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
}
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
SumSquareError = SumSquareError_AVX2;
}
#endif
@@ -176,12 +166,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
src_a += stride_a;
src_b += stride_b;
}
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return sse;
}
@@ -94,6 +94,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm0, ymm0, ymm1
vmovd eax, xmm0
vzeroupper
ret
}
}
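
The vzeroupper added above, and in every other AVX2 row function in this change, is what makes the "allow mix of avx2 and sse2" part of the commit message safe: because each AVX2 kernel clears the upper YMM state before returning, an SSE2/SSSE3 kernel called next in the same row loop no longer pays the AVX-to-SSE transition penalty, and the caller does not need to know which ISA each kernel uses. Below is a hypothetical mixed dispatch, sketched with the libyuv row-function signatures seen in this diff; the SSSE3 fallbacks and the simplified width checks are assumptions, not part of this commit.

// Pick the best available kernel for each row operation independently.
void (*YRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C;
void (*UVRow)(const uint8* src_argb, int src_stride_argb,
              uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
  YRow = ARGBToYRow_SSSE3;
  UVRow = ARGBToUVRow_SSSE3;
}
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
  YRow = ARGBToYRow_AVX2;  // ends in vzeroupper, so mixing it with the
}                          // SSSE3 UV kernel below carries no penalty.
for (int y = 0; y < height - 1; y += 2) {
  UVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
  YRow(src_argb, dst_y, width);
  YRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
  src_argb += src_stride_argb * 2;
  dst_y += dst_stride_y * 2;
  dst_u += dst_stride_u;
  dst_v += dst_stride_v;
}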
@@ -99,9 +99,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_HALFROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
clear = true;
HalfRow = HalfRow_AVX2;
}
#endif
@@ -136,11 +134,6 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
if (height & 1) {
HalfRow(src_v, 0, dst_v, halfwidth);
}
#if defined(HAS_HALFROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -583,9 +576,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -623,11 +614,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow(src_y, dst_y, width);
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
}
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -667,9 +653,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -704,12 +688,6 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
}
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -749,9 +727,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
UYVYToUVRow = UYVYToUVRow_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -786,12 +762,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
}
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -834,9 +804,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -873,12 +841,6 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -483,9 +483,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
clear = true;
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@@ -509,12 +507,6 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
src_v += src_stride_v;
dst_uv += dst_stride_uv;
}
#if defined(HAS_MERGEUVROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -218,9 +218,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@@ -250,11 +248,6 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -690,9 +683,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@@ -713,11 +704,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_y += dst_stride_y;
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -197,9 +197,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
MirrorRow = MirrorRow_AVX2;
}
#endif
@@ -210,11 +208,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
src_y += src_stride_y;
dst_y += dst_stride_y;
}
#if defined(HAS_MIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
}
// Convert YUY2 to I422.
@@ -264,9 +257,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -296,12 +287,6 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -352,9 +337,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -384,12 +367,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -473,9 +450,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
clear = true;
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
#endif
@@ -491,12 +466,6 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -601,9 +570,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@@ -626,12 +593,6 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -674,9 +635,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAddRow = ARGBAddRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_AVX2;
@@ -699,12 +658,6 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBADDROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -747,9 +700,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_AVX2;
@@ -772,12 +723,6 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -1223,9 +1168,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@@ -1246,13 +1189,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBATTENUATEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -1289,9 +1225,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@@ -1305,13 +1239,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -1791,9 +1718,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
clear = true;
ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGBShuffleRow = ARGBShuffleRow_AVX2;
@@ -1814,11 +1739,6 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_bgra += src_stride_bgra;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -882,9 +882,7 @@ void RotatePlane180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
MirrorRow = MirrorRow_AVX2;
}
#endif
@@ -942,11 +940,6 @@ void RotatePlane180(const uint8* src, int src_stride,
src_bot -= src_stride;
dst_bot -= dst_stride;
}
#if defined(HAS_MIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
}
static void TransposeUVWx8_C(const uint8* src, int src_stride,
@@ -101,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
clear = true;
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
#endif
@@ -159,11 +157,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
src_bot -= src_stride;
dst_bot -= dst_stride;
}
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
}
LIBYUV_API
@@ -763,6 +763,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -1277,6 +1278,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
pop edi
pop esi
vzeroupper
ret
}
}
@@ -3133,6 +3135,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -3254,6 +3257,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -3367,6 +3371,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -3462,6 +3467,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -3599,6 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -3643,6 +3650,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
pop edi
pop esi
vzeroupper
ret
}
}
@@ -3682,6 +3690,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -3708,6 +3717,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -3751,6 +3761,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
pop edi
pop esi
vzeroupper
ret
}
}
@@ -3790,6 +3801,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -4633,6 +4645,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
lea eax, [eax + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -4727,6 +4740,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -4790,6 +4804,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
pop edi
pop esi
vzeroupper
ret
}
}
@@ -5240,6 +5255,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop
pop esi
vzeroupper
ret
}
}
@@ -5270,6 +5286,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop
pop esi
vzeroupper
ret
}
}
@@ -5299,6 +5316,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop
pop esi
vzeroupper
ret
}
}
@@ -6053,7 +6071,9 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
vmovdqu [eax + edi], ymm0
lea eax, [eax + 32]
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -6162,6 +6182,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
jg wloop
vzeroupper
ret
}
}