Commit 9b4c00b9 authored by fbarchard@google.com

Move vzeroupper to row functions to simplify caller and allow mix of avx2 and sse2.  Impact reduced by row coalescing.
BUG=none
TEST=all tests pass with sde
Review URL: https://webrtc-codereview.appspot.com/1269009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@641 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 91c50c3a
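
For readers skimming the hunks below: the change replaces a caller-side cleanup pattern with a row-function-side one. The condensed sketch that follows is illustrative only; it paraphrases the ComputeSumSquareError hunks further down and reuses the helpers visible in the diff (TestCpuFlag, SumSquareError_AVX2, the MSVC __asm block). It is not verbatim libyuv source.

// Before this commit: each caller remembered whether an AVX2 row function was
// selected and issued vzeroupper itself once the row loop finished.
#if defined(HAS_SUMSQUAREERROR_AVX2)
  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2)) {
    clear = true;
    SumSquareError = SumSquareError_AVX2;
  }
#endif
  // ... per-row loop calling SumSquareError(...) ...
#if defined(HAS_SUMSQUAREERROR_AVX2)
  if (clear) {
    __asm vzeroupper;  // leave the dirty upper-YMM state before SSE2 code runs
  }
#endif

// After this commit: the AVX2 row function ends with vzeroupper itself, so the
// caller is reduced to selecting the function pointer and running the loop.
#if defined(HAS_SUMSQUAREERROR_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    SumSquareError = SumSquareError_AVX2;
  }
#endif
  // ... per-row loop; SumSquareError_AVX2 now ends with vzeroupper / ret ...

The "Impact reduced by row coalescing" note most likely refers to libyuv's practice of treating a plane whose stride equals its width as one long row, so the extra per-call vzeroupper is paid far fewer times than once per image row.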
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 639
Version: 641
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 639
#define LIBYUV_VERSION 641
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -103,9 +103,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2)) {
clear = true;
// Note only used for multiples of 32 so count is not checked.
SumSquareError = SumSquareError_AVX2;
}
@@ -133,12 +131,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
if (remainder) {
sse += SumSquareError_C(src_a, src_b, remainder);
}
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return sse;
}
@@ -164,9 +156,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
}
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
SumSquareError = SumSquareError_AVX2;
}
#endif
@@ -176,12 +166,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
src_a += stride_a;
src_b += stride_b;
}
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return sse;
}
@@ -94,6 +94,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm0, ymm0, ymm1
vmovd eax, xmm0
vzeroupper
ret
}
}
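
The vzeroupper added above, and in every other AVX2 row function in this change, is what makes the "allow mix of avx2 and sse2" part of the commit message safe: because each AVX2 kernel clears the upper YMM state before returning, an SSE2/SSSE3 kernel called next in the same row loop no longer pays the AVX-to-SSE transition penalty, and the caller does not need to know which ISA each kernel uses. Below is a hypothetical mixed dispatch, sketched with the libyuv row-function signatures seen in this diff; the SSSE3 fallbacks and the simplified width checks are assumptions, not part of this commit.

// Pick the best available kernel for each row operation independently.
void (*YRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C;
void (*UVRow)(const uint8* src_argb, int src_stride_argb,
              uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
  YRow = ARGBToYRow_SSSE3;
  UVRow = ARGBToUVRow_SSSE3;
}
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
  YRow = ARGBToYRow_AVX2;  // ends in vzeroupper, so mixing it with the
}                          // SSSE3 UV kernel below carries no penalty.
for (int y = 0; y < height - 1; y += 2) {
  UVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
  YRow(src_argb, dst_y, width);
  YRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
  src_argb += src_stride_argb * 2;
  dst_y += dst_stride_y * 2;
  dst_u += dst_stride_u;
  dst_v += dst_stride_v;
}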
@@ -99,9 +99,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_HALFROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
clear = true;
HalfRow = HalfRow_AVX2;
}
#endif
@@ -136,11 +134,6 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
if (height & 1) {
HalfRow(src_v, 0, dst_v, halfwidth);
}
#if defined(HAS_HALFROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -583,9 +576,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -623,11 +614,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow(src_y, dst_y, width);
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
}
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -667,9 +653,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -704,12 +688,6 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
}
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -749,9 +727,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
UYVYToUVRow = UYVYToUVRow_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -786,12 +762,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
}
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -834,9 +804,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -873,12 +841,6 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -483,9 +483,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
clear = true;
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@@ -509,12 +507,6 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
src_v += src_stride_v;
dst_uv += dst_stride_uv;
}
#if defined(HAS_MERGEUVROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -218,9 +218,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@@ -250,11 +248,6 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -690,9 +683,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@@ -713,11 +704,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_y += dst_stride_y;
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -197,9 +197,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
MirrorRow = MirrorRow_AVX2;
}
#endif
@@ -210,11 +208,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
src_y += src_stride_y;
dst_y += dst_stride_y;
}
#if defined(HAS_MIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
}
// Convert YUY2 to I422.
@@ -264,9 +257,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -296,12 +287,6 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -352,9 +337,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -384,12 +367,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -473,9 +450,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
clear = true;
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
#endif
@@ -491,12 +466,6 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -601,9 +570,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@@ -626,12 +593,6 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -674,9 +635,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAddRow = ARGBAddRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_AVX2;
@@ -699,12 +658,6 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBADDROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -747,9 +700,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_AVX2;
@@ -772,12 +723,6 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -1223,9 +1168,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@@ -1246,13 +1189,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBATTENUATEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -1289,9 +1225,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@@ -1305,13 +1239,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -1791,9 +1718,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
clear = true;
ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGBShuffleRow = ARGBShuffleRow_AVX2;
@@ -1814,11 +1739,6 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_bgra += src_stride_bgra;
dst_argb += dst_stride_argb;
}
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -882,9 +882,7 @@ void RotatePlane180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
MirrorRow = MirrorRow_AVX2;
}
#endif
@@ -942,11 +940,6 @@ void RotatePlane180(const uint8* src, int src_stride,
src_bot -= src_stride;
dst_bot -= dst_stride;
}
#if defined(HAS_MIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
}
static void TransposeUVWx8_C(const uint8* src, int src_stride,
@@ -101,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
clear = true;
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
#endif
@@ -159,11 +157,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
src_bot -= src_stride;
dst_bot -= dst_stride;
}
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
}
LIBYUV_API
@@ -763,6 +763,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -1277,6 +1278,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
pop edi
pop esi
vzeroupper
ret
}
}
@@ -3133,6 +3135,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -3254,6 +3257,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -3367,6 +3371,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -3462,6 +3467,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -3599,6 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -3643,6 +3650,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
pop edi
pop esi
vzeroupper
ret
}
}
@@ -3682,6 +3690,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -3708,6 +3717,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
lea edx, [edx + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -3751,6 +3761,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
pop edi
pop esi
vzeroupper
ret
}
}
@@ -3790,6 +3801,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -4633,6 +4645,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
lea eax, [eax + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -4727,6 +4740,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 32]
jg convertloop
vzeroupper
ret
}
}
@@ -4790,6 +4804,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
pop edi
pop esi
vzeroupper
ret
}
}
@@ -5240,6 +5255,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop
pop esi
vzeroupper
ret
}
}
@@ -5270,6 +5286,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop
pop esi
vzeroupper
ret
}
}
@@ -5299,6 +5316,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop
pop esi
vzeroupper
ret
}
}
@@ -6053,7 +6071,9 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
vmovdqu [eax + edi], ymm0
lea eax, [eax + 32]
jg convertloop
pop edi
vzeroupper
ret
}
}
@@ -6162,6 +6182,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
jg wloop
vzeroupper
ret
}
}