Commit 6bce6ee3 authored by Ilya Lavrenov

checks

parent 1d3c8604
@@ -64,7 +64,7 @@ FUNCTOR_TEMPLATE(VLoadStore128);
 #if CV_SSE2
 FUNCTOR_TEMPLATE(VLoadStore64);
 FUNCTOR_TEMPLATE(VLoadStore128Aligned);
-#if CV_AVX
+#if CV_AVX2
 FUNCTOR_TEMPLATE(VLoadStore256);
 FUNCTOR_TEMPLATE(VLoadStore256Aligned);
 #endif
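Note on this hunk: the 256-bit load/store functor declarations were guarded with CV_AVX, but the kernels that consume them rely on 256-bit integer instructions, which are AVX2 rather than AVX, so the guard is tightened to match. A minimal sketch of why the distinction matters (the add_u8_32 helper is illustrative, not from the commit):

```cpp
#include <immintrin.h>

#if CV_AVX2
// 256-bit loads/stores (_mm256_loadu_si256) are plain AVX, but the integer
// arithmetic (_mm256_add_epi8) requires AVX2, so the whole block has to sit
// under an AVX2 guard, matching the fix above.
static void add_u8_32(const unsigned char* a, const unsigned char* b,
                      unsigned char* dst)
{
    __m256i va = _mm256_loadu_si256((const __m256i*)a);
    __m256i vb = _mm256_loadu_si256((const __m256i*)b);
    _mm256_storeu_si256((__m256i*)dst, _mm256_add_epi8(va, vb));
}
#endif
```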
@@ -2626,10 +2626,16 @@ struct Div_SIMD
 template <>
 struct Div_SIMD<uchar>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2672,10 +2678,16 @@ struct Div_SIMD<uchar>
 template <>
 struct Div_SIMD<schar>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2718,10 +2730,16 @@ struct Div_SIMD<schar>
 template <>
 struct Div_SIMD<ushort>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2763,10 +2781,16 @@ struct Div_SIMD<ushort>
 template <>
 struct Div_SIMD<short>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2806,10 +2830,16 @@ struct Div_SIMD<short>
 template <>
 struct Div_SIMD<int>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2902,10 +2932,16 @@ struct Recip_SIMD
 template <>
 struct Recip_SIMD<uchar>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
     int operator() (const uchar * src2, uchar * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2941,10 +2977,16 @@ struct Recip_SIMD<uchar>
 template <>
 struct Recip_SIMD<schar>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const schar * src2, schar * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2980,10 +3022,16 @@ struct Recip_SIMD<schar>
 template <>
 struct Recip_SIMD<ushort>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
     int operator() (const ushort * src2, ushort * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -3018,10 +3066,16 @@ struct Recip_SIMD<ushort>
 template <>
 struct Recip_SIMD<short>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const short * src2, short * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -3054,10 +3108,16 @@ struct Recip_SIMD<short>
 template <>
 struct Recip_SIMD<int>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const int * src2, int * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
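Note on the Div_SIMD/Recip_SIMD hunks: the pattern is the same in every specialization. The CPU is probed once in the constructor, and operator() reports how many leading elements it handled, zero when the required instruction set is missing, so the caller's generic scalar loop finishes the row. A standalone sketch of the idiom (ScaleRow and cpuHasSSE2 are illustrative stand-ins; the commit uses OpenCV's checkHardwareSupport):

```cpp
#include <emmintrin.h>

// Stand-in for cv::checkHardwareSupport (GCC/Clang builtin, CPUID-backed).
static bool cpuHasSSE2() { return __builtin_cpu_supports("sse2"); }

// The functor reports how many leading elements it processed; the caller
// finishes [x, width) with plain scalar code, so a missing instruction set
// simply means "handled 0 elements" instead of an illegal-instruction trap.
struct ScaleRow
{
    bool haveSIMD;
    ScaleRow() { haveSIMD = cpuHasSSE2(); }

    int operator()(const float* src, float* dst, int width, float scale) const
    {
        int x = 0;
        if (!haveSIMD)
            return x;                  // scalar fallback does all the work
        __m128 v_scale = _mm_set1_ps(scale);
        for (; x <= width - 4; x += 4)
            _mm_storeu_ps(dst + x, _mm_mul_ps(_mm_loadu_ps(src + x), v_scale));
        return x;                      // first index not handled by SIMD
    }
};

// Caller side, mirroring the arithmetic kernels in this commit:
//     ScaleRow vop;
//     int x = vop(src, dst, width, scale);
//     for (; x < width; x++) dst[x] = src[x] * scale;
```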
@@ -4126,7 +4186,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
 {
     int x =0;
 #if CV_SSE2
-    if( USE_SSE2 ){
+    if( USE_SSE2 )
+    {
         __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
         __m128i c128 = _mm_set1_epi8 (-128);
         for( ; x <= size.width - 16; x += 16 )
@@ -4164,7 +4225,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
 {
     int x = 0;
 #if CV_SSE2
-    if( USE_SSE2 ){
+    if( USE_SSE2 )
+    {
         __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
         for( ; x <= size.width - 16; x += 16 )
         {
@@ -4254,7 +4316,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
 {
     int x =0;
 #if CV_SSE2
-    if( USE_SSE2){//
+    if( USE_SSE2)
+    {
         __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
         for( ; x <= size.width - 16; x += 16 )
         {
@@ -4293,7 +4356,6 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
         vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
     }
 #endif
-
     for( ; x < size.width; x++ ){
@@ -4308,7 +4370,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
 {
     int x = 0;
 #if CV_SSE2
-    if( USE_SSE2 ){
+    if( USE_SSE2 )
+    {
         __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
         for( ; x <= size.width - 16; x += 16 )
         {
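Note on the cmp8u/cmp16s hunks: only brace style changes here, but the context lines show the trick these blocks rely on. SSE2 provides only greater-than and equal compares, so the complementary predicates (CMP_LE, CMP_NE) are derived by XOR-ing the result with an all-ones mask (m128), and unsigned bytes are compared by flipping the sign bit first. A sketch of that idiom in isolation (cmp_u8 is an illustrative name):

```cpp
#include <emmintrin.h>

// CMP_GT keeps the raw compare (XOR with zero); CMP_LE inverts it
// (XOR with all ones). The 0x80 sign-bit flip turns the signed
// _mm_cmpgt_epi8 into an unsigned byte compare.
static inline __m128i cmp_u8(__m128i a, __m128i b, bool greater)
{
    __m128i flip = greater ? _mm_setzero_si128() : _mm_set1_epi8(-1);
    __m128i c128 = _mm_set1_epi8(-128);
    __m128i gt = _mm_cmpgt_epi8(_mm_xor_si128(a, c128),
                                _mm_xor_si128(b, c128));
    return _mm_xor_si128(gt, flip);   // 0xFF where a>b (or a<=b), else 0
}
```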
This diff is collapsed.
@@ -597,6 +597,8 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
             k = 0;
 
 #if CV_SSE2
+            if (USE_SSE2)
+            {
                 for ( ; k <= len - 4; k += 4)
                 {
                     __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
@@ -607,6 +609,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
                     _mm_storeu_ps(buf[0] + k, v_dst0);
                     _mm_storeu_ps(buf[1] + k, v_dst1);
                 }
+            }
 #endif
             for( ; k < len; k++ )
@@ -619,12 +622,15 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
             k = 0;
 
 #if CV_SSE2
+            if (USE_SSE2)
+            {
                 for ( ; k <= len - 4; k += 4)
                 {
                     __m128 v_src = _mm_loadu_ps(buf[0] + k);
                     _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
                     _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
                 }
+            }
 #endif
             for( ; k < len; k++ )
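Note on the phase() hunks (cartToPolar below receives the identical treatment): the SSE2 blocks convert the double input to float for the table-based angle computation, and they now run only when USE_SSE2, OpenCV's cached runtime flag, is set; the scalar loop after #endif picks up whatever SIMD did not process. The conversion idiom inside the guarded block, in isolation (cvt4_f64_to_f32 is an illustrative name):

```cpp
#include <emmintrin.h>

// Two _mm_cvtpd_ps results (each filling only the low 2 float lanes) are
// fused into one 4-float vector with _mm_movelh_ps,
// i.e. result[0..3] = (float)src[0..3].
static inline __m128 cvt4_f64_to_f32(const double* src)
{
    return _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src)),
                         _mm_cvtpd_ps(_mm_loadu_pd(src + 2)));
}
```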
@@ -728,6 +734,8 @@ void cartToPolar( InputArray src1, InputArray src2,
             k = 0;
 
 #if CV_SSE2
+            if (USE_SSE2)
+            {
                 for ( ; k <= len - 4; k += 4)
                 {
                     __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
@@ -738,6 +746,7 @@ void cartToPolar( InputArray src1, InputArray src2,
                     _mm_storeu_ps(buf[0] + k, v_dst0);
                     _mm_storeu_ps(buf[1] + k, v_dst1);
                 }
+            }
 #endif
             for( ; k < len; k++ )
@@ -750,12 +759,15 @@ void cartToPolar( InputArray src1, InputArray src2,
             k = 0;
 
 #if CV_SSE2
+            if (USE_SSE2)
+            {
                 for ( ; k <= len - 4; k += 4)
                 {
                     __m128 v_src = _mm_loadu_ps(buf[0] + k);
                     _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
                     _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
                 }
+            }
 #endif
             for( ; k < len; k++ )
@@ -832,7 +844,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
     k1 = N/360.;
 
 #if CV_AVX2
-    __m128d v_i = _mm_set_pd(1, 0);
+    if (USE_AVX2)
+    {
     __m128d v_k1 = _mm_set1_pd(k1);
     __m128d v_1 = _mm_set1_pd(1);
     __m128i v_N1 = _mm_set1_epi32(N - 1);
@@ -841,8 +854,6 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
     __m128d v_sin_a2 = _mm_set1_pd(sin_a2);
     __m128d v_cos_a0 = _mm_set1_pd(cos_a0);
 
-    if (USE_AVX2)
-    {
         for ( ; i <= len - 4; i += 4)
         {
             __m128 v_angle = _mm_loadu_ps(angle + i);
@@ -859,8 +870,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
             __m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
             __m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
 
-            __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1);
-            __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1);
+            __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
+            __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
 
             __m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
                                              _mm_mul_pd(v_cos_a, v_sin_b));
@@ -868,7 +879,7 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
                                              _mm_mul_pd(v_sin_a, v_sin_b));
 
             // 2-3
-            v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
+            v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
             v_it = _mm_cvtpd_epi32(v_t);
             v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
@@ -879,8 +890,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
             v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
             v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
 
-            v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1);
-            v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1);
+            v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
+            v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
 
             __m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
                                              _mm_mul_pd(v_cos_a, v_sin_b));
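Note on the SinCos_32f hunks: besides hoisting the USE_AVX2 check so the vector constants are only set up when the path actually runs (and dropping the unused v_i constant), two real bugs are fixed. _mm_i32gather_pd takes a byte scale as its last argument, so indexing a table of doubles needs 8, not 1; and the upper pair of float lanes must be extracted with a right shift (_mm_srli_si128), not a left shift. The gather contract in isolation (gather2_f64 is an illustrative name; requires AVX2):

```cpp
#include <immintrin.h>

// dst[i] = table[idx[i]] for the two low 32-bit indices in idx. The scale
// argument is a compile-time byte multiplier applied to each index, so for
// a table of doubles it must be sizeof(double) == 8, the value the commit
// substitutes for the incorrect 1.
static inline __m128d gather2_f64(const double* table, __m128i idx)
{
    return _mm_i32gather_pd(table, idx, 8);
}
```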
@@ -1032,12 +1043,15 @@ void polarToCart( InputArray src1, InputArray src2,
                 vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m));
             }
 #elif CV_SSE2
+            if (USE_SSE2)
+            {
                 for( ; k <= len - 4; k += 4 )
                 {
                     __m128 v_m = _mm_loadu_ps(mag + k);
                     _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m));
                     _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m));
                 }
+            }
 #endif
             for( ; k < len; k++ )
@@ -1063,9 +1077,9 @@ void polarToCart( InputArray src1, InputArray src2,
                     x[k] = buf[0][k]*m; y[k] = buf[1][k]*m;
                 }
             else
-                for( k = 0; k < len; k++ )
                 {
-                    x[k] = buf[0][k]; y[k] = buf[1][k];
+                    std::memcpy(x, buf[0], sizeof(float) * len);
+                    std::memcpy(y, buf[1], sizeof(float) * len);
                 }
         }
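Note on the polarToCart hunk: when there is no magnitude to multiply in, the element-wise copy loop is replaced by std::memcpy, the idiomatic bulk copy for trivially copyable data, which typically dispatches to a tuned, vectorized routine in the C runtime. The equivalent before/after in isolation:

```cpp
#include <cstring>

// before: for (int k = 0; k < len; k++) dst[k] = src[k];
void copy_floats(float* dst, const float* src, int len)
{
    std::memcpy(dst, src, sizeof(float) * len);
}
```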
@@ -397,6 +397,8 @@ static int countNonZero_(const T* src, int len )
     return nz;
 }
 
+#if CV_SSE2
+
 static const uchar * initPopcountTable()
 {
     static uchar tab[256];
@@ -425,6 +427,8 @@ static const uchar * initPopcountTable()
     return tab;
 }
 
+#endif
+
 static int countNonZero8u( const uchar* src, int len )
 {
     int i=0, nz = 0;
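Note on the countNonZero hunks: initPopcountTable is only consumed by the SSE2 code paths, so wrapping it in #if CV_SSE2 keeps it out of builds that would otherwise flag it as an unused function. For reference, a sketch of the popcount-table idiom such a function supports, counting zero bytes in a 16-byte block via _mm_movemask_epi8 (names are illustrative, not OpenCV's):

```cpp
#include <emmintrin.h>
#include <array>

// tab[v] = number of set bits in the byte v, built once.
static const std::array<unsigned char, 256> tab = [] {
    std::array<unsigned char, 256> t{};
    for (int i = 0; i < 256; i++)
        for (int v = i; v != 0; v >>= 1)
            t[i] = (unsigned char)(t[i] + (v & 1));
    return t;
}();

// Compare 16 bytes to zero, take the 16-bit sign mask, and look up the
// popcount of each half: the result is the number of zero bytes.
static int countZeroBytes16(const unsigned char* p)
{
    __m128i v = _mm_loadu_si128((const __m128i*)p);
    int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_setzero_si128()));
    return tab[mask & 255] + tab[(mask >> 8) & 255];
}
```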
This diff is collapsed.
@@ -1963,9 +1963,9 @@ private:
 struct ResizeAreaFastVec_SIMD_32f
 {
     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
-        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
+        cn(_cn), step(_step)
     {
-        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
+        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
     }
 
     int operator() (const float * S, float * D, int w) const
@@ -2005,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f
     }
 
 private:
-    int scale_x, scale_y;
     int cn;
     bool fast_mode;
     int step;
@@ -2289,9 +2288,10 @@ private:
 struct ResizeAreaFastVec_SIMD_32f
 {
     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
-        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
+        cn(_cn), step(_step)
     {
-        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
+        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
+        fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
     }
 
     int operator() (const float * S, float * D, int w) const
@@ -2335,7 +2335,6 @@ struct ResizeAreaFastVec_SIMD_32f
     }
 
 private:
-    int scale_x, scale_y;
     int cn;
     bool fast_mode;
     int step;
@@ -4817,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
         size.height = 1;
     }
 
+#if CV_SSE2
+    bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+#if CV_SSE4_1
+    bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
+#endif
+
     const float scale = 1.f/INTER_TAB_SIZE;
     int x, y;
     for( y = 0; y < size.height; y++ )
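Note on the convertMaps hunks (the warpAffine/warpPerspective invokers below follow the same scheme): the compile-time #if CV_SSE2 / #if CV_SSE4_1 guards only say the compiler may emit those instructions; whether the CPU running the binary supports them is a separate, runtime question, now answered once before the pixel loops and cached in useSSE2/useSSE4_1. The shape of the pattern, with the scalar tail keeping the function correct either way (the function and its operation are illustrative; CV_SSE4_1 and checkHardwareSupport are OpenCV's own macro and probe):

```cpp
#include <smmintrin.h>   // SSE4.1 intrinsics
#include <cmath>
#include <cstddef>

static void floatToU16Rows(const float* src, unsigned short* dst,
                           int width, int height)
{
#if CV_SSE4_1
    bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);   // probe once
#endif
    for (int y = 0; y < height; y++)
    {
        const float* s = src + (size_t)y * width;
        unsigned short* d = dst + (size_t)y * width;
        int x = 0;
#if CV_SSE4_1
        if (useSSE4_1)
            for (; x <= width - 4; x += 4)
            {
                __m128i v = _mm_cvtps_epi32(_mm_loadu_ps(s + x));
                // _mm_packus_epi32 (SSE4.1): signed 32 -> unsigned 16, saturated
                _mm_storel_epi64((__m128i*)(d + x), _mm_packus_epi32(v, v));
            }
#endif
        for (; x < width; x++)
            d[x] = (unsigned short)std::lrintf(s[x]);       // scalar tail
    }
}
```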
@@ -4848,6 +4854,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst2q_s16(dst1 + (x << 1), v_dst);
             }
 #elif CV_SSE4_1
+            if (useSSE4_1)
+            {
                 for( ; x <= size.width - 16; x += 16 )
                 {
                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
@@ -4867,6 +4875,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                     _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
                     _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
                 }
+            }
 #endif
             for( ; x < size.width; x++ )
             {
@@ -4902,6 +4911,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
             }
 #elif CV_SSE4_1
+            if (useSSE4_1)
+            {
                 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
                 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
@@ -4944,6 +4955,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                     _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
                     _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
                 }
+            }
 #endif
             for( ; x < size.width; x++ )
             {
@@ -5005,6 +5017,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
             }
 #elif CV_SSE2
+            if (useSSE2)
+            {
                 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
                 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
                 __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
@@ -5025,6 +5039,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                                                    _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
                     _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
                 }
+            }
 #endif
             for( ; x < size.width; x++ )
             {
@@ -5150,6 +5165,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst2q_f32(dst1f + (x << 1) + 8, v_dst);
             }
 #elif CV_SSE2
+            if (useSSE2)
+            {
                 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
                 __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
                 __m128 v_scale = _mm_set1_ps(scale);
@@ -5167,6 +5184,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                     v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale);
                     _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add));
                 }
+            }
 #endif
             for( ; x < size.width; x++ )
             {
@@ -5204,7 +5222,10 @@ public:
         const int AB_SCALE = 1 << AB_BITS;
         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
 #if CV_SSE2
-        bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+#if CV_SSE4_1
+        bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
 #endif
 
         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
@@ -5243,6 +5264,8 @@ public:
                     vst2q_s16(xy + (x1 << 1), v_dst);
                 }
 #elif CV_SSE4_1
+                if (useSSE4_1)
+                {
                     __m128i v_X0 = _mm_set1_epi32(X0);
                     __m128i v_Y0 = _mm_set1_epi32(Y0);
                     for ( ; x1 <= bw - 16; x1 += 16)
@@ -5264,6 +5287,7 @@ public:
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
                     }
+                }
 #endif
                 for( ; x1 < bw; x1++ )
                 {
@@ -5278,7 +5302,7 @@ public:
                 short* alpha = A + y1*bw;
                 x1 = 0;
 #if CV_SSE2
-                if( useSIMD )
+                if( useSSE2 )
                 {
                     __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
                     __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
@@ -5672,6 +5696,7 @@ public:
         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
 
 #if CV_SSE4_1
+        bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
         __m128d v_M0 = _mm_set1_pd(M[0]);
         __m128d v_M3 = _mm_set1_pd(M[3]);
         __m128d v_M6 = _mm_set1_pd(M[6]);
@@ -5706,6 +5731,8 @@ public:
                     x1 = 0;
 
 #if CV_SSE4_1
+                    if (haveSSE4_1)
+                    {
                     __m128d v_X0d = _mm_set1_pd(X0);
                     __m128d v_Y0d = _mm_set1_pd(Y0);
                     __m128d v_W0 = _mm_set1_pd(W0);
@@ -5810,6 +5837,7 @@ public:
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
                     }
+                    }
 #endif
                     for( ; x1 < bw; x1++ )
@@ -5831,6 +5859,8 @@ public:
                     x1 = 0;
 
 #if CV_SSE4_1
+                    if (haveSSE4_1)
+                    {
                     __m128d v_X0d = _mm_set1_pd(X0);
                     __m128d v_Y0d = _mm_set1_pd(Y0);
                     __m128d v_W0 = _mm_set1_pd(W0);
@@ -5948,6 +5978,7 @@ public:
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
                     }
+                    }
 #endif
                     for( ; x1 < bw; x1++ )