Commit 9d602f27 authored by Vitaly Tuzov

Replaced SSE2 area resize implementation with wide universal intrinsic implementation

parent ceca139f
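The change below swaps hand-written, fixed-width SSE2 (and NEON) kernels for OpenCV's wide universal intrinsics (CV_SIMD), so one source compiles to 128-, 256- or 512-bit SIMD depending on CV_SIMD_WIDTH and no longer needs runtime checkHardwareSupport() guards. The following is a minimal, illustrative sketch of that pattern only, not code taken from the diff; the function name is hypothetical, and it assumes OpenCV 4.x-era universal intrinsics (v_float32::nlanes, vector operator overloads), the same API the diff uses. The actual commit diff follows after the sketch.

// Illustrative sketch of the wide universal intrinsic pattern (hypothetical
// helper, not part of the commit). Vertically interpolates two float rows
// S0/S1 with weights beta[0]/beta[1] into dst, as the VResizeLinearVec_32f
// kernel in the diff does.
#include <opencv2/core/hal/intrin.hpp>

static int vresize_linear_32f_sketch(const float* S0, const float* S1,
                                     float* dst, const float* beta, int width)
{
    int x = 0;
#if CV_SIMD
    // Lane count adapts to the widest SIMD ISA enabled at build time
    // (4 lanes for SSE/NEON, 8 for AVX2, 16 for AVX-512) instead of a
    // hard-coded __m128 width.
    cv::v_float32 b0 = cv::vx_setall_f32(beta[0]);
    cv::v_float32 b1 = cv::vx_setall_f32(beta[1]);
    for (; x <= width - cv::v_float32::nlanes; x += cv::v_float32::nlanes)
        // dst = S0*b0 + S1*b1, computed as a fused multiply-add per vector
        cv::v_store(dst + x, cv::v_muladd(cv::vx_load(S0 + x), b0,
                                          cv::vx_load(S1 + x) * b1));
#endif
    return x;  // the scalar tail (x..width) is left to the caller
}
// End of illustrative sketch; the commit diff starts below.

The real kernels additionally branch on pointer alignment, using vx_load_aligned when the source rows are CV_SIMD_WIDTH-aligned, which is the same aligned/unaligned split the old SSE2 code made with _mm_load_si128 versus _mm_loadu_si128.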
@@ -1181,760 +1181,247 @@ struct HResizeNoVec
const uchar*, int, int, int, int, int) const { return 0; }
};
#if CV_SSE2
#if CV_SIMD
struct VResizeLinearVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const int** src = (const int**)_src;
const short* beta = (const short*)_beta;
const int *S0 = src[0], *S1 = src[1];
int x = 0;
__m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
__m128i delta = _mm_set1_epi16(2);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 16; x += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_load_si128((const __m128i*)(S0 + x));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S1 + x));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
_mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
}
v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]);
if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x ) >> 4, vx_load_aligned(S0 + x + v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load_aligned(S1 + x ) >> 4, vx_load_aligned(S1 + x + v_int32::nlanes) >> 4), b1),
v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
else
for( ; x <= width - 16; x += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
_mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
}
for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x ) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load(S1 + x ) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1),
v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
for( ; x < width - 4; x += 4 )
{
__m128i x0, y0;
x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
x0 = _mm_packs_epi32(x0, x0);
y0 = _mm_packs_epi32(y0, y0);
x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x0 = _mm_packus_epi16(x0, x0);
*(int*)(dst + x) = _mm_cvtsi128_si32(x0);
}
for( ; x < width - v_int16::nlanes; x += v_int16::nlanes)
v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1));
return x;
}
};
template<int shiftval> struct VResizeLinearVec_32f16
struct VResizeLinearVec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
ushort* dst = (ushort*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
__m128i preshift = _mm_set1_epi32(shiftval);
__m128i postshift = _mm_set1_epi16((short)shiftval);
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 16; x += 16 )
{
__m128 x0, x1, y0, y1;
__m128i t0, t1, t2;
x0 = _mm_load_ps(S0 + x);
x1 = _mm_load_ps(S0 + x + 4);
y0 = _mm_load_ps(S1 + x);
y1 = _mm_load_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
x0 = _mm_load_ps(S0 + x + 8);
x1 = _mm_load_ps(S0 + x + 12);
y0 = _mm_load_ps(S1 + x + 8);
y1 = _mm_load_ps(S1 + x + 12);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
_mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
}
if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)),
v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
else
for( ; x <= width - 16; x += 16 )
{
__m128 x0, x1, y0, y1;
__m128i t0, t1, t2;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
x0 = _mm_loadu_ps(S0 + x + 8);
x1 = _mm_loadu_ps(S0 + x + 12);
y0 = _mm_loadu_ps(S1 + x + 8);
y1 = _mm_loadu_ps(S1 + x + 12);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
_mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
}
for( ; x < width - 4; x += 4 )
for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
{
__m128 x0, y0;
__m128i t0;
x0 = _mm_loadu_ps(S0 + x);
y0 = _mm_loadu_ps(S1 + x);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
_mm_storel_epi64( (__m128i*)(dst + x), t0);
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
v_store_low(dst + x, v_pack_u(t0, t0));
}
return x;
}
};
typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
struct VResizeLinearVec_32f
struct VResizeLinearVec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
float* dst = (float*)_dst;
short* dst = (short*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1;
x0 = _mm_load_ps(S0 + x);
x1 = _mm_load_ps(S0 + x + 4);
y0 = _mm_load_ps(S1 + x);
y1 = _mm_load_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
_mm_storeu_ps( dst + x, x0);
_mm_storeu_ps( dst + x + 4, x1);
}
else
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
_mm_storeu_ps( dst + x, x0);
_mm_storeu_ps( dst + x + 4, x1);
}
return x;
}
};
struct VResizeCubicVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const int** src = (const int**)_src;
const short* beta = (const short*)_beta;
const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
int x = 0;
float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
__m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
for( ; x <= width - 8; x += 8 )
{
__m128i x0, x1, y0, y1;
__m128 s0, s1, f0, f1;
x0 = _mm_load_si128((const __m128i*)(S0 + x));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S1 + x));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_load_si128((const __m128i*)(S2 + x));
x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S3 + x));
y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));
f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_cvtps_epi32(s0);
x1 = _mm_cvtps_epi32(s1);
x0 = _mm_packs_epi32(x0, x1);
_mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
}
if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)),
v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
else
for( ; x <= width - 8; x += 8 )
{
__m128i x0, x1, y0, y1;
__m128 s0, s1, f0, f1;
x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));
f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_cvtps_epi32(s0);
x1 = _mm_cvtps_epi32(s1);
x0 = _mm_packs_epi32(x0, x1);
_mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
}
return x;
}
};
template<int shiftval> struct VResizeCubicVec_32f16
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
ushort* dst = (ushort*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
__m128i preshift = _mm_set1_epi32(shiftval);
__m128i postshift = _mm_set1_epi16((short)shiftval);
for( ; x <= width - 8; x += 8 )
for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
{
__m128 x0, x1, y0, y1, s0, s1;
__m128i t0, t1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
s0 = _mm_mul_ps(x0, b0);
s1 = _mm_mul_ps(x1, b0);
y0 = _mm_mul_ps(y0, b1);
y1 = _mm_mul_ps(y1, b1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
x0 = _mm_loadu_ps(S2 + x);
x1 = _mm_loadu_ps(S2 + x + 4);
y0 = _mm_loadu_ps(S3 + x);
y1 = _mm_loadu_ps(S3 + x + 4);
x0 = _mm_mul_ps(x0, b2);
x1 = _mm_mul_ps(x1, b2);
y0 = _mm_mul_ps(y0, b3);
y1 = _mm_mul_ps(y1, b3);
s0 = _mm_add_ps(s0, x0);
s1 = _mm_add_ps(s1, x1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
v_store_low(dst + x, v_pack(t0, t0));
}
return x;
}
};
typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
struct VResizeCubicVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
float* dst = (float*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1, s0, s1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
s0 = _mm_mul_ps(x0, b0);
s1 = _mm_mul_ps(x1, b0);
y0 = _mm_mul_ps(y0, b1);
y1 = _mm_mul_ps(y1, b1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
x0 = _mm_loadu_ps(S2 + x);
x1 = _mm_loadu_ps(S2 + x + 4);
y0 = _mm_loadu_ps(S3 + x);
y1 = _mm_loadu_ps(S3 + x + 4);
x0 = _mm_mul_ps(x0, b2);
x1 = _mm_mul_ps(x1, b2);
y0 = _mm_mul_ps(y0, b3);
y1 = _mm_mul_ps(y1, b3);
s0 = _mm_add_ps(s0, x0);
s1 = _mm_add_ps(s1, x1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
_mm_storeu_ps( dst + x, s0);
_mm_storeu_ps( dst + x + 4, s1);
}
return x;
}
};
#if CV_TRY_SSE4_1
struct VResizeLanczos4Vec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width);
else return 0;
}
};
#else
typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
#endif
struct VResizeLanczos4Vec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
short * dst = (short*)_dst;
int x = 0;
__m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
for( ; x <= width - 8; x += 8 )
{
__m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
__m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));
__m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
__m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);
_mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
}
return x;
}
};
struct VResizeLanczos4Vec_32f
struct VResizeLinearVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
const float *S0 = src[0], *S1 = src[1];
float* dst = (float*)_dst;
int x = 0;
__m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
for( ; x <= width - 4; x += 4 )
{
__m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
_mm_storeu_ps(dst + x, v_dst);
}
if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1));
else
for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
return x;
}
};
#elif CV_NEON
struct VResizeLinearVec_32s8u
struct VResizeCubicVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
const int** src = (const int**)_src;
const short* beta = (const short*)_beta;
const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
int x = 0;
int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);
for( ; x <= width - 16; x += 16)
{
int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);
int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);
v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);
v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);
vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
}
float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale),
b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale);
if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x )), b0,
v_muladd(v_cvt_f32(vx_load_aligned(S1 + x )), b1,
v_muladd(v_cvt_f32(vx_load_aligned(S2 + x )), b2,
v_cvt_f32(vx_load_aligned(S3 + x )) * b3)))),
v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)), b0,
v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)), b1,
v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)), b2,
v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3))))));
else
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x )), b0,
v_muladd(v_cvt_f32(vx_load(S1 + x )), b1,
v_muladd(v_cvt_f32(vx_load(S2 + x )), b2,
v_cvt_f32(vx_load(S3 + x )) * b3)))),
v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)), b0,
v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)), b1,
v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)), b2,
v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3))))));
return x;
}
};
struct VResizeLinearVec_32f16u
struct VResizeCubicVec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
ushort* dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0,
v_muladd(vx_load(S1 + x ), b1,
v_muladd(vx_load(S2 + x ), b2,
vx_load(S3 + x ) * b3)))),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
vx_load(S3 + x + v_float32::nlanes) * b3))))));
return x;
}
};
struct VResizeLinearVec_32f16s
struct VResizeCubicVec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
short* dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0,
v_muladd(vx_load(S1 + x ), b1,
v_muladd(vx_load(S2 + x ), b2,
vx_load(S3 + x ) * b3)))),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
vx_load(S3 + x + v_float32::nlanes) * b3))))));
return x;
}
};
struct VResizeLinearVec_32f
struct VResizeCubicVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
float* dst = (float*)_dst;
int x = 0;
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
}
for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
v_store(dst + x, v_muladd(vx_load(S0 + x), b0,
v_muladd(vx_load(S1 + x), b1,
v_muladd(vx_load(S2 + x), b2,
vx_load(S3 + x) * b3))));
return x;
}
};
typedef VResizeNoVec VResizeCubicVec_32s8u;
struct VResizeCubicVec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
ushort* dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
return x;
}
};
#if CV_TRY_SSE4_1
struct VResizeCubicVec_32f16s
struct VResizeLanczos4Vec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
short* dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
return x;
if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width);
else return 0;
}
};
struct VResizeCubicVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
float* dst = (float*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x)));
vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4)));
}
return x;
}
};
#else
struct VResizeLanczos4Vec_32f16u
{
@@ -1946,41 +1433,35 @@ struct VResizeLanczos4Vec_32f16u
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
ushort * dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
v_b5, vld1q_f32(S5 + x + 4)),
v_b6, vld1q_f32(S6 + x + 4)),
v_b7, vld1q_f32(S7 + x + 4));
v_dst1 = vaddq_f32(v_dst0, v_dst1);
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0,
v_muladd(vx_load(S1 + x ), b1,
v_muladd(vx_load(S2 + x ), b2,
v_muladd(vx_load(S3 + x ), b3,
v_muladd(vx_load(S4 + x ), b4,
v_muladd(vx_load(S5 + x ), b5,
v_muladd(vx_load(S6 + x ), b6,
vx_load(S7 + x ) * b7)))))))),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
v_muladd(vx_load(S3 + x + v_float32::nlanes), b3,
v_muladd(vx_load(S4 + x + v_float32::nlanes), b4,
v_muladd(vx_load(S5 + x + v_float32::nlanes), b5,
v_muladd(vx_load(S6 + x + v_float32::nlanes), b6,
vx_load(S7 + x + v_float32::nlanes) * b7))))))))));
return x;
}
};
#endif
struct VResizeLanczos4Vec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
@@ -1991,36 +1472,28 @@ struct VResizeLanczos4Vec_32f16s
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
short * dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
v_b5, vld1q_f32(S5 + x + 4)),
v_b6, vld1q_f32(S6 + x + 4)),
v_b7, vld1q_f32(S7 + x + 4));
v_dst1 = vaddq_f32(v_dst0, v_dst1);
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0,
v_muladd(vx_load(S1 + x ), b1,
v_muladd(vx_load(S2 + x ), b2,
v_muladd(vx_load(S3 + x ), b3,
v_muladd(vx_load(S4 + x ), b4,
v_muladd(vx_load(S5 + x ), b5,
v_muladd(vx_load(S6 + x ), b6,
vx_load(S7 + x ) * b7)))))))),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
v_muladd(vx_load(S3 + x + v_float32::nlanes), b3,
v_muladd(vx_load(S4 + x + v_float32::nlanes), b4,
v_muladd(vx_load(S5 + x + v_float32::nlanes), b5,
v_muladd(vx_load(S6 + x + v_float32::nlanes), b6,
vx_load(S7 + x + v_float32::nlanes) * b7))))))))));
return x;
}
@@ -2036,23 +1509,21 @@ struct VResizeLanczos4Vec_32f
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
float* dst = (float*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 4; x += 4 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
v_store(dst + x, v_muladd(vx_load(S0 + x), b0,
v_muladd(vx_load(S1 + x), b1,
v_muladd(vx_load(S2 + x), b2,
v_muladd(vx_load(S3 + x), b3,
v_muladd(vx_load(S4 + x), b4,
v_muladd(vx_load(S5 + x), b5,
v_muladd(vx_load(S6 + x), b6,
vx_load(S7 + x) * b7))))))));
return x;
}
@@ -2695,95 +2166,94 @@ private:
int step;
};
#elif CV_SSE2
#elif CV_SIMD
class ResizeAreaFastVec_SIMD_8u
{
public:
ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
cn(_cn), step(_step) {}
int operator() (const uchar* S, uchar* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const uchar* S0 = S;
const uchar* S1 = S0 + step;
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi16(2);
if (cn == 1)
{
__m128i masklow = _mm_set1_epi16(0x00ff);
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
v_uint16 masklow = vx_setall_u16(0x00ff);
for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
__m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0));
v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1));
v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow));
}
}
else if (cn == 3)
for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
{
if (CV_SIMD_WIDTH > 64)
return 0;
for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
__m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
__m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
__m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
__m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
__m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)(D+3), s0);
v_uint16 t0, t1, t2, t3, t4, t5;
v_uint16 s0, s1, s2, s3, s4, s5;
s0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
s1 = vx_load_expand(S0 + v_uint16::nlanes) + vx_load_expand(S1 + v_uint16::nlanes);
s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes);
s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes);
s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes);
s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_uint16 bl, gl, rl;
#if CV_SIMD_WIDTH == 16
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#elif CV_SIMD_WIDTH == 32
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#elif CV_SIMD_WIDTH == 64
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#endif
s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes);
s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes);
s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes);
s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes);
s4 = vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes);
s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_uint16 bh, gh, rh;
#if CV_SIMD_WIDTH == 16
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#elif CV_SIMD_WIDTH == 32
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#elif CV_SIMD_WIDTH == 64
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#endif
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
}
else
{
CV_Assert(cn == 4);
int v[] = { 0, 0, -1, -1 };
__m128i mask = _mm_loadu_si128((const __m128i*)v);
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
__m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
__m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
__m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
__m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
__m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
__m128i res0 = _mm_srli_epi16(s0, 2);
s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
__m128i res1 = _mm_srli_epi16(s0, 2);
s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
_mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
_mm_storel_epi64((__m128i*)(D), s0);
v_uint32 r00, r01, r10, r11;
v_load_deinterleave((uint32_t*)S0, r00, r01);
v_load_deinterleave((uint32_t*)S1, r10, r11);
v_uint16 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
v_expand(v_reinterpret_as_u8(r00), r00l, r00h);
v_expand(v_reinterpret_as_u8(r01), r01l, r01h);
v_expand(v_reinterpret_as_u8(r10), r10l, r10h);
v_expand(v_reinterpret_as_u8(r11), r11l, r11h);
v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
}
}
@@ -2792,7 +2262,6 @@ public:
private:
int cn;
bool use_simd;
int step;
};
@@ -2800,164 +2269,258 @@ class ResizeAreaFastVec_SIMD_16u
{
public:
ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
cn(_cn), step(_step) {}
int operator() (const ushort* S, ushort* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const ushort* S0 = (const ushort*)S;
const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
__m128i masklow = _mm_set1_epi32(0x0000ffff);
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi32(2);
#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
if (cn == 1)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
v_uint32 masklow = vx_setall_u32(0x0000ffff);
for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
__m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
s0 = _mm_srli_epi32(s0, 2);
s0 = _mm_packus_epi32(s0, zero);
_mm_storel_epi64((__m128i*)D, s0);
v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0));
v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1));
v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow));
}
}
else if (cn == 3)
{
#if CV_SIMD_WIDTH == 16
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
#if CV_SSE4_1
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
__m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
__m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
__m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
__m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
__m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
v_uint32 r0, r1, r2, r3;
v_expand(vx_load(S0), r0, r1);
v_expand(vx_load(S1), r2, r3);
r0 += r2; r1 += r3;
v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0));
}
#else
v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
#endif
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
{
v_uint32 t0, t1, t2, t3, t4, t5;
v_uint32 s0, s1, s2, s3, s4, s5;
s0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
s1 = vx_load_expand(S0 + v_uint32::nlanes) + vx_load_expand(S1 + v_uint32::nlanes);
s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes);
s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + vx_load_expand(S1 + 3*v_uint32::nlanes);
s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes);
s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_uint32 bl, gl, rl;
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#else //CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#endif
s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes);
s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes);
s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes);
s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes);
s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes);
s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_uint32 bh, gh, rh;
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#else //CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#endif
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#elif CV_SIMD_WIDTH >= 64
v_uint32 masklow = vx_setall_u32(0x0000ffff);
for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
{
v_uint16 b0, g0, r0, b1, g1, r1;
v_load_deinterleave(S0, b0, g0, r0);
v_load_deinterleave(S1, b1, g1, r1);
v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0);
v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1);
v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#endif
}
else
{
CV_Assert(cn == 4);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
#if CV_SIMD_WIDTH >= 64
for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
__m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
__m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
__m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
__m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
__m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
v_uint64 r00, r01, r10, r11;
v_load_deinterleave((uint64_t*)S0, r00, r01);
v_load_deinterleave((uint64_t*)S1, r10, r11);
v_uint32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
v_expand(v_reinterpret_as_u16(r00), r00l, r00h);
v_expand(v_reinterpret_as_u16(r01), r01l, r01h);
v_expand(v_reinterpret_as_u16(r10), r10l, r10h);
v_expand(v_reinterpret_as_u16(r11), r11l, r11h);
v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
}
#else
for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
{
v_uint32 r0, r1, r2, r3;
v_expand(vx_load(S0), r0, r1);
v_expand(vx_load(S1), r2, r3);
r0 += r2; r1 += r3;
v_uint32 v_d;
#if CV_SIMD_WIDTH == 16
v_d = r0 + r1;
#elif CV_SIMD_WIDTH == 32
v_uint32 t0, t1;
v_recombine(r0, r1, t0, t1);
v_d = t0 + t1;
#endif
v_rshr_pack_store<2>(D, v_d);
}
#endif
}
#undef _mm_packus_epi32
return dx;
}
private:
int cn;
int step;
bool use_simd;
};
class ResizeAreaFastVec_SIMD_16s
{
public:
ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
cn(_cn), step(_step) {}
int operator() (const short* S, short* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const short* S0 = (const short*)S;
const short* S1 = (const short*)((const uchar*)(S) + step);
__m128i masklow = _mm_set1_epi32(0x0000ffff);
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi32(2);
if (cn == 1)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
v_int32 masklow = vx_setall_s32(0x0000ffff);
for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
__m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
s0 = _mm_srai_epi32(s0, 2);
s0 = _mm_packs_epi32(s0, zero);
_mm_storel_epi64((__m128i*)D, s0);
v_int32 r0 = v_reinterpret_as_s32(vx_load(S0));
v_int32 r1 = v_reinterpret_as_s32(vx_load(S1));
v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16));
}
}
else if (cn == 3)
{
#if CV_SIMD_WIDTH == 16
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
{
v_int32 t0, t1, t2, t3, t4, t5;
v_int32 s0, s1, s2, s3, s4, s5;
s0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes);
s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes);
s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_int32 bl, gl, rl;
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#else //CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#endif
s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes);
s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes);
s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes);
s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes);
s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes);
s5 = vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_int32 bh, gh, rh;
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#else //CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#endif
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#elif CV_SIMD_WIDTH >= 64
for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
__m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
__m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
__m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
__m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
__m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
v_int16 b0, g0, r0, b1, g1, r1;
v_load_deinterleave(S0, b0, g0, r0);
v_load_deinterleave(S1, b1, g1, r1);
v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0);
v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1);
v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#endif
}
else
{
CV_Assert(cn == 4);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
__m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
__m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
__m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
__m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
__m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
#if CV_SIMD_WIDTH >= 64
v_int64 r00, r01, r10, r11;
v_load_deinterleave((int64_t*)S0, r00, r01);
v_load_deinterleave((int64_t*)S1, r10, r11);
v_int32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
v_expand(v_reinterpret_as_s16(r00), r00l, r00h);
v_expand(v_reinterpret_as_s16(r01), r01l, r01h);
v_expand(v_reinterpret_as_s16(r10), r10l, r10h);
v_expand(v_reinterpret_as_s16(r11), r11l, r11h);
v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
#else
v_int32 r0, r1, r2, r3;
r0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes);
r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
v_int32 dl, dh;
#if CV_SIMD_WIDTH == 16
dl = r0 + r1; dh = r2 + r3;
#elif CV_SIMD_WIDTH == 32
v_int32 t0, t1, t2, t3;
v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3);
dl = t0 + t1; dh = t2 + t3;
#endif
v_store(D, v_rshr_pack<2>(dl, dh));
#endif
}
}
@@ -2967,7 +2530,6 @@ public:
private:
int cn;
int step;
bool use_simd;
};
struct ResizeAreaFastVec_SIMD_32f
@@ -2976,7 +2538,6 @@ struct ResizeAreaFastVec_SIMD_32f
cn(_cn), step(_step)
{
fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const float * S, float * D, int w) const
@@ -2987,33 +2548,32 @@ struct ResizeAreaFastVec_SIMD_32f
const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
int dx = 0;
__m128 v_025 = _mm_set1_ps(0.25f);
if (cn == 1)
{
const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
v_float32 v_025 = vx_setall_f32(0.25f);
for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
{
__m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
__m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
_mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
__m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
_mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
v_float32 v_row00, v_row01, v_row10, v_row11;
v_load_deinterleave(S0, v_row00, v_row01);
v_load_deinterleave(S1, v_row10, v_row11);
v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025);
}
}
else if (cn == 4)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
#if CV_SIMD_WIDTH == 16
v_float32 v_025 = vx_setall_f32(0.25f);
for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025);
#elif CV_SIMD256
v_float32x8 v_025 = v256_setall_f32(0.25f);
for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes)
{
__m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
__m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
v_float32x8 dst0, dst1;
v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1);
v_store(D, (dst0 + dst1) * v_025);
}
#endif
}
return dx;