Unverified Commit 4b632022 authored by Tobias Senst, committed by GitHub

Merge pull request #2476 from tsenst:optimize_performance_rlof

Rework RLOF by using HAL universal intrinsics

* * distinguish between SSE2 and SSE 4.1 support
* SSE2 now implements a non-parallelized _blendv function and allows compiling with SSE2 instructions

* * add interface function to enable/disable the M-estimator framework
* fix bug in blendv functions

* * make use of _mm_store and _mm_load functions in the blendv_ps and blendv_epi functions to fix a compiler error

* * implement substitutes for _mm_cvtepi8_epi16 and _mm_cvtepi16_epi32 when compiling with less than SSE 4.1 support

* * implement a substitute for _mm_abs_epi16 when compiling with less than SSE 3 support

* * move _mm_abs_epi16 to the SSE4 requirement

* * add HAL intrinsic functions for RLOF ICA
* first tests are OK

* * HAL intrinsic functions -> RLOF ICA is ready now

* * add HAL functions for RLOF illumination model

* * add HAL for PLK ILL

* * add HAL functions for PLK ICA

* * commit HAL functions for BEPLK ICA

* * add HAL functions for BERLOF ICA

* * add HAL functions for BEPLK ILL

* * remove unused #ifdef

* * remove whitespace and unused variables

* * remove unused variables

* * remove the get4BitMask function, which contained pure SSE instructions

* * remove buffIdx, which was initialized but never used

* * fix replacement of #ifdef CV_SIMD128 with #if CV_SIMD128
* rename useMEstimator to setUseMEstimator

* * remove whitespace

* * remove unused variable abss

* * remove unused value

* * rename W_BITS1 to W_BITS since they contain the same value
parent 8ab145c4
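
For context on the SSE2 fallback mentioned in the first commits above, the snippet below is a minimal, hedged sketch of one common way to emulate the SSE4.1 _mm_blendv_ps on plain SSE2. The helper name blendv_ps_sse2 is hypothetical and the exact implementation in the intermediate commits may differ; the final patch replaces such raw intrinsics with HAL universal intrinsics altogether.

#include <emmintrin.h>  // SSE2 only

// Emulates _mm_blendv_ps(a, b, mask) for masks whose lanes are all-ones or all-zeros
// (as produced by SSE comparison intrinsics): picks b where the mask is set, a elsewhere.
static inline __m128 blendv_ps_sse2(__m128 a, __m128 b, __m128 mask)
{
    return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
}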
......@@ -66,12 +66,12 @@ public:
RLOFOpticalFlowParameter()
:solverType(ST_BILINEAR)
,supportRegionType(SR_CROSS)
,normSigma0(3.2f)
,normSigma1(7.f)
,normSigma0(std::numeric_limits<float>::max())
,normSigma1(std::numeric_limits<float>::max())
,smallWinSize(9)
,largeWinSize(21)
,crossSegmentationThreshold(25)
,maxLevel(5)
,maxLevel(4)
,useInitialFlow(false)
,useIlluminationModel(true)
,useGlobalMotionPrior(true)
......@@ -90,13 +90,13 @@ public:
*/
float normSigma0;
/**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If
/**< &sigma parameter of the shrinked Hampel norm introduced in @cite Senst2012. If
* &sigma = std::numeric_limits<float>::max() the least-squares estimator will be used
* instead of the M-estimator. Although the M-estimator is more robust against outliers in the support
* region, the least-squares estimator is faster to compute.
*/
float normSigma1;
/**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If
/**< &sigma parameter of the shrinked Hampel norm introduced in @cite Senst2012. If
* &sigma = std::numeric_limits<float>::max() the least-squares estimator will be used
* instead of the M-estimator. Although the M-estimator is more robust against outliers in the support
* region, the least-squares estimator is faster to compute.
......@@ -151,6 +151,14 @@ public:
* See @cite Senst2016 for more details.
*/
//! @brief Enable M-estimator or disable and use least-square estimator.
/** Enables the M-estimator by setting the sigma parameters to (3.2, 7.0). Disabling the M-estimator can reduce
* runtime, while enabling it can improve accuracy.
* @param val If true the M-estimator is used, otherwise the least-squares estimator is used.
* @see setNormSigma0, setNormSigma1
*/
CV_WRAP void setUseMEstimator(bool val);
CV_WRAP void setSolverType(SolverType val);
CV_WRAP SolverType getSolverType() const;
......@@ -216,9 +224,8 @@ public:
* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
* Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016.
*
* @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the
* forward backward threshold <= 0 that the dense optical flow field is purely.
* computed with the RLOF.
* @note If the grid size is set to (1,1) and the forward backward threshold <= 0, then a pixelwise dense optical flow field is
* computed by RLOF without using interpolation.
*
* @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter
*/
......
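
As a quick orientation for the interface changes above, here is a hedged usage sketch based only on the signatures visible in this diff and in the perf test below; parameter values are illustrative and error handling is omitted.

#include <opencv2/core.hpp>
#include <opencv2/optflow.hpp>

void denseRLOFExample(const cv::Mat& frame0, const cv::Mat& frame1) // 8-bit frames of equal size
{
    using namespace cv::optflow;
    cv::Ptr<RLOFOpticalFlowParameter> param = cv::makePtr<RLOFOpticalFlowParameter>();
    param->setUseMEstimator(false); // new setter: fall back to the least-squares estimator
    cv::Mat flow;
    // Grid step (1,1) and a non-positive forward-backward threshold yield the purely
    // pixelwise RLOF field described in the @note above (no interpolation step).
    calcOpticalFlowDenseRLOF(frame0, frame1, flow, param,
                             0.f /*forwardBackwardThreshold*/,
                             cv::Size(1, 1) /*gridStep*/,
                             INTERP_EPIC);
}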
......@@ -47,7 +47,7 @@ typedef tuple<std::string, int> INTERP_GRID_Dense_t;
typedef TestBaseWithParam<INTERP_GRID_Dense_t> INTERP_GRID_Dense;
PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF,
testing::Combine(
testing::Values<std::string>("INTERP_EPIC", "INTERP_GEO"),
testing::Values<std::string>("INTERP_EPIC", "INTERP_GEO", "INTERP_RIC"),
testing::Values<int>(4,10))
)
{
......@@ -63,6 +63,8 @@ PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF,
interp_type = INTERP_EPIC;
if (get<0>(GetParam()) == "INTERP_GEO")
interp_type = INTERP_GEO;
if (get<0>(GetParam()) == "INTERP_RIC")
interp_type = INTERP_RIC;
PERF_SAMPLE_BEGIN()
calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type);
PERF_SAMPLE_END()
......
......@@ -17,7 +17,6 @@ class TrackerInvoker : public cv::ParallelLoopBody
{
public:
TrackerInvoker(
const Mat& _prevImg,
const Mat& _prevDeriv,
const Mat& _nextImg,
......@@ -73,7 +72,7 @@ public:
int winMaskwidth = roundUp(winSize.width, 16);
cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType);
winMaskMatBuf.setTo(1);
const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20
const float FLT_SCALE = (1.f/(1 << 16));
int cn = I.channels(), cn2 = cn*2;
int winbufwidth = roundUp(winSize.width, 16);
......@@ -137,7 +136,7 @@ public:
float a = prevPt.x - iprevPt.x;
float b = prevPt.y - iprevPt.y;
const int W_BITS = 14, W_BITS1 = 14;
const int W_BITS = 14;
int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
......@@ -151,93 +150,8 @@ public:
float w1 = 0, w2 = 0; // -IyI
float dI = 0; // I^2
float D = 0;
#ifdef RLOF_SSE
__m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
__m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128i z = _mm_setzero_si128();
__m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
__m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
__m128i mmMask4_epi32;
__m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits<unsigned short>::max());
get4BitMask(winSize.width, mmMask4_epi32);
#endif
// extract the patch from the first image, compute covariance matrix of derivatives
int x, y;
for( y = 0; y < winSize.height; y++ )
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 )
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16);
__m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16);
__m128i v00, v01, v10, v11, t0, t1;
v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z);
v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z);
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
if( x + 4 > winSize.width)
{
t0 = _mm_and_si128(t0, mmMask4_epi32);
}
t0 = _mm_and_si128(t0, mask_0_3_epi16);
_mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
v00 = _mm_loadu_si128((const __m128i*)(dsrc));
v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
v10 = _mm_loadu_si128((const __m128i*)(dsrc1));
v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2));
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
if( x + 4 > winSize.width)
{
v00 = _mm_and_si128(v00, mmMask4_epi32);
}
v00 = _mm_and_si128(v00, mask_0_3_epi16);
_mm_storeu_si128((__m128i*)dIptr, v00);
}
#else
for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 )
{
if( maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5);
int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 +
dsrc1[cn2+1]*iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
}
#endif
}
copyWinBuffers(iw00, iw01, iw10, iw11, winSize, I, derivI, winMaskMat, IWinBuf, derivIWinBuf, iprevPt);
cv::Point2f backUpNextPt = nextPt;
nextPt += halfWin;
......@@ -286,183 +200,136 @@ public:
A12 = 0;
A22 = 0;
}
#ifdef RLOF_SSE
qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps();
__m128 qb3 = _mm_setzero_ps();
__m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps();
__m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps();
__m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps();
__m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
#if CV_SIMD128
v_int16x8 vqw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_float32x4 vqb0 = v_setzero_f32(), vqb1 = v_setzero_f32(), vqb2 = v_setzero_f32(), vqb3 = v_setzero_f32();
v_float32x4 vsumW1 = v_setzero_f32(), vsumW2 = v_setzero_f32();
v_float32x4 vsumIy = v_setzero_f32(), vsumIx = v_setzero_f32(), vsumI = v_setzero_f32(), vsumDI = v_setzero_f32();
v_float32x4 vAxx = v_setzero_f32(), vAxy = v_setzero_f32(), vAyy = v_setzero_f32();
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int16x8 vmax_val_16 = v_setall_s16(std::numeric_limits<unsigned short>::max());
float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x;
int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f));
__m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
__m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.y));
v_int16x8 vgain_value = v_setall_s16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
v_int16x8 vconst_value = v_setall_s16(static_cast<short>(gainVec.y));
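// Editorial note (not part of the patch): bitShift picks a fixed-point scale with headroom,
// e.g. for gainVec.x = 0.5 it gives bitShift = cvCeil(log2(200 / 0.5)) = 9, so the gain is
// stored as 0.5 * (1 << 9) = 256 in a 16-bit lane and products are shifted right by 9 again.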
#endif
for( y = 0; y < winSize.height; y++ )
for(int y = 0; y < winSize.height; y++ )
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 )
{
// iw = iw << 14 (16384 short / 2)
// iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16);
// I [0...8]
__m128i diff0, diff1;
__m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7
__m128i v00 = _mm_unpacklo_epi8(
_mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64-bit zeros
, z); //J0 , 0, J1, 0, J2, 0 ... J7,0
__m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
__m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
__m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
__m128i t0 = _mm_add_epi32
(_mm_madd_epi16(
_mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4
qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
__m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
__m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16);
__m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16);
__m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift);
__m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift);
__m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32);
// diff = J - I + I*gain.x + gain.y
__m128i mmDiff_epi16 = _mm_add_epi16(
_mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I
_mm_add_epi16(Igain_epi16, mmConstValue_epi16));
mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
__m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
v10 = _mm_mullo_epi16(Ixy_0, diff0);
v11 = _mm_mulhi_epi16(Ixy_0, diff0);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
v10 = _mm_mullo_epi16(Ixy_1, diff1);
v11 = _mm_mulhi_epi16(Ixy_1, diff1);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
// for set 1 sigma 1
// b3 += diff * Iptr[x]
Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16);
Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16);
v00 = _mm_unpacklo_epi16(Ilo, Ihi);
v10 = _mm_unpackhi_epi16(Ilo, Ihi);
qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00));
qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10));
// b4+= diff
__m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16));
__m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16)));
qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps);
qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps);
if( j == 0 )
#if CV_SIMD128
for(int x = 0 ; x <= winSize.width*cn; x += 8, dIptr += 8*2 )
{
__m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16));
__m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16)));
v_int16x8 vI = v_reinterpret_as_s16(v_load(Iptr + x)), diff0, diff1, diff2;
v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
__m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps);
__m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
// 0 ... 3
__m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16));
// diff = J - I
diff0 = v_pack(t0, t1) - vI;
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
diff0 = diff0 & vmask;
v_zip(diff0, diff0, diff2, diff1);
// A11 - A22
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx));
v_int32x4 diff0_0;
v_int32x4 diff0_1;
v_expand(diff0, diff0_0, diff0_1);
// sumIx and sumIy
mmSumIx = _mm_add_ps(mmSumIx, fx);
mmSumIy = _mm_add_ps(mmSumIy, fy);
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx));
mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy));
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
// sumI
I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps);
mmSumI = _mm_add_ps(mmSumI,I_ps);
v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0);
vqb2 += v_cvt_f32(diff0_1 * vI1);
// sumDI
mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps));
vqb3 += v_cvt_f32(diff0_0);
vqb3 += v_cvt_f32(diff0_1);
t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3
if (j == 0)
{
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_0))));
v_expand(v00, t1, t0);
fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps);
fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps);
v_float32x4 vI_ps = v_cvt_f32(vI0);
// 4 ... 7
I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16));
v_float32x4 fy = v_cvt_f32(t0);
v_float32x4 fx = v_cvt_f32(t1);
vAyy = v_muladd(fy, fy, vAyy);
vAxy = v_muladd(fx, fy, vAxy);
vAxx = v_muladd(fx, fx, vAxx);
// sumIx and sumIy
vsumIx += fx;
vsumIy += fy;
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
// sumI
vsumI += vI_ps;
// sumDI
vsumDI += vI_ps * vI_ps;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
vI_ps = v_cvt_f32(vI1);
fy = v_cvt_f32(t0);
fx = v_cvt_f32(t1);
vAyy = v_muladd(fy, fy, vAyy);
vAxy = v_muladd(fx, fy, vAxy);
vAxx = v_muladd(fx, fx, vAxx);
// sumIx and sumIy
mmSumIx = _mm_add_ps(mmSumIx, fx);
mmSumIy = _mm_add_ps(mmSumIy, fy);
vsumIx += fx;
vsumIy += fy;
mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx));
mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy));
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
// sumI
I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps);
mmSumI = _mm_add_ps(mmSumI, I_ps);
vsumI += vI_ps;
// sumW
// sumDI
mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps));
vsumDI += vI_ps * vI_ps;
}
}
#else
for( ; x < winSize.width*cn; x++, dIptr += 2 )
for(int x = 0 ; x < winSize.width*cn; x++, dIptr += 2 )
{
if( maskPtr[x] == 0)
continue;
int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr1[x]*iw10 + Jptr1[x+cn]*iw11,
W_BITS1-5);
W_BITS-5);
int diff = static_cast<int>(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y);
b1 += (float)(diff*dIptr[0]) * FLT_RESCALE;
......@@ -492,26 +359,19 @@ public:
}
#endif
}
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(16) wbuf[4];
#endif
if( j == 0 )
{
#ifdef RLOF_SSE
_mm_store_ps(wbuf, mmSumW1);
w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumW2);
w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumDI);
dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumI);
sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumIx);
sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumIy);
sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumW);
sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
#if CV_SIMD128
w1 = v_reduce_sum(vsumW1);
w2 = v_reduce_sum(vsumW2);
dI = v_reduce_sum(vsumDI);
sumI = v_reduce_sum(vsumI);
sumIx = v_reduce_sum(vsumIx);
sumIy = v_reduce_sum(vsumIy);
A11 = v_reduce_sum(vAxx);
A12 = v_reduce_sum(vAxy);
A22 = v_reduce_sum(vAyy);
#endif
sumIx *= -FLT_SCALE;
sumIy *= -FLT_SCALE;
......@@ -520,31 +380,19 @@ public:
w1 *= -FLT_SCALE;
w2 *= -FLT_SCALE;
dI *= FLT_SCALE;
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];//
_mm_store_ps(A11buf, mmAxx);
_mm_store_ps(A12buf, mmAxy);
_mm_store_ps(A22buf, mmAyy);
A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
}
#ifdef RLOF_SSE
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
v_store_aligned(bbuf, vqb0 + vqb1);
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
_mm_store_ps(bbuf, qb2);
b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
_mm_store_ps(bbuf, qb3);
b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
b3 = v_reduce_sum(vqb2);
b4 = v_reduce_sum(vqb3);
#endif
mismatchMat(0) = b1 * FLT_SCALE;
......@@ -718,8 +566,6 @@ public:
cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType);
winMaskMatBuf.setTo(1);
const float FLT_SCALE = (1.f/(1 << 20)); // 20
int j, cn = I.channels(), cn2 = cn*2;
......@@ -775,7 +621,7 @@ public:
float a = prevPt.x - iprevPt.x;
float b = prevPt.y - iprevPt.y;
const int W_BITS = 14, W_BITS1 = 14;
const int W_BITS = 14;
int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
......@@ -783,124 +629,9 @@ public:
int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
float A11 = 0, A12 = 0, A22 = 0;
#ifdef RLOF_SSE
__m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
__m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128i z = _mm_setzero_si128();
__m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
__m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
__m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
__m128i mmMask4_epi32;
get4BitMask(winSize.width, mmMask4_epi32);
#endif
// extract the patch from the first image, compute covariance matrix of derivatives
int x, y;
for( y = 0; y < winSize.height; y++ )
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 )
{
__m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3],
MaskSet * maskPtr[x+2],
MaskSet * maskPtr[x+1],
MaskSet * maskPtr[x]);
__m128i v00, v01, v10, v11, t0, t1;
v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z);
v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z);
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
_mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
v00 = _mm_loadu_si128((const __m128i*)(dsrc));
v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
v10 = _mm_loadu_si128((const __m128i*)(dsrc1));
v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2));
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
if( x + 4 > winSize.width)
{
v00 = _mm_and_si128(v00, mmMask4_epi32);
}
v00 = _mm_and_si128(v00, wMask);
_mm_storeu_si128((__m128i*)dIptr, v00);
t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
__m128 fy = _mm_cvtepi32_ps(t0);
__m128 fx = _mm_cvtepi32_ps(t1);
qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
}
#else
for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 )
{
if( maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5);
int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 +
dsrc1[cn2+1]*iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
A11 += (float)(ixval*ixval);
A12 += (float)(ixval*iyval);
A22 += (float)(iyval*iyval);
}
#endif
}
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
_mm_store_ps(A11buf, qA11);
_mm_store_ps(A12buf, qA12);
_mm_store_ps(A22buf, qA22);
A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
copyWinBuffers(iw00, iw01, iw10, iw11, winSize, I, derivI, winMaskMat, IWinBuf, derivIWinBuf, A11, A22, A12, iprevPt);
float D = A11*A22 - A12*A12;
float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
......@@ -920,10 +651,6 @@ public:
nextPt += halfWin;
Point2f prevDelta(0,0); //relates to h(t-1)
#ifdef RLOF_SSE
__m128i mmMask0, mmMask1, mmMask;
getWBitMask(winSize.width, mmMask0, mmMask1, mmMask);
#endif
for( j = 0; j < criteria.maxCount; j++ )
{
status[ptidx] = static_cast<uchar>(j);
......@@ -945,12 +672,12 @@ public:
iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
float b1 = 0, b2 = 0;
#ifdef RLOF_SSE
qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
#if CV_SIMD128
v_int16x8 vqw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_float32x4 vqb0 = v_setzero_f32(), vqb1 = v_setzero_f32();
v_int16x8 vmax_val_16 = v_setall_s16(std::numeric_limits<unsigned short>::max());
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
#endif
for( y = 0; y < winSize.height; y++ )
{
......@@ -960,58 +687,38 @@ public:
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
x = 0;
#ifdef RLOF_SSE
#if CV_SIMD128
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 )
{
if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0
&& maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0)
continue;
__m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
__m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
__m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
__m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
__m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
__m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
__m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
__m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
__m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
if( x > winSize.width*cn - 8)
{
Ixy_0 = _mm_and_si128(Ixy_0, mmMask0);
Ixy_1 = _mm_and_si128(Ixy_1, mmMask1);
}
v_int16x8 diff0 = v_reinterpret_as_s16(v_load(Iptr + x)), diff1, diff2;
v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0;
diff0 = diff0 & vmask;
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v10 = _mm_mullo_epi16(Ixy_0, diff0);
v11 = _mm_mulhi_epi16(Ixy_0, diff0);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
// It * Ix It * Iy [4 ... 7]
// for set 1 hi sigma 1
v10 = _mm_mullo_epi16(Ixy_1, diff1);
v11 = _mm_mulhi_epi16(Ixy_1, diff1);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
}
#else
for( ; x < winSize.width*cn; x++, dIptr += 2 )
......@@ -1020,7 +727,7 @@ public:
continue;
int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr1[x]*iw10 + Jptr1[x+cn]*iw11,
W_BITS1-5) - Iptr[x];
W_BITS-5) - Iptr[x];
b1 += (float)(diff*dIptr[0]) * FLT_RESCALE;
b2 += (float)(diff*dIptr[1]) * FLT_RESCALE;
......@@ -1028,11 +735,11 @@ public:
#endif
}
#ifdef RLOF_SSE
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
b1 += bbuf[0] + bbuf[2];
b2 += bbuf[1] + bbuf[3];
v_store_aligned(bbuf, vqb0 + vqb1);
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
#endif
b1 *= FLT_SCALE;
......
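
The same universal-intrinsics pattern (expand the 8-bit pixels, zip horizontal neighbours, fixed-point dot product with the bilinear weights, shift, pack) recurs in every tracker variant in this file. The helper below is a minimal, hedged restatement of that pattern for reference only; the function name interpolateRow8 is hypothetical and it is not part of the patch.

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Bilinear subpixel interpolation of 8 consecutive pixels using 14-bit fixed-point
// weights, as done by the v_dotprod-based loops above and below. vqw0/vqw1 hold the
// interleaved weights (iw00,iw01,...) and (iw10,iw11,...); vdelta is the rounding term.
static inline cv::v_int16x8 interpolateRow8(const uchar* Jptr, const uchar* Jptr1, int cn,
                                            const cv::v_int16x8& vqw0, const cv::v_int16x8& vqw1,
                                            const cv::v_int32x4& vdelta, int W_BITS)
{
    using namespace cv;
    v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr));       // row y,   x .. x+7
    v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + cn));  // row y,   x+1 .. x+8
    v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1));      // row y+1, x .. x+7
    v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + cn)); // row y+1, x+1 .. x+8
    v_int16x8 t00, t01, t10, t11;
    v_zip(v00, v01, t00, t01);                 // interleave horizontal neighbours
    v_zip(v10, v11, t10, t11);
    v_int32x4 t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
    v_int32x4 t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
    t0 = t0 >> (W_BITS - 5);                   // back to the 5-bit fractional scale of IWinBuf
    t1 = t1 >> (W_BITS - 5);
    return v_pack(t0, t1);                     // eight interpolated intensities
}
#endif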
......@@ -126,7 +126,7 @@ public:
float a = prevPt.x - iprevPt.x;
float b = prevPt.y - iprevPt.y;
const int W_BITS = 14, W_BITS1 = 14;
const int W_BITS = 14;
int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
......@@ -137,104 +137,13 @@ public:
float D = 0;
float minEig;
#ifdef RLOF_SSE
__m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
__m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128i z = _mm_setzero_si128();
__m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1));
__m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1));
__m128i mmMask4_epi32;
__m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits<unsigned short>::max());
get4BitMask(winSize.width, mmMask4_epi32);
#endif
// extract the patch from the first image, compute covariance matrix of derivatives
int x, y;
for (y = 0; y < winSize.height; y++)
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2)
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
__m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16);
__m128i v00, v01, v10, v11, t0, t1;
v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z);
v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z);
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
if (x + 4 > winSize.width)
{
t0 = _mm_and_si128(t0, mmMask4_epi32);
}
t0 = _mm_and_si128(t0, mask_0_3_epi16);
_mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0));
v00 = _mm_loadu_si128((const __m128i*)(dsrc));
v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
v10 = _mm_loadu_si128((const __m128i*)(dsrc1));
v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2));
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
if (x + 4 > winSize.width)
{
v00 = _mm_and_si128(v00, mmMask4_epi32);
}
v00 = _mm_and_si128(v00, mask_0_3_epi16);
_mm_storeu_si128((__m128i*)dIptr, v00);
}
#else
for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2)
{
if (maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 +
src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5);
int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 +
dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 +
dsrc1[cn2 + 1] * iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
}
#endif
}
copyWinBuffers(iw00, iw01, iw10, iw11, winSize, I, derivI, winMaskMat, IWinBuf, derivIWinBuf, iprevPt);
cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1);
cv::Point2f backUpNextPt = nextPt;
nextPt += halfWin;
int j;
#ifdef RLOF_SSE
__m128i mmMask0, mmMask1, mmMask;
getWBitMask(winSize.width, mmMask0, mmMask1, mmMask);
#endif
float MEstimatorScale = 1;
int buffIdx = 0;
cv::Point2f prevDelta(0, 0);
......@@ -274,19 +183,18 @@ public:
if (j == 0 )
{
buffIdx = 0;
for (y = 0; y < winSize.height; y++)
for (int y = 0; y < winSize.height; y++)
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
for (; x < winSize.width*cn; x++, dIptr += 2)
for (int x = 0; x < winSize.width*cn; x++, dIptr += 2)
{
if (maskPtr[x] == 0)
continue;
int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x];
int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS - 5) - Iptr[x];
residualMat.at<short>(buffIdx++) = static_cast<short>(diff);
}
}
......@@ -300,171 +208,144 @@ public:
float fParam1 = normSigma1 * 32.f;
fParam0 = normSigma0 * MEstimatorScale;
fParam1 = normSigma1 * MEstimatorScale;
#ifdef RLOF_SSE
qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
__m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
__m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
__m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
float s2Val = std::fabs(normSigma2);
int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f));
__m128i mmParam2_epi16 = _mm_set1_epi16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
__m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift);
__m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2);
__m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2);
__m128 mmOnes = _mm_set1_ps(1.f);
__m128i mmEta = _mm_setzero_si128();
__m128i mmScale = _mm_set1_epi16(static_cast<short>(MEstimatorScale));
#if CV_SIMD128
v_int16x8 vqw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_float32x4 vqb0 = v_setzero_f32(), vqb1 = v_setzero_f32();
v_float32x4 vAxx = v_setzero_f32(), vAxy = v_setzero_f32(), vAyy = v_setzero_f32();
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int16x8 vscale = v_setall_s16(static_cast<short>(MEstimatorScale));
v_int16x8 veta = v_setzero_s16();
v_int16x8 vzero = v_setzero_s16();
v_int16x8 vparam0 = v_setall_s16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
v_int16x8 vparam1 = v_setall_s16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
v_int16x8 vneg_param1 = v_setall_s16(-MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / std::fabs(normSigma2)) / log(2.f));
v_int16x8 vparam2 = v_setall_s16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
v_int16x8 voness = v_setall_s16(1 << s2bitShift);
v_float32x4 vparam2s = v_setall_f32(0.01f * normSigma2);
v_float32x4 vparam2s2 = v_setall_f32(normSigma2 * normSigma2);
v_float32x4 vones = v_setall_f32(1.f);
v_float32x4 vzeros = v_setzero_f32();
v_int16x8 vmax_val_16 = v_setall_s16(std::numeric_limits<unsigned short>::max());
#endif
buffIdx = 0;
for (y = 0; y < winSize.height; y++)
for (int y = 0; y < winSize.height; y++)
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2)
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
__m128i diff0, diff1;
__m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // elements 0 to 7
__m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
__m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
__m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
__m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
__m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
__m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5);
__m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16);
mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
__m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale);
mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1)));
__m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
__m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16);
__m128i bSet2_epi16, bSet1_epi16;
#if CV_SIMD128
for (int x = 0; x <= winSize.width*cn; x += 8, dIptr += 8 * 2)
{
v_int16x8 diff0 = v_reinterpret_as_s16(v_load(Iptr + x)), diff1, diff2;
v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0;
diff0 = diff0 & vmask;
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
// since there is no abs for int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1);
vset2 = vabs_diff < vparam1;
// It > 0 ?
__m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128());
v_int16x8 vdiff_is_pos = diff0 > vzero;
// sigma0 < |It| < sigma1 ?
bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0));
vset1 = vset2 & (vabs_diff > vparam0);
// val = |It| -/+ sigma1
__m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)),
_mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1)));
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
// It == 0 ? |It| > sigma1
mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16);
diff0 = vset2 & diff0;
// It == val ? sigma0 < |It| < sigma1
mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16);
diff0 = v_select(vset1, vtmp_param1, diff0);
__m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3
v_int16x8 tale_ = v_select(vset1, vparam2, voness); // mask for 0 - 3
// diff = diff * sigma2
__m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16);
__m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16);
__m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32
__m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32
mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32);
diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
// Ix * diff / Iy * diff
v10 = _mm_mullo_epi16(Ixy_0, diff0);
v11 = _mm_mulhi_epi16(Ixy_0, diff0);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
// It * Ix It * Iy [4 ... 7]
// for set 1 hi sigma 1
v10 = _mm_mullo_epi16(Ixy_1, diff1);
v11 = _mm_mulhi_epi16(Ixy_1, diff1);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
if (j == 0)
{
__m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16));
__m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16));
__m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16));
__m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16)));
v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1;
v_int32x4 vmask_0, vmask_1;
__m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16));
__m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16));
v_expand(vset1, vset1_0, vset1_1);
v_expand(vset2, vset2_0, vset2_1);
v_expand(vmask, vmask_0, vmask_1);
__m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps);
__m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps);
tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps);
tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps);
v_float32x4 vtale_0 = v_select(v_reinterpret_as_f32(vset1_0), vparam2s2, vones);
v_float32x4 vtale_1 = v_select(v_reinterpret_as_f32(vset1_1), vparam2s2, vones);
vtale_0 = v_select(v_reinterpret_as_f32(vset2_0), vtale_0, vparam2s);
vtale_1 = v_select(v_reinterpret_as_f32(vset2_1), vtale_1, vparam2s);
tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps);
tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps);
vtale_0 = v_select(v_reinterpret_as_f32(vmask_0), vtale_0, vzeros);
vtale_1 = v_select(v_reinterpret_as_f32(vmask_1), vtale_1, vzeros);
t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_0))));
v_expand(v00, t1, t0);
__m128 fy = _mm_cvtepi32_ps(t0);
__m128 fx = _mm_cvtepi32_ps(t1);
v_float32x4 fy = v_cvt_f32(t0);
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
__m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps);
__m128 fytale = _mm_mul_ps(fy, tale_0_3_ps);
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
fy = _mm_cvtepi32_ps(t0);
fx = _mm_cvtepi32_ps(t1);
fy = v_cvt_f32(t0);
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = _mm_mul_ps(fx, tale_4_7_ps);
fytale = _mm_mul_ps(fy, tale_4_7_ps);
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
}
}
#else
for (; x < winSize.width*cn; x++, dIptr += 2)
for (int x = 0; x < winSize.width*cn; x++, dIptr += 2)
{
if (maskPtr[x] == 0)
continue;
int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 +
Jptr1[x] * iw10 + Jptr1[x + cn] * iw11,
W_BITS1 - 5) - Iptr[x];
W_BITS - 5) - Iptr[x];
if (diff > MEstimatorScale)
MEstimatorScale += eta;
......@@ -512,39 +393,25 @@ public:
A12 += (float)(ixval*iyval*tale);
A22 += (float)(iyval*iyval*tale);
}
}
#endif
}
#ifdef RLOF_SSE
short etaValues[8];
_mm_storeu_si128((__m128i*)(etaValues), mmEta);
MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3]
+ etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]);
#if CV_SIMD128
MEstimatorScale += eta * v_reduce_sum(veta);
#endif
if (j == 0)
{
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];//
_mm_store_ps(A11buf, mmAxx);
_mm_store_ps(A12buf, mmAxy);
_mm_store_ps(A22buf, mmAyy);
A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#if CV_SIMD128
A11 = v_reduce_sum(vAxx);
A12 = v_reduce_sum(vAxy);
A22 = v_reduce_sum(vAyy);
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
D = A11 * A22 - A12 * A12;
minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) +
4.f*A12*A12)) / (2 * winArea);
......@@ -562,17 +429,15 @@ public:
}
}
#ifdef RLOF_SSE
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
v_store_aligned(bbuf, vqb0 + vqb1);
b1 += bbuf[0] + bbuf[2];
b2 += bbuf[1] + bbuf[3];
#endif
b1 *= FLT_SCALE;
b2 *= FLT_SCALE;
Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D));
delta.x = (delta.x != delta.x) ? 0 : delta.x;
......@@ -587,13 +452,9 @@ public:
nextPts[ptidx] -= delta * 0.5f;
break;
}
prevDelta = delta;
}
}
}
const Mat* prevImg;
......@@ -672,9 +533,6 @@ public:
void operator()(const cv::Range& range) const CV_OVERRIDE
{
#ifdef DEBUG_INVOKER
printf("rlof::radial");fflush(stdout);
#endif
Point2f halfWin;
cv::Size winSize;
const Mat& I = *prevImg;
......@@ -753,7 +611,7 @@ public:
float a = prevPt.x - iprevPt.x;
float b = prevPt.y - iprevPt.y;
const int W_BITS = 14, W_BITS1 = 14;
const int W_BITS = 14;
int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
......@@ -771,95 +629,7 @@ public:
float dI = 0; // I^2
float D = 0;
#ifdef RLOF_SSE
__m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
__m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128i z = _mm_setzero_si128();
__m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1));
__m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1));
__m128i mmMask4_epi32;
__m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits<unsigned short>::max());
get4BitMask(winSize.width, mmMask4_epi32);
#endif
// extract the patch from the first image, compute covariance matrix of derivatives
int x, y;
for (y = 0; y < winSize.height; y++)
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
x = 0;
#ifdef RLOF_SSE
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2)
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
__m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16);
__m128i v00, v01, v10, v11, t0, t1;
v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z);
v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z);
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
if (x + 4 > winSize.width)
{
t0 = _mm_and_si128(t0, mmMask4_epi32);
}
t0 = _mm_and_si128(t0, mask_0_3_epi16);
_mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0));
v00 = _mm_loadu_si128((const __m128i*)(dsrc));
v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
v10 = _mm_loadu_si128((const __m128i*)(dsrc1));
v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2));
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
if (x + 4 > winSize.width)
{
v00 = _mm_and_si128(v00, mmMask4_epi32);
}
v00 = _mm_and_si128(v00, mask_0_3_epi16);
_mm_storeu_si128((__m128i*)dIptr, v00);
}
#else
for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2)
{
if (winMaskMat.at<uchar>(y, x) == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 +
src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5);
int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 +
dsrc1[0] * iw10 + dsrc1[ cn2] * iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 +
dsrc1[cn2 + 1] * iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
}
#endif
}
copyWinBuffers(iw00, iw01, iw10, iw11, winSize, I, derivI, winMaskMat, IWinBuf, derivIWinBuf, iprevPt);
cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1);
......@@ -871,11 +641,6 @@ public:
cv::Point2f backUpGain = gainVec;
cv::Size _winSize = winSize;
int j;
#ifdef RLOF_SSE
__m128i mmMask0, mmMask1, mmMask;
getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask);
__m128 mmOnes = _mm_set1_ps(1.f);
#endif
float MEstimatorScale = 1;
int buffIdx = 0;
float minEigValue;
......@@ -929,21 +694,20 @@ public:
if (j == 0 )
{
buffIdx = 0;
for (y = 0; y < winSize.height; y++)
for (int y = 0; y < winSize.height; y++)
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
x = 0;
for (; x < winSize.width*cn; x++, dIptr += 2)
for (int x = 0; x < winSize.width*cn; x++, dIptr += 2)
{
if (dIptr[0] == 0 && dIptr[1] == 0)
continue;
int diff = static_cast<int>(CV_DESCALE( Jptr[x] * iw00 +
Jptr[x + cn] * iw01 +
Jptr1[x] * iw10 +
Jptr1[x + cn] * iw11, W_BITS1 - 5)
Jptr1[x + cn] * iw11, W_BITS - 5)
- Iptr[x] + Iptr[x] * gainVec.x + gainVec.y);
residualMat.at<short>(buffIdx++) = static_cast<short>(diff);
}
......@@ -959,248 +723,205 @@ public:
fParam0 = normSigma0 * MEstimatorScale;
fParam1 = normSigma1 * MEstimatorScale;
#ifdef RLOF_SSE
qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps();
__m128 qb3 = _mm_setzero_ps();
__m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps();
__m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps();
__m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps();
__m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
__m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
__m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
float s2Val = std::fabs(normSigma2);
int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f));
__m128i mmParam2_epi16 = _mm_set1_epi16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
__m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift);
__m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2);
__m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2);
#if CV_SIMD128
v_int16x8 vqw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_float32x4 vqb0 = v_setzero_f32(), vqb1 = v_setzero_f32(), vqb2 = v_setzero_f32(), vqb3 = v_setzero_f32();
v_float32x4 vsumW1 = v_setzero_f32(), vsumW2 = v_setzero_f32(), vsumW = v_setzero_f32();
v_float32x4 vsumIy = v_setzero_f32(), vsumIx = v_setzero_f32(), vsumI = v_setzero_f32(), vsumDI = v_setzero_f32();
v_float32x4 vAxx = v_setzero_f32(), vAxy = v_setzero_f32(), vAyy = v_setzero_f32();
int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / std::fabs(normSigma2)) / log(2.f));
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int16x8 vzero = v_setzero_s16();
v_int16x8 voness = v_setall_s16(1 << s2bitShift);
v_float32x4 vones = v_setall_f32(1.f);
v_float32x4 vzeros = v_setzero_f32();
v_int16x8 vmax_val_16 = v_setall_s16(std::numeric_limits<unsigned short>::max());
v_int16x8 vscale = v_setall_s16(static_cast<short>(MEstimatorScale));
v_int16x8 veta = v_setzero_s16();
v_int16x8 vparam0 = v_setall_s16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
v_int16x8 vparam1 = v_setall_s16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
v_int16x8 vneg_param1 = v_setall_s16(-MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
v_int16x8 vparam2 = v_setall_s16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
v_float32x4 vparam2s = v_setall_f32(0.01f * normSigma2);
v_float32x4 vparam2s2 = v_setall_f32(normSigma2 * normSigma2);
float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x;
int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f));
__m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
__m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.y));
__m128i mmEta = _mm_setzero_si128();
__m128i mmScale = _mm_set1_epi16(static_cast<short>(MEstimatorScale));
v_int16x8 vgain_value = v_setall_s16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
v_int16x8 vconst_value = v_setall_s16(static_cast<short>(gainVec.y));
#endif
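The per-pixel re-weighting in the loop below implements the shrinked Hampel norm configured by fParam0, fParam1 and normSigma2 above: residuals below sigma0 pass through unchanged, residuals between sigma0 and sigma1 are shifted towards zero and damped by normSigma2, and residuals above sigma1 are rejected. A minimal scalar sketch of that re-weighting, in floating point and ignoring the fixed-point shifts used by the vectorized code:
#include <cmath>
// Scalar model of the shrinked-Hampel re-weighting applied to a residual It.
// fParam0 / fParam1 are normSigma0 / normSigma1 scaled by the current MEstimatorScale.
static float shrinkedHampel(float It, float fParam0, float fParam1, float normSigma2)
{
    float absIt = std::fabs(It);
    if (absIt >= fParam1)
        return 0.f;                                    // outlier: rejected
    if (absIt > fParam0)                               // transition zone
        return normSigma2 * (It > 0 ? It - fParam1     // shrink towards zero, damp by sigma2
                                    : It + fParam1);
    return It;                                         // inlier: kept as is
}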
buffIdx = 0;
for (y = 0; y < _winSize.height; y++)
for (int y = 0; y < _winSize.height; y++)
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2)
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
__m128i diff0, diff1;
__m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // from element 0 to 7
__m128i v00 = _mm_unpacklo_epi8(
_mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0
__m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
__m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
__m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
__m128i t0 = _mm_add_epi32
(_mm_madd_epi16(
_mm_unpacklo_epi16(v00, v01), // interleave J[x+i] and J[x+i+cn]: J0 J0', J1 J1', J2 J2', J3 J3'
qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
__m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5);
__m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16);
__m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16);
__m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift);
__m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift);
__m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32);
// diff = J - I + I*gain.x + gain.y
__m128i mmDiff_epi16 = _mm_add_epi16(
_mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I
_mm_add_epi16(Igain_epi16, mmConstValue_epi16));
mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
__m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale);
mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1)));
__m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
__m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16);
__m128i bSet2_epi16, bSet1_epi16;
#if CV_SIMD128
for (int x = 0; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2)
{
v_int16x8 vI = v_reinterpret_as_s16(v_load(Iptr + x)), diff0, diff1, diff2;
v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
// diff = J - I
diff0 = v_pack(t0, t1) - vI;
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
diff0 = diff0 & vmask;
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
// v_abs on a signed vector returns an unsigned one, so reinterpret it back to s16
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1);
vset2 = vabs_diff < vparam1;
// It > 0 ?
__m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128());
v_int16x8 vdiff_is_pos = diff0 > vzero;
// sigma0 < |It| < sigma1 ?
bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0));
vset1 = vset2 & (vabs_diff > vparam0);
// val = |It| -/+ sigma1
__m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)),
_mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1)));
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
// It == 0 ? |It| > sigma1
mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16);
diff0 = vset2 & diff0;
// It == val ? sigma0 < |It| < sigma1
mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16);
diff0 = v_select(vset1, vtmp_param1, diff0);
__m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3
v_int16x8 tale_ = v_select(vset1, vparam2, voness); // mask for 0 - 3
// diff = diff * sigma2
lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16);
hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16);
__m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32
__m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32
mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // pack back, since the result should fit into 16 bit
diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
// Ix * diff / Iy * diff
v10 = _mm_mullo_epi16(Ixy_0, diff0);
v11 = _mm_mulhi_epi16(Ixy_0, diff0);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
// It * Ix It * Iy [4 ... 7]
// for set 1 hi sigma 1
v10 = _mm_mullo_epi16(Ixy_1, diff1);
v11 = _mm_mulhi_epi16(Ixy_1, diff1);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
// diff * J [0 ... 7]
// for set 1 sigma 1
// b3 += diff * Iptr[x]
v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16);
v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00));
qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10));
qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32));
qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32));
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift;
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift;
diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0);
vqb2 += v_cvt_f32(diff0_1 * vI1);
vqb3 += v_cvt_f32(diff0_0);
vqb3 += v_cvt_f32(diff0_1);
if (j == 0)
{
__m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16));
__m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16));
__m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16));
__m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16)));
v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1;
v_int32x4 vmask_0, vmask_1;
__m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16));
__m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16));
v_expand(vset1, vset1_0, vset1_1);
v_expand(vset2, vset2_0, vset2_1);
v_expand(vmask, vmask_0, vmask_1);
__m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps);
__m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps);
tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps);
tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps);
v_float32x4 vtale_0 = v_select(v_reinterpret_as_f32(vset1_0), vparam2s2, vones);
v_float32x4 vtale_1 = v_select(v_reinterpret_as_f32(vset1_1), vparam2s2, vones);
vtale_0 = v_select(v_reinterpret_as_f32(vset2_0), vtale_0, vparam2s);
vtale_1 = v_select(v_reinterpret_as_f32(vset2_1), vtale_1, vparam2s);
tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps);
tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps);
t0 = _mm_srai_epi32(Ixy_0, 16);
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16);
vtale_0 = v_select(v_reinterpret_as_f32(vmask_0), vtale_0, vzeros);
vtale_1 = v_select(v_reinterpret_as_f32(vmask_1), vtale_1, vzeros);
__m128 fy = _mm_cvtepi32_ps(t0);
__m128 fx = _mm_cvtepi32_ps(t1);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_0))));
v_expand(v00, t1, t0);
// 0 ... 3
__m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16));
v_float32x4 vI_ps = v_cvt_f32(vI0);
v_float32x4 fy = v_cvt_f32(t0);
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
__m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps);
__m128 fytale = _mm_mul_ps(fy, tale_0_3_ps);
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx and sumIy
mmSumIx = _mm_add_ps(mmSumIx, fxtale);
mmSumIy = _mm_add_ps(mmSumIy, fytale);
vsumIx += fxtale;
vsumIy += fytale;
mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale));
mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale));
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
// sumI
__m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps);
mmSumI = _mm_add_ps(mmSumI, I_tale_ps);
v_float32x4 vI_tale = vI_ps * vtale_0;
vsumI += vI_tale;
// sumW
mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps);
vsumW += vtale_0;
// sumDI
mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps));
t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3
vsumDI += vI_ps * vI_tale;
fy = _mm_cvtepi32_ps(t0);
fx = _mm_cvtepi32_ps(t1);
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
vI_ps = v_cvt_f32(vI1);
// 4 ... 7
I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16));
fy = v_cvt_f32(t0);
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = _mm_mul_ps(fx, tale_4_7_ps);
fytale = _mm_mul_ps(fy, tale_4_7_ps);
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx and sumIy
mmSumIx = _mm_add_ps(mmSumIx, fxtale);
mmSumIy = _mm_add_ps(mmSumIy, fytale);
vsumIx += fxtale;
vsumIy += fytale;
mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale));
mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale));
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
// sumI
I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps);
mmSumI = _mm_add_ps(mmSumI, I_tale_ps);
vI_tale = vI_ps * vtale_1;
vsumI += vI_tale;
mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps);
// sumW
vsumW += vtale_1;
mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps));
// sumDI
vsumDI += vI_ps * vI_tale;
}
}
#else
for (; x < _winSize.width*cn; x++, dIptr += 2)
for (int x = 0; x < _winSize.width*cn; x++, dIptr += 2)
{
if (maskPtr[x] == 0)
continue;
int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11,
W_BITS1 - 5);
W_BITS - 5);
short ixval = static_cast<short>(dIptr[0]);
short iyval = static_cast<short>(dIptr[1]);
int diff = static_cast<int>(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y);
......@@ -1264,30 +985,22 @@ public:
}
#endif
}
#ifdef RLOF_SSE
short etaValues[8];
_mm_storeu_si128((__m128i*)(etaValues), mmEta);
MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3]
+ etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]);
float CV_DECL_ALIGNED(32) wbuf[4];
#if CV_SIMD128
MEstimatorScale += eta * v_reduce_sum(veta);
#endif
if (j == 0)
{
#ifdef RLOF_SSE
_mm_store_ps(wbuf, mmSumW1);
w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumW2);
w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumDI);
dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumI);
sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumIx);
sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumIy);
sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumW);
sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
#if CV_SIMD128
w1 = v_reduce_sum(vsumW1);
w2 = v_reduce_sum(vsumW2);
dI = v_reduce_sum(vsumDI);
sumI = v_reduce_sum(vsumI);
sumIx = v_reduce_sum(vsumIx);
sumIy = v_reduce_sum(vsumIy);
sumW = v_reduce_sum(vsumW);
A11 = v_reduce_sum(vAxx);
A12 = v_reduce_sum(vAxy);
A22 = v_reduce_sum(vAyy);
#endif
sumIx *= -FLT_SCALE;
sumIy *= -FLT_SCALE;
......@@ -1296,30 +1009,19 @@ public:
w1 *= -FLT_SCALE;
w2 *= -FLT_SCALE;
dI *= FLT_SCALE;
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
_mm_store_ps(A11buf, mmAxx);
_mm_store_ps(A12buf, mmAxy);
_mm_store_ps(A22buf, mmAyy);
A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#if CV_SIMD128
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
}
#ifdef RLOF_SSE
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
v_store_aligned(bbuf, vqb0 + vqb1);
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
_mm_store_ps(bbuf, qb2);
b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
_mm_store_ps(bbuf, qb3);
b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
b3 = v_reduce_sum(vqb2);
b4 = v_reduce_sum(vqb3);
#endif
mismatchMat(0) = b1 * FLT_SCALE;
mismatchMat(1) = b2 * FLT_SCALE;
......
......@@ -4,62 +4,306 @@
#ifndef _RLOF_INVOKERBASE_HPP_
#define _RLOF_INVOKERBASE_HPP_
#if CV_SSE4_1
#define RLOF_SSE
#endif
//#define DEBUG_INVOKER
#ifndef CV_DESCALE
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
#endif
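CV_DESCALE(x, n) is a rounding right shift: half of 2^n is added before shifting, so fixed-point interpolation results are rounded to nearest instead of truncated. A minimal standalone sketch of its behaviour (the weight values are illustrative, not taken from this diff):
#include <cassert>
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
int main()
{
    const int W_BITS = 14;                 // bilinear weights are stored in Q14 fixed point
    int iw00 = 1 << W_BITS;                // weight 1.0 for the top-left sample
    int iw01 = 0, iw10 = 0, iw11 = 0;      // remaining bilinear weights are 0
    int src = 100;                         // 8-bit pixel value
    // interpolated value keeps 5 fractional bits (shift by W_BITS - 5), as in the patch code
    int ival = CV_DESCALE(src * iw00 + src * iw01 + src * iw10 + src * iw11, W_BITS - 5);
    assert(ival == (100 << 5));            // 100 * 32, i.e. the pixel value scaled to Q5
    return 0;
}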
#define FLT_RESCALE 1
#include "rlof_localflow.h"
#include <unordered_map>
#include "opencv2/core/hal/intrin.hpp"
using namespace std;
using namespace cv;
namespace cv {
namespace optflow {
typedef short deriv_type;
#ifdef RLOF_SSE
static inline void get4BitMask(const int & width, __m128i & mask)
{
int noBits = width - static_cast<int>(floor(width / 4.f) * 4.f);
unsigned int val[4];
for (int n = 0; n < 4; n++)
{
val[n] = (noBits > n) ? (std::numeric_limits<unsigned int>::max()) : 0;
}
mask = _mm_set_epi32(val[3], val[2], val[1], val[0]);
#if CV_SIMD128
}
static inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2)
static inline void getVBitMask(const int & width, v_int32x4 & mask0, v_int32x4 & mask1)
{
int noBits = width - static_cast<int>(floor(width / 8.f) * 8.f);
unsigned short val[8];
unsigned int val[8];
for (int n = 0; n < 8; n++)
{
val[n] = (noBits > n) ? (0xffff) : 0;
val[n] = (noBits > n) ? (std::numeric_limits<unsigned int>::max()) : 0;
}
t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]);
t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]);
t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]);
mask0 = v_int32x4(val[0], val[1], val[2], val[3]);
mask1 = v_int32x4(val[4], val[5], val[6], val[7]);
}
#endif
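getVBitMask builds two 4-lane masks that keep only the lanes belonging to the last partial 8-element block of a row (width mod 8 valid lanes), so lanes outside the window are zeroed before accumulation. A minimal scalar sketch of the same masking logic, using plain uint32_t lanes instead of v_int32x4 (width % 8 is equivalent to the floor-based computation for positive widths):
#include <cstdint>
#include <cstdio>
// Scalar model of getVBitMask: lane n is kept iff n < (width % 8).
static void getBorderMask(int width, uint32_t mask0[4], uint32_t mask1[4])
{
    int noBits = width % 8;                     // number of valid lanes in the last partial block
    for (int n = 0; n < 8; n++)
    {
        uint32_t v = (noBits > n) ? 0xffffffffu : 0u;
        if (n < 4) mask0[n] = v; else mask1[n - 4] = v;
    }
}
int main()
{
    uint32_t m0[4], m1[4];
    getBorderMask(21, m0, m1);                  // e.g. a 21-pixel-wide support window -> 5 valid tail lanes
    for (int n = 0; n < 4; n++) std::printf("%08x ", (unsigned)m0[n]);
    for (int n = 0; n < 4; n++) std::printf("%08x ", (unsigned)m1[n]);
    std::printf("\n");                          // five times ffffffff followed by three times 00000000
    return 0;
}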
typedef uchar tMaskType;
#define tCVMaskType CV_8UC1
#define MaskSet 0xffffffff
static
void getLocalPatch(
static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
Size winSize,
const Mat & I, const Mat & derivI, const Mat & winMaskMat,
Mat & IWinBuf, Mat & derivIWinBuf,
Point iprevPt)
{
int cn = I.channels(), cn2 = cn * 2;
const int W_BITS = 14;
#if CV_SIMD128
v_int16x8 vqw0((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_int32x4 vdelta_d = v_setall_s32(1 << (W_BITS - 1));
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int32x4 vmax_val_32 = v_setall_s32(std::numeric_limits<unsigned int>::max());
v_int32x4 vmask_border_0, vmask_border_1;
getVBitMask(winSize.width, vmask_border_0, vmask_border_1);
#endif
// extract the patch from the first image
int x, y;
for (y = 0; y < winSize.height; y++)
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#if CV_SIMD128
for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32;
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32;
if (x + 4 > winSize.width)
{
vmask0 = vmask0 & vmask_border_0;
}
if (x + 8 > winSize.width)
{
vmask1 = vmask1 & vmask_border_1;
}
v_int32x4 t0, t1;
v_int16x8 v00, v01, v10, v11, t00, t01, t10, t11;
v00 = v_reinterpret_as_s16(v_load_expand(src + x));
v01 = v_reinterpret_as_s16(v_load_expand(src + x + cn));
v10 = v_reinterpret_as_s16(v_load_expand(src1 + x));
v11 = v_reinterpret_as_s16(v_load_expand(src1 + x + cn));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5) & vmask0;
t1 = t1 >> (W_BITS - 5) & vmask1;
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
v01 = v_reinterpret_as_s16(v_load(dsrc + cn2));
v10 = v_reinterpret_as_s16(v_load(dsrc1));
v11 = v_reinterpret_as_s16(v_load(dsrc1 + cn2));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0);
v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2));
v01 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2 + cn2));
v10 = v_reinterpret_as_s16(v_load(dsrc1 + 4 * 2));
v11 = v_reinterpret_as_s16(v_load(dsrc1 + 4 * 2 + cn2));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1);
v_store(dIptr + 4 * 2, v00);
}
#else
for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2)
{
if (maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 +
src1[x] * iw10 + src1[x + cn] * iw11, W_BITS - 5);
int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 +
dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS);
int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 +
dsrc1[cn2 + 1] * iw11, W_BITS);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
}
#endif
}
}
static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
Size winSize,
const Mat & I, const Mat & derivI, const Mat & winMaskMat,
Mat & IWinBuf, Mat & derivIWinBuf,
float & A11, float & A22, float & A12,
Point iprevPt)
{
const float FLT_SCALE = (1.f / (1 << 20));
int cn = I.channels(), cn2 = cn * 2;
const int W_BITS = 14;
#if CV_SIMD128
v_int16x8 vqw0((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_int32x4 vdelta_d = v_setall_s32(1 << (W_BITS - 1));
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int32x4 vmax_val_32 = v_setall_s32(std::numeric_limits<unsigned int>::max());
v_int32x4 vmask_border0, vmask_border1;
v_float32x4 vA11 = v_setzero_f32(), vA22 = v_setzero_f32(), vA12 = v_setzero_f32();
getVBitMask(winSize.width, vmask_border0, vmask_border1);
#endif
// extract the patch from the first image and compute the covariance matrix of its derivatives
for (int y = 0; y < winSize.height; y++)
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
#if CV_SIMD128
for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32;
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32;
if (x + 4 > winSize.width)
{
vmask0 = vmask0 & vmask_border0;
}
if (x + 8 > winSize.width)
{
vmask1 = vmask1 & vmask_border1;
}
v_int32x4 t0, t1;
v_int16x8 v00, v01, v10, v11, t00, t01, t10, t11;
v00 = v_reinterpret_as_s16(v_load_expand(src + x));
v01 = v_reinterpret_as_s16(v_load_expand(src + x + cn));
v10 = v_reinterpret_as_s16(v_load_expand(src1 + x));
v11 = v_reinterpret_as_s16(v_load_expand(src1 + x + cn));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = t0 & vmask0;
t1 = t1 & vmask1;
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
v01 = v_reinterpret_as_s16(v_load(dsrc + cn2));
v10 = v_reinterpret_as_s16(v_load(dsrc1));
v11 = v_reinterpret_as_s16(v_load(dsrc1 + cn2));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0);
v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
v_expand(v00, t1, t0);
v_float32x4 fy = v_cvt_f32(t0);
v_float32x4 fx = v_cvt_f32(t1);
vA22 = v_muladd(fy, fy, vA22);
vA12 = v_muladd(fx, fy, vA12);
vA11 = v_muladd(fx, fx, vA11);
v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2));
v01 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2 + cn2));
v10 = v_reinterpret_as_s16(v_load(dsrc1 + 4 * 2));
v11 = v_reinterpret_as_s16(v_load(dsrc1 + 4 * 2 + cn2));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1);
v_store(dIptr + 4 * 2, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
v_expand(v00, t1, t0);
fy = v_cvt_f32(t0);
fx = v_cvt_f32(t1);
vA22 = v_muladd(fy, fy, vA22);
vA12 = v_muladd(fx, fy, vA12);
vA11 = v_muladd(fx, fx, vA11);
}
#else
for (int x = 0; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2)
{
if (maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 +
src1[x] * iw10 + src1[x + cn] * iw11, W_BITS - 5);
int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 +
dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS);
int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 +
dsrc1[cn2 + 1] * iw11, W_BITS);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
A11 += (float)(ixval*ixval);
A12 += (float)(ixval*iyval);
A22 += (float)(iyval*iyval);
}
#endif
}
#if CV_SIMD128
A11 += v_reduce_sum(vA11);
A12 += v_reduce_sum(vA12);
A22 += v_reduce_sum(vA22);
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
}
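This overload of copyWinBuffers also returns the FLT_SCALE-normalized Gram matrix A11, A12, A22 of the patch gradients. Such a matrix is commonly checked for conditioning through its smaller eigenvalue before the 2x2 system is inverted; a minimal sketch of that check using the closed-form eigenvalue of a symmetric 2x2 matrix (the sample values, the 9x9 window and the threshold are placeholders, not taken from this diff):
#include <cmath>
#include <cstdio>
// Smaller eigenvalue of the symmetric 2x2 Gram matrix [[A11, A12], [A12, A22]].
static float minEigenValue(float A11, float A12, float A22)
{
    return (A22 + A11 - std::sqrt((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / 2.f;
}
int main()
{
    float A11 = 4.f, A12 = 1.f, A22 = 2.f;      // placeholder gradient sums
    float winArea = 9.f * 9.f;                  // e.g. a 9x9 support window
    float minEig = minEigenValue(A11, A12, A22) / winArea;
    std::printf("normalized min eigenvalue: %f\n", minEig);
    // a tracker would typically reject the point if minEig falls below a minimum-eigenvalue threshold
    return 0;
}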
static void getLocalPatch(
const cv::Mat & src,
const cv::Point2i & prevPoint, // feature points
cv::Mat & winPointMask,
......
......@@ -469,6 +469,8 @@ void calcLocalOpticalFlowCore(
if (isrobust(param) == false)
{
if (param.useIlluminationModel)
{
if (param.solverType == SolverType::ST_STANDART)
{
cv::parallel_for_(cv::Range(0, npoints),
plk::radial::TrackerInvoker(
......@@ -482,6 +484,20 @@ void calcLocalOpticalFlowCore(
param.crossSegmentationThreshold));
}
else
{
cv::parallel_for_(cv::Range(0, npoints),
beplk::radial::TrackerInvoker(
prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr,
prevPts, nextPts, &status[0], &err[0], &gainPts[0],
level, maxLevel, winSizes,
param.maxIteration,
param.useInitialFlow,
param.supportRegionType,
param.crossSegmentationThreshold,
param.minEigenValue));
}
}
else
{
if (param.solverType == SolverType::ST_STANDART)
{
......
......@@ -6,6 +6,7 @@
#include "rlof/geo_interpolation.hpp"
#include "opencv2/ximgproc.hpp"
namespace cv {
namespace optflow {
......@@ -14,6 +15,19 @@ Ptr<RLOFOpticalFlowParameter> RLOFOpticalFlowParameter::create()
return Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
}
void RLOFOpticalFlowParameter::setUseMEstimator(bool val)
{
if (val)
{
normSigma0 = 3.2f;
normSigma1 = 7.f;
}
else
{
normSigma0 = std::numeric_limits<float>::max();
normSigma1 = std::numeric_limits<float>::max();
}
}
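A brief usage sketch of the new switch, assuming the opencv2/optflow/rlofflow.hpp header location; only create(), setUseMEstimator() and setRLOFOpticalFlowParameter() are taken from this change, the rest of the setup is illustrative:
#include <opencv2/optflow/rlofflow.hpp>   // assumed header location for the RLOF classes
using namespace cv;
using namespace cv::optflow;
int main()
{
    Ptr<RLOFOpticalFlowParameter> param = RLOFOpticalFlowParameter::create();
    param->setUseMEstimator(true);    // robust mode: normSigma0 / normSigma1 = 3.2 / 7.0
    // param->setUseMEstimator(false); // fast mode: both sigmas = FLT_MAX -> least-square estimator
    Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create();
    algo->setRLOFOpticalFlowParameter(param);
    // algo->calc(prevFrame, currFrame, flow); // two equally sized 8-bit RGB frames
    return 0;
}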
void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;}
SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;}
......@@ -198,7 +212,7 @@ public:
gd->setLambda(lambda);
gd->setFGSLambda(fgs_lambda);
gd->setFGSSigma(fgs_sigma);
gd->setUsePostProcessing(false);
gd->setUsePostProcessing(use_post_proc);
gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow);
}
else if (interp_type == InterpolationType::INTERP_RIC)
......@@ -209,7 +223,7 @@ public:
gd->setFGSSigma(fgs_sigma);
gd->setSuperpixelSize(sp_size);
gd->setSuperpixelMode(slic_type);
gd->setUseGlobalSmootherFilter(false);
gd->setUseGlobalSmootherFilter(use_post_proc);
gd->setUseVariationalRefinement(false);
gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow);
}
......@@ -225,6 +239,10 @@ public:
cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20);
cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20);
cv::merge(vecMats2, dense_flow);
if (use_post_proc)
{
ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma);
}
}
if (use_variational_refinement)
{
......@@ -235,10 +253,6 @@ public:
variationalrefine->setOmega(1.9f);
variationalrefine->calc(prevGrey, currGrey, flow);
}
if (use_post_proc)
{
ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma);
}
}
virtual void collectGarbage() CV_OVERRIDE
......
......@@ -196,6 +196,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy)
param->supportRegionType = SR_CROSS;
param->useIlluminationModel = true;
param->solverType = ST_BILINEAR;
param->setUseMEstimator(true);
algo->setRLOFOpticalFlowParameter(param);
algo->calc(frame1, frame2, prevPts, currPts, status, err);
EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.3f);
......@@ -216,8 +217,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy)
algo->calc(frame1, frame2, prevPts, currPts, status, err);
EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f);
param->normSigma0 = numeric_limits<float>::max();
param->normSigma1 = numeric_limits<float>::max();
param->setUseMEstimator(false);
param->useIlluminationModel = true;
param->solverType = ST_BILINEAR;
......@@ -250,6 +250,7 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy)
Mat flow;
Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create();
Ptr<RLOFOpticalFlowParameter> param = Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
param->setUseMEstimator(true);
param->supportRegionType = SR_CROSS;
param->solverType = ST_BILINEAR;
algo->setRLOFOpticalFlowParameter(param);
......