Commit 30f718d1 authored by Maksim Shabunin's avatar Maksim Shabunin

Merge pull request #708 from sbokov:variational_refinement

parents 51a4f6e4 b96a5f59
......@@ -44,3 +44,11 @@
journal={arXiv preprint arXiv:1603.03590},
year={2016}
}
@inproceedings{Brox2004,
title={High accuracy optical flow estimation based on a theory for warping},
author={Brox, Thomas and Bruhn, Andr{\'e}s and Papenberg, Nils and Weickert, Joachim},
booktitle={European Conference on Computer Vision (ECCV)},
pages={25--36},
year={2004}
}
......@@ -154,6 +154,64 @@ to the flow in the horizontal direction (u), second - vertical (v).
*/
CV_EXPORTS_W bool writeOpticalFlow( const String& path, InputArray flow );
/** @brief Variational optical flow refinement
This class implements variational refinement of the input flow field, i.e.
it uses input flow to initialize the minimization of the following functional:
\f$E(U) = \int_{\Omega} \delta \Psi(E_I) + \gamma \Psi(E_G) + \alpha \Psi(E_S) \f$,
where \f$E_I,E_G,E_S\f$ are color constancy, gradient constancy and smoothness terms
respectively. \f$\Psi(s^2)=\sqrt{s^2+\epsilon^2}\f$ is a robust penalizer to limit the
influence of outliers. A complete formulation and a description of the minimization
procedure can be found in @cite Brox2004
*/
class CV_EXPORTS_W VariationalRefinement : public DenseOpticalFlow
{
public:
/** @brief calc function overload to handle separate horizontal (u) and vertical (v) flow components
(to avoid extra splits/merges). The input values of flow_u and flow_v are used to initialize the
minimization and are updated in place. */
CV_WRAP virtual void calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v) = 0;
/** @brief Number of outer (fixed-point) iterations in the minimization procedure.
@see setFixedPointIterations */
CV_WRAP virtual int getFixedPointIterations() const = 0;
/** @copybrief getFixedPointIterations @see getFixedPointIterations */
CV_WRAP virtual void setFixedPointIterations(int val) = 0;
/** @brief Number of inner successive over-relaxation (SOR) iterations
in the minimization procedure to solve the respective linear system.
@see setSorIterations */
CV_WRAP virtual int getSorIterations() const = 0;
/** @copybrief getSorIterations @see getSorIterations */
CV_WRAP virtual void setSorIterations(int val) = 0;
/** @brief Relaxation factor in SOR (successive over-relaxation)
@see setOmega */
CV_WRAP virtual float getOmega() const = 0;
/** @copybrief getOmega @see getOmega */
CV_WRAP virtual void setOmega(float val) = 0;
/** @brief Weight of the smoothness term (\f$\alpha\f$ in the functional above)
@see setAlpha */
CV_WRAP virtual float getAlpha() const = 0;
/** @copybrief getAlpha @see getAlpha */
CV_WRAP virtual void setAlpha(float val) = 0;
/** @brief Weight of the color constancy term (\f$\delta\f$ in the functional above)
@see setDelta */
CV_WRAP virtual float getDelta() const = 0;
/** @copybrief getDelta @see getDelta */
CV_WRAP virtual void setDelta(float val) = 0;
/** @brief Weight of the gradient constancy term (\f$\gamma\f$ in the functional above)
@see setGamma */
CV_WRAP virtual float getGamma() const = 0;
/** @copybrief getGamma @see getGamma */
CV_WRAP virtual void setGamma(float val) = 0;
};
/** @brief Creates an instance of VariationalRefinement
*/
CV_EXPORTS_W Ptr<VariationalRefinement> createVariationalFlowRefinement();
/** @brief DeepFlow optical flow algorithm implementation.
......@@ -194,40 +252,80 @@ CV_EXPORTS_W Ptr<DenseOpticalFlow> createOptFlow_SparseToDense();
/** @brief DIS optical flow algorithm.
This class implements the Dense Inverse Search (DIS) optical flow algorithm. More
details about the algorithm can be found at @cite Kroeger2016 .
details about the algorithm can be found at @cite Kroeger2016 . Includes three presets with preselected
parameters to provide reasonable trade-off between speed and quality. However, even the slowest preset is
still relatively fast, use DeepFlow if you need better quality and don't care about speed.
*/
class CV_EXPORTS_W DISOpticalFlow : public DenseOpticalFlow
{
public:
/** @brief Finest level of the gaussian pyramid on which the flow is computed (zero level
corresponds to the original image resolution).The final flow is obtained by bilinear upscaling.
enum
{
PRESET_ULTRAFAST = 0,
PRESET_FAST = 1,
PRESET_MEDIUM = 2
};
/** @brief Finest level of the Gaussian pyramid on which the flow is computed (zero level
corresponds to the original image resolution). The final flow is obtained by bilinear upscaling.
@see setFinestScale */
virtual int getFinestScale() const = 0;
CV_WRAP virtual int getFinestScale() const = 0;
/** @copybrief getFinestScale @see getFinestScale */
virtual void setFinestScale(int val) = 0;
CV_WRAP virtual void setFinestScale(int val) = 0;
/** @brief Size of an image patch for matching (in pixels)
/** @brief Size of an image patch for matching (in pixels). Normally, default 8x8 patches work well
enough in most cases.
@see setPatchSize */
virtual int getPatchSize() const = 0;
CV_WRAP virtual int getPatchSize() const = 0;
/** @copybrief getPatchSize @see getPatchSize */
virtual void setPatchSize(int val) = 0;
CV_WRAP virtual void setPatchSize(int val) = 0;
/** @brief Stride between neighbor patches. Must be less than patch size.
/** @brief Stride between neighbor patches. Must be less than patch size. Lower values correspond
to higher flow quality.
@see setPatchStride */
virtual int getPatchStride() const = 0;
CV_WRAP virtual int getPatchStride() const = 0;
/** @copybrief getPatchStride @see getPatchStride */
virtual void setPatchStride(int val) = 0;
CV_WRAP virtual void setPatchStride(int val) = 0;
/** @brief number of gradient descent iterations in the patch inverse search stage
/** @brief Maximum number of gradient descent iterations in the patch inverse search stage. Higher values
may improve quality in some cases.
@see setGradientDescentIterations */
virtual int getGradientDescentIterations() const = 0;
CV_WRAP virtual int getGradientDescentIterations() const = 0;
/** @copybrief getGradientDescentIterations @see getGradientDescentIterations */
virtual void setGradientDescentIterations(int val) = 0;
CV_WRAP virtual void setGradientDescentIterations(int val) = 0;
/** @brief Number of fixed point iterations of variational refinement per scale. Set to zero to
disable variational refinement completely. Higher values will typically result in more smooth and
high-quality flow.
@see setGradientDescentIterations */
CV_WRAP virtual int getVariationalRefinementIterations() const = 0;
/** @copybrief getGradientDescentIterations @see getGradientDescentIterations */
CV_WRAP virtual void setVariationalRefinementIterations(int val) = 0;
/** @brief Whether to use mean-normalization of patches when computing patch distance. It is turned on
by default as it typically provides a noticeable quality boost because of increased robustness to
illumination variations. Turn it off if you are certain that your sequence doesn't contain any changes
in illumination.
@see setUseMeanNormalization */
CV_WRAP virtual bool getUseMeanNormalization() const = 0;
/** @copybrief getUseMeanNormalization @see getUseMeanNormalization */
CV_WRAP virtual void setUseMeanNormalization(bool val) = 0;
/** @brief Whether to use spatial propagation of good optical flow vectors. This option is turned on by
default, as it tends to work better on average and can sometimes help recover from major errors
introduced by the coarse-to-fine scheme employed by the DIS optical flow algorithm. Turning this
option off can make the output flow field a bit smoother, however.
@see setUseSpatialPropagation */
CV_WRAP virtual bool getUseSpatialPropagation() const = 0;
/** @copybrief getUseSpatialPropagation @see getUseSpatialPropagation */
CV_WRAP virtual void setUseSpatialPropagation(bool val) = 0;
};
/** @brief Creates an instance of DISOpticalFlow
@param preset one of PRESET_ULTRAFAST, PRESET_FAST and PRESET_MEDIUM
*/
CV_EXPORTS_W Ptr<DISOpticalFlow> createOptFlow_DIS();
CV_EXPORTS_W Ptr<DISOpticalFlow> createOptFlow_DIS(int preset = DISOpticalFlow::PRESET_FAST);
//! @}
......
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3 - clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met :
*
* *Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort(including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "perf_precomp.hpp"
using std::tr1::tuple;
using std::tr1::get;
using namespace perf;
using namespace testing;
using namespace cv;
using namespace cv::optflow;
typedef tuple<Size> DFParams;
typedef TestBaseWithParam<DFParams> DenseOpticalFlow_DeepFlow;
PERF_TEST_P(DenseOpticalFlow_DeepFlow, perf, Values(szVGA, sz720p))
{
    // Time a single DeepFlow run on a pair of random-noise frames of the given size.
    const Size frame_size = get<0>(GetParam());

    Mat src_frame(frame_size, CV_8U), dst_frame(frame_size, CV_8U);
    randu(src_frame, 0, 255);
    randu(dst_frame, 0, 255);
    Mat flow;

    cv::setNumThreads(cv::getNumberOfCPUs());
    TEST_CYCLE_N(1)
    {
        Ptr<DenseOpticalFlow> algo = createOptFlow_DeepFlow();
        algo->calc(src_frame, dst_frame, flow);
    }
    SANITY_CHECK_NOTHING();
}
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3 - clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met :
*
* *Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort(including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "perf_precomp.hpp"
using std::tr1::tuple;
using std::tr1::get;
using namespace perf;
using namespace testing;
using namespace cv;
using namespace cv::optflow;
void MakeArtificialExample(Mat &dst_frame1, Mat &dst_frame2);
typedef tuple<String, Size> DISParams;
typedef TestBaseWithParam<DISParams> DenseOpticalFlow_DIS;
PERF_TEST_P(DenseOpticalFlow_DIS, perf,
            Combine(Values("PRESET_ULTRAFAST", "PRESET_FAST", "PRESET_MEDIUM"), Values(szVGA, sz720p, sz1080p)))
{
    const DISParams params = GetParam();

    // Presets are passed as strings so their names show up in the perf test
    // results; map the name back to the corresponding enum value here.
    const String preset_name = get<0>(params);
    int preset = DISOpticalFlow::PRESET_FAST; // default covers "PRESET_FAST"
    if (preset_name == "PRESET_ULTRAFAST")
        preset = DISOpticalFlow::PRESET_ULTRAFAST;
    else if (preset_name == "PRESET_MEDIUM")
        preset = DISOpticalFlow::PRESET_MEDIUM;

    const Size frame_size = get<1>(params);
    Mat frame1(frame_size, CV_8U), frame2(frame_size, CV_8U);
    Mat flow;
    MakeArtificialExample(frame1, frame2);

    cv::setNumThreads(cv::getNumberOfCPUs());
    TEST_CYCLE_N(10)
    {
        Ptr<DenseOpticalFlow> algo = createOptFlow_DIS(preset);
        algo->calc(frame1, frame2, flow);
    }
    SANITY_CHECK_NOTHING();
}
/* Generates a pair of synthetic frames with a smooth random displacement between them.
 *
 * @param dst_frame1 preallocated CV_8U matrix of the target size; filled with upscaled
 *                   low-resolution noise so the frames contain large-scale structure.
 * @param dst_frame2 preallocated CV_8U matrix of the target size; same noise as
 *                   dst_frame1, warped by a smooth random displacement field.
 */
void MakeArtificialExample(Mat &dst_frame1, Mat &dst_frame2)
{
    const int src_scale = 2; // noise is generated at 1/4 of the target resolution
    const int OF_scale = 6;  // displacement field is generated at 1/64 of the target resolution

    // Standard deviation of the random displacement, proportional to the frame width.
    // Use floating-point division: the previous integer division truncated the value
    // (e.g. 640 / 300 == 2 instead of ~2.13).
    const double sigma = dst_frame1.cols / 300.0;

    // Low-resolution random noise upscaled to the target size. (1 << n) is an exact
    // integer power of two, avoiding the (int)pow(2, n) float round-trip.
    Mat tmp(Size(dst_frame1.cols / (1 << src_scale), dst_frame1.rows / (1 << src_scale)), CV_8U);
    randu(tmp, 0, 255);
    resize(tmp, dst_frame1, dst_frame1.size(), 0.0, 0.0, INTER_LINEAR);
    resize(tmp, dst_frame2, dst_frame2.size(), 0.0, 0.0, INTER_LINEAR);

    // Smooth random displacement field: coarse Gaussian noise upscaled with bicubic
    // interpolation, then converted from relative offsets to the absolute map
    // coordinates expected by remap().
    Mat displacement_field(Size(dst_frame1.cols / (1 << OF_scale), dst_frame1.rows / (1 << OF_scale)),
                           CV_32FC2);
    randn(displacement_field, 0.0, sigma);
    resize(displacement_field, displacement_field, dst_frame2.size(), 0.0, 0.0, INTER_CUBIC);
    for (int i = 0; i < displacement_field.rows; i++)
        for (int j = 0; j < displacement_field.cols; j++)
            displacement_field.at<Vec2f>(i, j) += Vec2f((float)j, (float)i);

    remap(dst_frame2, dst_frame2, displacement_field, Mat(), INTER_LINEAR, BORDER_REPLICATE);
}
#include "perf_precomp.hpp"
CV_PERF_TEST_MAIN(optflow)
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include "opencv2/ts.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/optflow.hpp"
#include "opencv2/highgui.hpp"
#endif
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3 - clause BSD License)
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met :
*
* *Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort(including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "perf_precomp.hpp"
using std::tr1::tuple;
using std::tr1::get;
using namespace perf;
using namespace testing;
using namespace cv;
using namespace cv::optflow;
typedef tuple<Size, int, int> VarRefParams;
typedef TestBaseWithParam<VarRefParams> DenseOpticalFlow_VariationalRefinement;
PERF_TEST_P(DenseOpticalFlow_VariationalRefinement, perf, Combine(Values(szQVGA, szVGA), Values(5, 10), Values(5, 10)))
{
    const VarRefParams params = GetParam();
    const Size frame_size = get<0>(params);
    const int sor_iterations = get<1>(params);
    const int fixed_point_iterations = get<2>(params);

    // Random noise frames and a zero-initialized flow field to refine.
    Mat frame1(frame_size, CV_8U), frame2(frame_size, CV_8U);
    randu(frame1, 0, 255);
    randu(frame2, 0, 255);
    Mat flow(frame_size, CV_32FC2);
    flow.setTo(0.0f);

    cv::setNumThreads(cv::getNumberOfCPUs());
    TEST_CYCLE_N(10)
    {
        Ptr<VariationalRefinement> var = createVariationalFlowRefinement();
        var->setAlpha(20.0f);
        var->setGamma(10.0f);
        var->setDelta(5.0f);
        var->setSorIterations(sor_iterations);
        var->setFixedPointIterations(fixed_point_iterations);
        var->calc(frame1, frame2, flow);
    }
    SANITY_CHECK_NOTHING();
}
......@@ -11,7 +11,7 @@ using namespace optflow;
const String keys = "{help h usage ? | | print this message }"
"{@image1 | | image1 }"
"{@image2 | | image2 }"
"{@algorithm | | [farneback, simpleflow, tvl1, deepflow, sparsetodenseflow or DISflow] }"
"{@algorithm | | [farneback, simpleflow, tvl1, deepflow, sparsetodenseflow, DISflow_ultrafast, DISflow_fast, DISflow_medium] }"
"{@groundtruth | | path to the .flo file (optional), Middlebury format }"
"{m measure |endpoint| error measure - [endpoint or angular] }"
"{r region |all | region to compute stats about [all, discontinuities, untextured] }"
......@@ -229,7 +229,7 @@ int main( int argc, char** argv )
if ( i2.depth() != CV_8U )
i2.convertTo(i2, CV_8U);
if ( (method == "farneback" || method == "tvl1" || method == "deepflow" || method == "DISflow") && i1.channels() == 3 )
if ( (method == "farneback" || method == "tvl1" || method == "deepflow" || method == "DISflow_ultrafast" || method == "DISflow_fast" || method == "DISflow_medium") && i1.channels() == 3 )
{ // 1-channel images are expected
cvtColor(i1, i1, COLOR_BGR2GRAY);
cvtColor(i2, i2, COLOR_BGR2GRAY);
......@@ -252,8 +252,12 @@ int main( int argc, char** argv )
algorithm = createOptFlow_DeepFlow();
else if ( method == "sparsetodenseflow" )
algorithm = createOptFlow_SparseToDense();
else if ( method == "DISflow" )
algorithm = createOptFlow_DIS();
else if ( method == "DISflow_ultrafast" )
algorithm = createOptFlow_DIS(DISOpticalFlow::PRESET_ULTRAFAST);
else if (method == "DISflow_fast")
algorithm = createOptFlow_DIS(DISOpticalFlow::PRESET_FAST);
else if (method == "DISflow_medium")
algorithm = createOptFlow_DIS(DISOpticalFlow::PRESET_MEDIUM);
else
{
printf("Wrong method!\n");
......
......@@ -40,10 +40,11 @@
//
//M*/
#include "opencv2/core/hal/intrin.hpp"
#include "precomp.hpp"
#include "opencv2/imgproc.hpp"
using namespace std;
#define EPS 0.001F
#define INF 1E+10F
namespace cv
{
......@@ -58,13 +59,21 @@ class DISOpticalFlowImpl : public DISOpticalFlow
void calc(InputArray I0, InputArray I1, InputOutputArray flow);
void collectGarbage();
protected: // algorithm parameters
protected: //!< algorithm parameters
int finest_scale, coarsest_scale;
int patch_size;
int patch_stride;
int grad_descent_iter;
int variational_refinement_iter;
bool use_mean_normalization;
bool use_spatial_propagation;
public: // getters and setters
protected: //!< some auxiliary variables
int border_size;
int w, h; //!< flow buffer width and height on the current scale
int ws, hs; //!< sparse flow buffer width and height on the current scale
public:
int getFinestScale() const { return finest_scale; }
void setFinestScale(int val) { finest_scale = val; }
int getPatchSize() const { return patch_size; }
......@@ -73,60 +82,111 @@ class DISOpticalFlowImpl : public DISOpticalFlow
void setPatchStride(int val) { patch_stride = val; }
int getGradientDescentIterations() const { return grad_descent_iter; }
void setGradientDescentIterations(int val) { grad_descent_iter = val; }
int getVariationalRefinementIterations() const { return variational_refinement_iter; }
void setVariationalRefinementIterations(int val) { variational_refinement_iter = val; }
bool getUseMeanNormalization() const { return use_mean_normalization; }
void setUseMeanNormalization(bool val) { use_mean_normalization = val; }
bool getUseSpatialPropagation() const { return use_spatial_propagation; }
void setUseSpatialPropagation(bool val) { use_spatial_propagation = val; }
protected: //!< internal buffers
vector<Mat_<uchar> > I0s; //!< Gaussian pyramid for the current frame
vector<Mat_<uchar> > I1s; //!< Gaussian pyramid for the next frame
vector<Mat_<uchar> > I1s_ext; //!< I1s with borders
private: // internal buffers
vector< Mat_<uchar> > I0s; // gaussian pyramid for the current frame
vector< Mat_<uchar> > I1s; // gaussian pyramid for the next frame
vector<Mat_<short> > I0xs; //!< Gaussian pyramid for the x gradient of the current frame
vector<Mat_<short> > I0ys; //!< Gaussian pyramid for the y gradient of the current frame
vector< Mat_<short> > I0xs; // gaussian pyramid for the x gradient of the current frame
vector< Mat_<short> > I0ys; // gaussian pyramid for the y gradient of the current frame
vector<Mat_<float> > Ux; //!< x component of the flow vectors
vector<Mat_<float> > Uy; //!< y component of the flow vectors
vector< Mat_<float> > Ux; // x component of the flow vectors
vector< Mat_<float> > Uy; // y component of the flow vectors
Mat_<Vec2f> U; //!< a buffer for the merged flow
Mat_<Vec2f> U; // buffers for the merged flow
Mat_<float> Sx; //!< intermediate sparse flow representation (x component)
Mat_<float> Sy; //!< intermediate sparse flow representation (y component)
Mat_<float> Sx; // x component of the sparse flow vectors (before densification)
Mat_<float> Sy; // y component of the sparse flow vectors (before densification)
/* Structure tensor components: */
Mat_<float> I0xx_buf; //!< sum of squares of x gradient values
Mat_<float> I0yy_buf; //!< sum of squares of y gradient values
Mat_<float> I0xy_buf; //!< sum of x and y gradient products
// structure tensor components and auxiliary buffers:
Mat_<float> I0xx_buf; // sum of squares of x gradient values
Mat_<float> I0yy_buf; // sum of squares of y gradient values
Mat_<float> I0xy_buf; // sum of x and y gradient products
/* Extra buffers that are useful if patch mean-normalization is used: */
Mat_<float> I0x_buf; //!< sum of x gradient values
Mat_<float> I0y_buf; //!< sum of y gradient values
Mat_<float> I0xx_buf_aux; // for computing sums using the summed area table
/* Auxiliary buffers used in structure tensor computation: */
Mat_<float> I0xx_buf_aux;
Mat_<float> I0yy_buf_aux;
Mat_<float> I0xy_buf_aux;
////////////////////////////////////////////////////////////
Mat_<float> I0x_buf_aux;
Mat_<float> I0y_buf_aux;
private: // private methods
vector<Ptr<VariationalRefinement> > variational_refinement_processors;
private: //!< private methods and parallel sections
void prepareBuffers(Mat &I0, Mat &I1);
void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &I0x, Mat &I0y);
void patchInverseSearch(Mat &dst_Sx, Mat &dst_Sy, Mat &src_Ux, Mat &src_Uy, Mat &I0, Mat &I0x, Mat &I0y, Mat &I1);
void densification(Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy, Mat &I0, Mat &I1);
void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, Mat &dst_I0y, Mat &I0x,
Mat &I0y);
struct PatchInverseSearch_ParBody : public ParallelLoopBody
{
DISOpticalFlowImpl *dis;
int nstripes, stripe_sz;
int hs;
Mat *Sx, *Sy, *Ux, *Uy, *I0, *I1, *I0x, *I0y;
int num_iter;
PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _hs, Mat &dst_Sx, Mat &dst_Sy,
Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1, Mat &_I0x, Mat &_I0y, int _num_iter);
void operator()(const Range &range) const;
};
struct Densification_ParBody : public ParallelLoopBody
{
DISOpticalFlowImpl *dis;
int nstripes, stripe_sz;
int h;
Mat *Ux, *Uy, *Sx, *Sy, *I0, *I1;
Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h, Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx,
Mat &src_Sy, Mat &_I0, Mat &_I1);
void operator()(const Range &range) const;
};
};
DISOpticalFlowImpl::DISOpticalFlowImpl()
{
finest_scale = 3;
patch_size = 9;
finest_scale = 2;
patch_size = 8;
patch_stride = 4;
grad_descent_iter = 12;
grad_descent_iter = 16;
variational_refinement_iter = 5;
border_size = 16;
use_mean_normalization = true;
use_spatial_propagation = true;
/* Use separate variational refinement instances for different scales to avoid repeated memory allocation: */
int max_possible_scales = 10;
for (int i = 0; i < max_possible_scales; i++)
variational_refinement_processors.push_back(createVariationalFlowRefinement());
}
void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1)
{
I0s.resize(coarsest_scale + 1);
I1s.resize(coarsest_scale + 1);
I1s_ext.resize(coarsest_scale + 1);
I0xs.resize(coarsest_scale + 1);
I0ys.resize(coarsest_scale + 1);
Ux.resize(coarsest_scale + 1);
Uy.resize(coarsest_scale + 1);
int fraction = 1;
int cur_rows = 0, cur_cols = 0;
for (int i = 0; i <= coarsest_scale; i++)
{
/* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
if (i == finest_scale)
{
cur_rows = I0.rows / fraction;
......@@ -136,14 +196,21 @@ void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1)
I1s[i].create(cur_rows, cur_cols);
resize(I1, I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
/* These buffers are reused in each scale so we initialize them once on the finest scale: */
Sx.create(cur_rows / patch_stride, cur_cols / patch_stride);
Sy.create(cur_rows / patch_stride, cur_cols / patch_stride);
I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride);
I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride);
I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride);
I0x_buf_aux.create(cur_rows, cur_cols / patch_stride);
I0y_buf_aux.create(cur_rows, cur_cols / patch_stride);
U.create(cur_rows, cur_cols);
}
else if (i > finest_scale)
......@@ -160,89 +227,112 @@ void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1)
if (i >= finest_scale)
{
I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size);
copyMakeBorder(I1s[i], I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
I0xs[i].create(cur_rows, cur_cols);
I0ys[i].create(cur_rows, cur_cols);
spatialGradient(I0s[i], I0xs[i], I0ys[i]);
Ux[i].create(cur_rows, cur_cols);
Uy[i].create(cur_rows, cur_cols);
variational_refinement_processors[i]->setAlpha(20.0f);
variational_refinement_processors[i]->setDelta(5.0f);
variational_refinement_processors[i]->setGamma(10.0f);
variational_refinement_processors[i]->setSorIterations(5);
variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
}
}
}
void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &I0x, Mat &I0y)
/* This function computes the structure tensor elements (local sums of I0x^2, I0x*I0y and I0y^2).
 * A simple box filter is not used here because these sums need to be computed on a sparse grid
 * and stored densely in the output buffers.
 */
void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
Mat &dst_I0y, Mat &I0x, Mat &I0y)
{
short *I0x_ptr = I0x.ptr<short>();
short *I0y_ptr = I0y.ptr<short>();
float *I0xx_ptr = dst_I0xx.ptr<float>();
float *I0yy_ptr = dst_I0yy.ptr<float>();
float *I0xy_ptr = dst_I0xy.ptr<float>();
float *I0x_ptr = dst_I0x.ptr<float>();
float *I0y_ptr = dst_I0y.ptr<float>();
float *I0xx_aux_ptr = I0xx_buf_aux.ptr<float>();
float *I0yy_aux_ptr = I0yy_buf_aux.ptr<float>();
float *I0xy_aux_ptr = I0xy_buf_aux.ptr<float>();
float *I0x_aux_ptr = I0x_buf_aux.ptr<float>();
float *I0y_aux_ptr = I0y_buf_aux.ptr<float>();
int w = I0x.cols;
int h = I0x.rows;
int psz2 = patch_size / 2;
int psz = 2 * psz2;
// width of the sparse OF fields:
int ws = 1 + (w - patch_size) / patch_stride;
// separable box filter for computing patch sums on a sparse
// grid (determined by patch_stride)
/* Separable box filter: horizontal pass */
for (int i = 0; i < h; i++)
{
float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f;
short *x_row = I0x_ptr + i * w, *y_row = I0y_ptr + i * w;
float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
short *x_row = I0x.ptr<short>(i);
short *y_row = I0y.ptr<short>(i);
for (int j = 0; j < patch_size; j++)
{
sum_xx += x_row[j] * x_row[j];
sum_yy += y_row[j] * y_row[j];
sum_xy += x_row[j] * y_row[j];
sum_x += x_row[j];
sum_y += y_row[j];
}
I0xx_aux_ptr[i * ws] = sum_xx;
I0yy_aux_ptr[i * ws] = sum_yy;
I0xy_aux_ptr[i * ws] = sum_xy;
I0x_aux_ptr[i * ws] = sum_x;
I0y_aux_ptr[i * ws] = sum_y;
int js = 1;
for (int j = patch_size; j < w; j++)
{
sum_xx += (x_row[j] * x_row[j] - x_row[j - patch_size] * x_row[j - patch_size]);
sum_yy += (y_row[j] * y_row[j] - y_row[j - patch_size] * y_row[j - patch_size]);
sum_xy += (x_row[j] * y_row[j] - x_row[j - patch_size] * y_row[j - patch_size]);
if ((j - psz) % patch_stride == 0)
sum_x += (x_row[j] - x_row[j - patch_size]);
sum_y += (y_row[j] - y_row[j - patch_size]);
if ((j - patch_size + 1) % patch_stride == 0)
{
I0xx_aux_ptr[i * ws + js] = sum_xx;
I0yy_aux_ptr[i * ws + js] = sum_yy;
I0xy_aux_ptr[i * ws + js] = sum_xy;
I0x_aux_ptr[i * ws + js] = sum_x;
I0y_aux_ptr[i * ws + js] = sum_y;
js++;
}
}
}
AutoBuffer<float> sum_xx_buf(ws), sum_yy_buf(ws), sum_xy_buf(ws);
AutoBuffer<float> sum_xx_buf(ws), sum_yy_buf(ws), sum_xy_buf(ws), sum_x_buf(ws), sum_y_buf(ws);
float *sum_xx = (float *)sum_xx_buf;
float *sum_yy = (float *)sum_yy_buf;
float *sum_xy = (float *)sum_xy_buf;
float *sum_x = (float *)sum_x_buf;
float *sum_y = (float *)sum_y_buf;
for (int j = 0; j < ws; j++)
{
sum_xx[j] = 0.0f;
sum_yy[j] = 0.0f;
sum_xy[j] = 0.0f;
sum_x[j] = 0.0f;
sum_y[j] = 0.0f;
}
/* Separable box filter: vertical pass */
for (int i = 0; i < patch_size; i++)
for (int j = 0; j < ws; j++)
{
sum_xx[j] += I0xx_aux_ptr[i * ws + j];
sum_yy[j] += I0yy_aux_ptr[i * ws + j];
sum_xy[j] += I0xy_aux_ptr[i * ws + j];
sum_x[j] += I0x_aux_ptr[i * ws + j];
sum_y[j] += I0y_aux_ptr[i * ws + j];
}
for (int j = 0; j < ws; j++)
{
I0xx_ptr[j] = sum_xx[j];
I0yy_ptr[j] = sum_yy[j];
I0xy_ptr[j] = sum_xy[j];
I0x_ptr[j] = sum_x[j];
I0y_ptr[j] = sum_y[j];
}
int is = 1;
for (int i = patch_size; i < h; i++)
......@@ -252,166 +342,605 @@ void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy,
sum_xx[j] += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
sum_yy[j] += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
sum_xy[j] += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
sum_x[j] += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
sum_y[j] += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
}
if ((i - psz) % patch_stride == 0)
if ((i - patch_size + 1) % patch_stride == 0)
{
for (int j = 0; j < ws; j++)
{
I0xx_ptr[is * ws + j] = sum_xx[j];
I0yy_ptr[is * ws + j] = sum_yy[j];
I0xy_ptr[is * ws + j] = sum_xy[j];
I0x_ptr[is * ws + j] = sum_x[j];
I0y_ptr[is * ws + j] = sum_y[j];
}
is++;
}
}
}
void DISOpticalFlowImpl::patchInverseSearch(Mat &dst_Sx, Mat &dst_Sy, Mat &src_Ux, Mat &src_Uy, Mat &I0, Mat &I0x,
Mat &I0y, Mat &I1)
/* Parallel body for the patch inverse search step. The hs rows of the sparse flow grid are split
 * into nstripes horizontal stripes; each operator() call processes one Range of stripes.
 * All Mat arguments are captured by pointer -- no pixel data is copied here. */
DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes,
                                                                           int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                                                           Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1,
                                                                           Mat &_I0x, Mat &_I0y, int _num_iter)
    : dis(&_dis), nstripes(_nstripes), hs(_hs), Sx(&dst_Sx), Sy(&dst_Sy), Ux(&src_Ux), Uy(&src_Uy), I0(&_I0), I1(&_I1),
      I0x(&_I0x), I0y(&_I0y), num_iter(_num_iter)
{
    /* Round up so that nstripes stripes always cover all hs rows of the sparse grid */
    stripe_sz = (int)ceil(hs / (double)nstripes);
}
/////////////////////////////////////////////* Patch processing functions */////////////////////////////////////////////
/* Some auxiliary macros */
/* Broadcasts the bilinear weights into SIMD registers, declares all row registers used by the
 * 8x8 patch-extraction macros and preloads/expands the first row of I1. Expects w00..w11,
 * I0_ptr, I1_ptr and I1_stride to be in scope at the expansion site; advances I1_ptr to the
 * second row as a side effect. */
#define HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION                                                                         \
    v_float32x4 w00v = v_setall_f32(w00);                                                                              \
    v_float32x4 w01v = v_setall_f32(w01);                                                                              \
    v_float32x4 w10v = v_setall_f32(w10);                                                                              \
    v_float32x4 w11v = v_setall_f32(w11);                                                                              \
                                                                                                                       \
    v_uint8x16 I0_row_16, I1_row_16, I1_row_shifted_16, I1_row_next_16, I1_row_next_shifted_16;                        \
    v_uint16x8 I0_row_8, I1_row_8, I1_row_shifted_8, I1_row_next_8, I1_row_next_shifted_8, tmp;                        \
    v_uint32x4 I0_row_4_left, I1_row_4_left, I1_row_shifted_4_left, I1_row_next_4_left, I1_row_next_shifted_4_left;    \
    v_uint32x4 I0_row_4_right, I1_row_4_right, I1_row_shifted_4_right, I1_row_next_4_right,                            \
      I1_row_next_shifted_4_right;                                                                                     \
    v_float32x4 I_diff_left, I_diff_right;                                                                             \
                                                                                                                       \
    /* Preload and expand the first row of I1 (and its 1-pixel-shifted copy for the horizontal                         \
     * interpolation neighbors): */                                                                                    \
    I1_row_16 = v_load(I1_ptr);                                                                                        \
    I1_row_shifted_16 = v_extract<1>(I1_row_16, I1_row_16);                                                            \
    v_expand(I1_row_16, I1_row_8, tmp);                                                                                \
    v_expand(I1_row_shifted_16, I1_row_shifted_8, tmp);                                                                \
    v_expand(I1_row_8, I1_row_4_left, I1_row_4_right);                                                                 \
    v_expand(I1_row_shifted_8, I1_row_shifted_4_left, I1_row_shifted_4_right);                                         \
    I1_ptr += I1_stride;
/* Produces I_diff_left / I_diff_right = bilinear(I1) - I0 for the current 8-pixel patch row,
 * combining the current and next I1 rows (and their shifted copies) with the w00..w11 weights.
 * Must follow HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION, which sets up all registers used here. */
#define HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION                                                                      \
    /* Load the next row of I1: */                                                                                     \
    I1_row_next_16 = v_load(I1_ptr);                                                                                   \
    /* Circular shift left by 1 element: */                                                                            \
    I1_row_next_shifted_16 = v_extract<1>(I1_row_next_16, I1_row_next_16);                                             \
    /* Expand to 8 ushorts (we only need the first 8 values): */                                                       \
    v_expand(I1_row_next_16, I1_row_next_8, tmp);                                                                      \
    v_expand(I1_row_next_shifted_16, I1_row_next_shifted_8, tmp);                                                      \
    /* Separate the left and right halves: */                                                                          \
    v_expand(I1_row_next_8, I1_row_next_4_left, I1_row_next_4_right);                                                  \
    v_expand(I1_row_next_shifted_8, I1_row_next_shifted_4_left, I1_row_next_shifted_4_right);                          \
                                                                                                                       \
    /* Load current row of I0: */                                                                                      \
    I0_row_16 = v_load(I0_ptr);                                                                                        \
    v_expand(I0_row_16, I0_row_8, tmp);                                                                                \
    v_expand(I0_row_8, I0_row_4_left, I0_row_4_right);                                                                 \
                                                                                                                       \
    /* Compute diffs between I0 and bilinearly interpolated I1: */                                                     \
    I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) +                                              \
                  w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) +                                      \
                  w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) +                                         \
                  w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) -                                 \
                  v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left));                                                      \
    I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) +                                            \
                   w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) +                                    \
                   w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) +                                       \
                   w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) -                               \
                   v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right));
/* Advances the I0/I1 row pointers and promotes the already-expanded "next" I1 row to the
 * "current" row, so the following HAL_PROCESS_... expansion only has to load one new row. */
#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW                                                                     \
    I0_ptr += I0_stride;                                                                                               \
    I1_ptr += I1_stride;                                                                                               \
                                                                                                                       \
    I1_row_4_left = I1_row_next_4_left;                                                                                \
    I1_row_4_right = I1_row_next_4_right;                                                                              \
    I1_row_shifted_4_left = I1_row_next_shifted_4_left;                                                                \
    I1_row_shifted_4_right = I1_row_next_shifted_4_right;
/* This function essentially performs one iteration of gradient descent when finding the most similar patch in I1 for a
 * given one in I0. It assumes that I0_ptr and I1_ptr already point to the corresponding patches and w00, w01, w10, w11
 * are precomputed bilinear interpolation weights. It returns the SSD (sum of squared differences) between these patches
 * and computes the values (dst_dUx, dst_dUy) that are used in the flow vector update. HAL acceleration is implemented
 * only for the default patch size (8x8). Everything is processed in floats as using fixed-point approximations harms
 * the quality significantly.
 */
inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr, short *I0y_ptr,
                          int I0_stride, int I1_stride, float w00, float w01, float w10, float w11, int patch_sz)
{
    float SSD = 0.0f;
#ifdef CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 Ux_vec = v_setall_f32(0);
        v_float32x4 Uy_vec = v_setall_f32(0);
        v_float32x4 SSD_vec = v_setall_f32(0);
        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            /* After this macro I_diff_left/I_diff_right hold (bilinear(I1) - I0) for this row */
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
            /* Update the sums: dUx/dUy accumulate diff * gradient; SSD accumulates diff^2 */
            Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
            Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        /* Final reduce operations: */
        dst_dUx = v_reduce_sum(Ux_vec);
        dst_dUy = v_reduce_sum(Uy_vec);
        SSD = v_reduce_sum(SSD_vec);
    }
    else
    {
#endif
        /* Scalar fallback for non-default patch sizes */
        dst_dUx = 0.0f;
        dst_dUy = 0.0f;
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                /* Bilinearly interpolate I1 at the sub-pixel location and subtract I0 */
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                SSD += diff * diff;
                dst_dUx += diff * I0x_ptr[i * I0_stride + j];
                dst_dUy += diff * I0y_ptr[i * I0_stride + j];
            }
#ifdef CV_SIMD128
    }
#endif
    return SSD;
}
/* Same as processPatch, but with patch mean normalization, which improves robustness under changing
 * lighting conditions. x_grad_sum/y_grad_sum are the precomputed sums of I0x/I0y over the patch
 * (taken from the structure-tensor buffers), needed for the mean-subtraction terms below.
 */
inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr,
                                  short *I0y_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                                  float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
{
    float sum_diff = 0.0, sum_diff_sq = 0.0;
    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
    float n = (float)patch_sz * patch_sz; /* number of pixels in the patch */
#ifdef CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 sum_I0x_mul_vec = v_setall_f32(0);
        v_float32x4 sum_I0y_mul_vec = v_setall_f32(0);
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            /* After this macro I_diff_left/I_diff_right hold (bilinear(I1) - I0) for this row */
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
            /* Update the sums: */
            sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
            sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
            sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
            sum_diff_vec += I_diff_left + I_diff_right;
            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        /* Final reduce operations: */
        sum_I0x_mul = v_reduce_sum(sum_I0x_mul_vec);
        sum_I0y_mul = v_reduce_sum(sum_I0y_mul_vec);
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
    {
#endif
        /* Scalar fallback for non-default patch sizes */
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                sum_diff += diff;
                sum_diff_sq += diff * diff;
                sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
                sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
            }
#ifdef CV_SIMD128
    }
#endif
    /* Subtracting mean(diff) * sum(gradient) is equivalent to accumulating with mean-subtracted
     * diffs; likewise the return value equals sum((diff - mean(diff))^2). */
    dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
    dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
    return sum_diff_sq - sum_diff * sum_diff / n;
}
/* Similar to processPatch, but compute only the sum of squared differences (SSD) between the patches,
 * without the gradient-based update terms. Used to evaluate propagated flow candidates cheaply. */
inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                        float w11, int patch_sz)
{
    float SSD = 0.0f;
#ifdef CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 SSD_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            /* I_diff_left/I_diff_right = bilinear(I1) - I0 for this row */
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        SSD = v_reduce_sum(SSD_vec);
    }
    else
    {
#endif
        /* Scalar fallback for non-default patch sizes */
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                SSD += diff * diff;
            }
#ifdef CV_SIMD128
    }
#endif
    return SSD;
}
/* Same as computeSSD, but with patch mean normalization: returns the SSD computed after subtracting
 * the mean difference, i.e. sum((diff - mean(diff))^2) = sum(diff^2) - sum(diff)^2 / n. */
inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01,
                                float w10, float w11, int patch_sz)
{
    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
    float n = (float)patch_sz * patch_sz; /* number of pixels in the patch */
#ifdef CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            /* I_diff_left/I_diff_right = bilinear(I1) - I0 for this row */
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
            sum_diff_vec += I_diff_left + I_diff_right;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
    {
#endif
        /* Scalar fallback for non-default patch sizes */
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                sum_diff += diff;
                sum_diff_sq += diff * diff;
            }
#ifdef CV_SIMD128
    }
#endif
    return sum_diff_sq - sum_diff * sum_diff / n;
}
#undef HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
{
float *Ux_ptr = src_Ux.ptr<float>();
float *Uy_ptr = src_Uy.ptr<float>();
float *Sx_ptr = dst_Sx.ptr<float>();
float *Sy_ptr = dst_Sy.ptr<float>();
uchar *I0_ptr = I0.ptr<uchar>();
uchar *I1_ptr = I1.ptr<uchar>();
short *I0x_ptr = I0x.ptr<short>();
short *I0y_ptr = I0y.ptr<short>();
int w = I0.cols;
int h = I1.rows;
int psz2 = patch_size / 2;
// width and height of the sparse OF fields:
int ws = 1 + (w - patch_size) / patch_stride;
int hs = 1 + (h - patch_size) / patch_stride;
precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x, I0y);
float *xx_ptr = I0xx_buf.ptr<float>();
float *yy_ptr = I0yy_buf.ptr<float>();
float *xy_ptr = I0xy_buf.ptr<float>();
// perform a fixed number of gradient descent iterations for each patch:
int i = psz2;
for (int is = 0; is < hs; is++)
{
int j = psz2;
for (int js = 0; js < ws; js++)
{
float cur_Ux = Ux_ptr[i * w + j];
float cur_Uy = Uy_ptr[i * w + j];
float detH = xx_ptr[is * ws + js] * yy_ptr[is * ws + js] - xy_ptr[is * ws + js] * xy_ptr[is * ws + js];
// force separate processing of stripes if we are using spatial propagation:
if(dis->use_spatial_propagation && range.end>range.start+1)
{
for(int n=range.start;n<range.end;n++)
(*this)(Range(n,n+1));
return;
}
int psz = dis->patch_size;
int psz2 = psz / 2;
int w_ext = dis->w + 2 * dis->border_size; //!< width of I1_ext
int bsz = dis->border_size;
/* Input dense flow */
float *Ux_ptr = Ux->ptr<float>();
float *Uy_ptr = Uy->ptr<float>();
/* Output sparse flow */
float *Sx_ptr = Sx->ptr<float>();
float *Sy_ptr = Sy->ptr<float>();
uchar *I0_ptr = I0->ptr<uchar>();
uchar *I1_ptr = I1->ptr<uchar>();
short *I0x_ptr = I0x->ptr<short>();
short *I0y_ptr = I0y->ptr<short>();
/* Precomputed structure tensor */
float *xx_ptr = dis->I0xx_buf.ptr<float>();
float *yy_ptr = dis->I0yy_buf.ptr<float>();
float *xy_ptr = dis->I0xy_buf.ptr<float>();
/* And extra buffers for mean-normalization: */
float *x_ptr = dis->I0x_buf.ptr<float>();
float *y_ptr = dis->I0y_buf.ptr<float>();
int i, j, dir;
int start_is, end_is, start_js, end_js;
int start_i, start_j;
float i_lower_limit = bsz - psz + 1.0f;
float i_upper_limit = bsz + dis->h - 1.0f;
float j_lower_limit = bsz - psz + 1.0f;
float j_upper_limit = bsz + dis->w - 1.0f;
float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \
j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \
\
w11 = (i_I1 - floor(i_I1)) * (j_I1 - floor(j_I1)); \
w10 = (i_I1 - floor(i_I1)) * (floor(j_I1) + 1 - j_I1); \
w01 = (floor(i_I1) + 1 - i_I1) * (j_I1 - floor(j_I1)); \
w00 = (floor(i_I1) + 1 - i_I1) * (floor(j_I1) + 1 - j_I1);
#define COMPUTE_SSD(dst, Ux, Uy) \
INIT_BILINEAR_WEIGHTS(Ux, Uy); \
if (dis->use_mean_normalization) \
dst = computeSSDMeanNorm(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, \
w01, w10, w11, psz); \
else \
dst = computeSSD(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, w01, \
w10, w11, psz);
int num_inner_iter = (int)floor(dis->grad_descent_iter / (float)num_iter);
for (int iter = 0; iter < num_iter; iter++)
{
if (iter % 2 == 0)
{
dir = 1;
start_is = min(range.start * stripe_sz, hs);
end_is = min(range.end * stripe_sz, hs);
start_js = 0;
end_js = dis->ws;
start_i = start_is * dis->patch_stride;
start_j = 0;
}
else
{
dir = -1;
start_is = min(range.end * stripe_sz, hs) - 1;
end_is = min(range.start * stripe_sz, hs) - 1;
start_js = dis->ws - 1;
end_js = -1;
start_i = start_is * dis->patch_stride;
start_j = (dis->ws - 1) * dis->patch_stride;
}
i = start_i;
for (int is = start_is; dir * is < dir * end_is; is += dir)
{
j = start_j;
for (int js = start_js; dir * js < dir * end_js; js += dir)
{
if (iter == 0)
{
/* Using result form the previous pyramid level as the very first approximation: */
Sx_ptr[is * dis->ws + js] = Ux_ptr[(i + psz2) * dis->w + j + psz2];
Sy_ptr[is * dis->ws + js] = Uy_ptr[(i + psz2) * dis->w + j + psz2];
}
if (dis->use_spatial_propagation)
{
/* Updating the current Sx_ptr, Sy_ptr to the best candidate: */
float min_SSD, cur_SSD;
COMPUTE_SSD(min_SSD, Sx_ptr[is * dis->ws + js], Sy_ptr[is * dis->ws + js]);
if (dir * js > dir * start_js)
{
COMPUTE_SSD(cur_SSD, Sx_ptr[is * dis->ws + js - dir], Sy_ptr[is * dis->ws + js - dir]);
if (cur_SSD < min_SSD)
{
min_SSD = cur_SSD;
Sx_ptr[is * dis->ws + js] = Sx_ptr[is * dis->ws + js - dir];
Sy_ptr[is * dis->ws + js] = Sy_ptr[is * dis->ws + js - dir];
}
}
/* Flow vectors won't actually propagate across different stripes, which is the reason for keeping
* the number of stripes constant. It works well enough in practice and doesn't introduce any
* visible seams.
*/
if (dir * is > dir * start_is)
{
COMPUTE_SSD(cur_SSD, Sx_ptr[(is - dir) * dis->ws + js], Sy_ptr[(is - dir) * dis->ws + js]);
if (cur_SSD < min_SSD)
{
min_SSD = cur_SSD;
Sx_ptr[is * dis->ws + js] = Sx_ptr[(is - dir) * dis->ws + js];
Sy_ptr[is * dis->ws + js] = Sy_ptr[(is - dir) * dis->ws + js];
}
}
}
/* Use the best candidate as a starting point for the gradient descent: */
float cur_Ux = Sx_ptr[is * dis->ws + js];
float cur_Uy = Sy_ptr[is * dis->ws + js];
/* Computing the inverse of the structure tensor: */
float detH = xx_ptr[is * dis->ws + js] * yy_ptr[is * dis->ws + js] -
xy_ptr[is * dis->ws + js] * xy_ptr[is * dis->ws + js];
if (abs(detH) < EPS)
detH = EPS;
float invH11 = yy_ptr[is * ws + js] / detH;
float invH12 = -xy_ptr[is * ws + js] / detH;
float invH22 = xx_ptr[is * ws + js] / detH;
float prev_sum_diff = 100000000.0f;
for (int t = 0; t < grad_descent_iter; t++)
{
float dUx = 0, dUy = 0;
float diff = 0;
float sum_diff = 0.0f;
for (int pos_y = i - psz2; pos_y <= i + psz2; pos_y++)
for (int pos_x = j - psz2; pos_x <= j + psz2; pos_x++)
{
float pos_x_shifted = min(max(pos_x + cur_Ux, 0.0f), w - 1.0f - EPS);
float pos_y_shifted = min(max(pos_y + cur_Uy, 0.0f), h - 1.0f - EPS);
int pos_x_lower = (int)pos_x_shifted;
int pos_x_upper = pos_x_lower + 1;
int pos_y_lower = (int)pos_y_shifted;
int pos_y_upper = pos_y_lower + 1;
diff = (pos_x_shifted - pos_x_lower) * (pos_y_shifted - pos_y_lower) *
I1_ptr[pos_y_upper * w + pos_x_upper] +
(pos_x_upper - pos_x_shifted) * (pos_y_shifted - pos_y_lower) *
I1_ptr[pos_y_upper * w + pos_x_lower] +
(pos_x_shifted - pos_x_lower) * (pos_y_upper - pos_y_shifted) *
I1_ptr[pos_y_lower * w + pos_x_upper] +
(pos_x_upper - pos_x_shifted) * (pos_y_upper - pos_y_shifted) *
I1_ptr[pos_y_lower * w + pos_x_lower] -
I0_ptr[pos_y * w + pos_x];
sum_diff += diff * diff;
dUx += I0x_ptr[pos_y * w + pos_x] * diff;
dUy += I0y_ptr[pos_y * w + pos_x] * diff;
}
cur_Ux -= invH11 * dUx + invH12 * dUy;
cur_Uy -= invH12 * dUx + invH22 * dUy;
if (sum_diff > prev_sum_diff)
float invH11 = yy_ptr[is * dis->ws + js] / detH;
float invH12 = -xy_ptr[is * dis->ws + js] / detH;
float invH22 = xx_ptr[is * dis->ws + js] / detH;
float prev_SSD = INF, SSD;
float x_grad_sum = x_ptr[is * dis->ws + js];
float y_grad_sum = y_ptr[is * dis->ws + js];
for (int t = 0; t < num_inner_iter; t++)
{
INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
if (dis->use_mean_normalization)
SSD = processPatchMeanNorm(dUx, dUy, I0_ptr + i * dis->w + j,
I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * dis->w + j,
I0y_ptr + i * dis->w + j, dis->w, w_ext, w00, w01, w10, w11, psz,
x_grad_sum, y_grad_sum);
else
SSD = processPatch(dUx, dUy, I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j, dis->w, w_ext, w00, w01,
w10, w11, psz);
dx = invH11 * dUx + invH12 * dUy;
dy = invH12 * dUx + invH22 * dUy;
cur_Ux -= dx;
cur_Uy -= dy;
/* Break when patch distance stops decreasing */
if (SSD >= prev_SSD)
break;
prev_sum_diff = sum_diff;
prev_SSD = SSD;
}
if (norm(Vec2f(cur_Ux - Ux_ptr[i * w + j], cur_Uy - Uy_ptr[i * w + j])) <= patch_size)
/* If gradient descent converged to a flow vector that is very far from the initial approximation
* (more than patch size) then we don't use it. Noticeably improves the robustness.
*/
if (norm(Vec2f(cur_Ux - Sx_ptr[is * dis->ws + js], cur_Uy - Sy_ptr[is * dis->ws + js])) <= psz)
{
Sx_ptr[is * ws + js] = cur_Ux;
Sy_ptr[is * ws + js] = cur_Uy;
Sx_ptr[is * dis->ws + js] = cur_Ux;
Sy_ptr[is * dis->ws + js] = cur_Uy;
}
else
{
Sx_ptr[is * ws + js] = Ux_ptr[i * w + j];
Sy_ptr[is * ws + js] = Uy_ptr[i * w + j];
j += dir * dis->patch_stride;
}
j += patch_stride;
i += dir * dis->patch_stride;
}
i += patch_stride;
}
#undef INIT_BILINEAR_WEIGHTS
#undef COMPUTE_SSD
}
void DISOpticalFlowImpl::densification(Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy, Mat &I0, Mat &I1)
DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h,
Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy,
Mat &_I0, Mat &_I1)
: dis(&_dis), nstripes(_nstripes), h(_h), Ux(&dst_Ux), Uy(&dst_Uy), Sx(&src_Sx), Sy(&src_Sy), I0(&_I0), I1(&_I1)
{
float *Ux_ptr = dst_Ux.ptr<float>();
float *Uy_ptr = dst_Uy.ptr<float>();
float *Sx_ptr = src_Sx.ptr<float>();
float *Sy_ptr = src_Sy.ptr<float>();
uchar *I0_ptr = I0.ptr<uchar>();
uchar *I1_ptr = I1.ptr<uchar>();
int w = I0.cols;
int h = I0.rows;
int psz2 = patch_size / 2;
// width of the sparse OF fields:
int ws = 1 + (w - patch_size) / patch_stride;
int start_x;
int start_y;
start_y = psz2;
for (int i = 0; i < h; i++)
stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function transforms a sparse optical flow field obtained by PatchInverseSearch (which computes flow values
* on a sparse grid defined by patch_stride) into a dense optical flow field by weighted averaging of values from the
* overlapping patches.
*/
void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
{
int start_i = min(range.start * stripe_sz, h);
int end_i = min(range.end * stripe_sz, h);
/* Input sparse flow */
float *Sx_ptr = Sx->ptr<float>();
float *Sy_ptr = Sy->ptr<float>();
/* Output dense flow */
float *Ux_ptr = Ux->ptr<float>();
float *Uy_ptr = Uy->ptr<float>();
uchar *I0_ptr = I0->ptr<uchar>();
uchar *I1_ptr = I1->ptr<uchar>();
int psz = dis->patch_size;
int pstr = dis->patch_stride;
int i_l, i_u;
int j_l, j_u;
float i_m, j_m, diff;
/* These values define the set of sparse grid locations that contain patches overlapping with the current dense flow
* location */
int start_is, end_is;
int start_js, end_js;
/* Some helper macros for updating this set of sparse grid locations */
#define UPDATE_SPARSE_I_COORDINATES \
if (i % pstr == 0 && i + psz <= h) \
end_is++; \
if (i - psz >= 0 && (i - psz) % pstr == 0 && start_is < end_is) \
start_is++;
#define UPDATE_SPARSE_J_COORDINATES \
if (j % pstr == 0 && j + psz <= dis->w) \
end_js++; \
if (j - psz >= 0 && (j - psz) % pstr == 0 && start_js < end_js) \
start_js++;
start_is = 0;
end_is = -1;
for (int i = 0; i < start_i; i++)
{
if (i - psz2 > start_y && start_y + patch_stride < h - psz2)
start_y += patch_stride;
start_x = psz2;
for (int j = 0; j < w; j++)
UPDATE_SPARSE_I_COORDINATES;
}
for (int i = start_i; i < end_i; i++)
{
UPDATE_SPARSE_I_COORDINATES;
start_js = 0;
end_js = -1;
for (int j = 0; j < dis->w; j++)
{
UPDATE_SPARSE_J_COORDINATES;
float coef, sum_coef = 0.0f;
float sum_Ux = 0.0f;
float sum_Uy = 0.0f;
if (j - psz2 > start_x && start_x + patch_stride < w - psz2)
start_x += patch_stride;
for (int pos_y = start_y; pos_y <= min(i + psz2, h - psz2 - 1); pos_y += patch_stride)
for (int pos_x = start_x; pos_x <= min(j + psz2, w - psz2 - 1); pos_x += patch_stride)
/* Iterate through all the patches that overlap the current location (i,j) */
for (int is = start_is; is <= end_is; is++)
for (int js = start_js; js <= end_js; js++)
{
float diff;
int is = (pos_y - psz2) / patch_stride;
int js = (pos_x - psz2) / patch_stride;
float j_shifted = min(max(j + Sx_ptr[is * ws + js], 0.0f), w - 1.0f - EPS);
float i_shifted = min(max(i + Sy_ptr[is * ws + js], 0.0f), h - 1.0f - EPS);
int j_lower = (int)j_shifted;
int j_upper = j_lower + 1;
int i_lower = (int)i_shifted;
int i_upper = i_lower + 1;
diff = (j_shifted - j_lower) * (i_shifted - i_lower) * I1_ptr[i_upper * w + j_upper] +
(j_upper - j_shifted) * (i_shifted - i_lower) * I1_ptr[i_upper * w + j_lower] +
(j_shifted - j_lower) * (i_upper - i_shifted) * I1_ptr[i_lower * w + j_upper] +
(j_upper - j_shifted) * (i_upper - i_shifted) * I1_ptr[i_lower * w + j_lower] -
I0_ptr[i * w + j];
coef = 1 / max(1.0f, diff * diff);
sum_Ux += coef * Sx_ptr[is * ws + js];
sum_Uy += coef * Sy_ptr[is * ws + js];
j_m = min(max(j + Sx_ptr[is * dis->ws + js], 0.0f), dis->w - 1.0f - EPS);
i_m = min(max(i + Sy_ptr[is * dis->ws + js], 0.0f), dis->h - 1.0f - EPS);
j_l = (int)j_m;
j_u = j_l + 1;
i_l = (int)i_m;
i_u = i_l + 1;
diff = (j_m - j_l) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_u] +
(j_u - j_m) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_l] +
(j_m - j_l) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_u] +
(j_u - j_m) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_l] - I0_ptr[i * dis->w + j];
coef = 1 / max(1.0f, abs(diff));
sum_Ux += coef * Sx_ptr[is * dis->ws + js];
sum_Uy += coef * Sy_ptr[is * dis->ws + js];
sum_coef += coef;
}
Ux_ptr[i * w + j] = sum_Ux / sum_coef;
Uy_ptr[i * w + j] = sum_Uy / sum_coef;
Ux_ptr[i * dis->w + j] = sum_Ux / sum_coef;
Uy_ptr[i * dis->w + j] = sum_Uy / sum_coef;
}
}
#undef UPDATE_SPARSE_I_COORDINATES
#undef UPDATE_SPARSE_J_COORDINATES
}
void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
......@@ -419,12 +948,15 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
CV_Assert(I0.sameSize(I1));
CV_Assert(I0.isContinuous());
CV_Assert(I1.isContinuous());
Mat I0Mat = I0.getMat();
Mat I1Mat = I1.getMat();
flow.create(I1Mat.size(), CV_32FC2);
Mat &flowMat = flow.getMatRef();
coarsest_scale = (int)(log((2 * I0Mat.cols) / (4.0 * patch_size))/log(2.0) + 0.5) - 1;
coarsest_scale = (int)(log((2 * I0Mat.cols) / (4.0 * patch_size)) / log(2.0) + 0.5) - 1;
int num_stripes = getNumThreads();
prepareBuffers(I0Mat, I1Mat);
Ux[coarsest_scale].setTo(0.0f);
......@@ -432,9 +964,31 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
for (int i = coarsest_scale; i >= finest_scale; i--)
{
patchInverseSearch(Sx, Sy, Ux[i], Uy[i], I0s[i], I0xs[i], I0ys[i], I1s[i]);
densification(Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]);
// TODO: variational refinement step
w = I0s[i].cols;
h = I0s[i].rows;
ws = 1 + (w - patch_size) / patch_stride;
hs = 1 + (h - patch_size) / patch_stride;
precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x_buf, I0y_buf, I0xs[i], I0ys[i]);
if (use_spatial_propagation)
{
/* Use a fixed number of stripes regardless the number of threads to make inverse search
* with spatial propagation reproducible
*/
parallel_for_(Range(0, 8), PatchInverseSearch_ParBody(*this, 8, hs, Sx, Sy, Ux[i], Uy[i], I0s[i],
I1s_ext[i], I0xs[i], I0ys[i], 2));
}
else
{
parallel_for_(Range(0, num_stripes),
PatchInverseSearch_ParBody(*this, num_stripes, hs, Sx, Sy, Ux[i], Uy[i], I0s[i], I1s_ext[i],
I0xs[i], I0ys[i], 1));
}
parallel_for_(Range(0, num_stripes),
Densification_ParBody(*this, num_stripes, I0s[i].rows, Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]));
if (variational_refinement_iter > 0)
variational_refinement_processors[i]->calcUV(I0s[i], I1s[i], Ux[i], Uy[i]);
if (i > finest_scale)
{
......@@ -454,6 +1008,7 @@ void DISOpticalFlowImpl::collectGarbage()
{
I0s.clear();
I1s.clear();
I1s_ext.clear();
I0xs.clear();
I0ys.clear();
Ux.clear();
......@@ -467,8 +1022,39 @@ void DISOpticalFlowImpl::collectGarbage()
I0xx_buf_aux.release();
I0yy_buf_aux.release();
I0xy_buf_aux.release();
for (int i = finest_scale; i <= coarsest_scale; i++)
variational_refinement_processors[i]->collectGarbage();
variational_refinement_processors.clear();
}
Ptr<DISOpticalFlow> createOptFlow_DIS() { return makePtr<DISOpticalFlowImpl>(); }
/* Factory for a DIS optical flow instance configured by one of the predefined speed/quality
 * presets. All presets share an 8x8 patch size and differ in the finest pyramid scale, the patch
 * stride and the gradient-descent / variational-refinement iteration counts. An unrecognized
 * preset value leaves every parameter except the patch size at the implementation defaults. */
Ptr<DISOpticalFlow> createOptFlow_DIS(int preset)
{
    Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
    dis->setPatchSize(8);

    switch (preset)
    {
    case DISOpticalFlow::PRESET_ULTRAFAST:
        /* Fastest preset: coarse finest scale, large stride, no variational refinement */
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(12);
        dis->setVariationalRefinementIterations(0);
        break;
    case DISOpticalFlow::PRESET_FAST:
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(16);
        dis->setVariationalRefinementIterations(5);
        break;
    case DISOpticalFlow::PRESET_MEDIUM:
        /* Highest-quality preset of the three: finer scale, denser stride, more iterations */
        dis->setFinestScale(1);
        dis->setPatchStride(3);
        dis->setGradientDescentIterations(25);
        dis->setVariationalRefinementIterations(5);
        break;
    default:
        break;
    }
    return dis;
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/hal/intrin.hpp"
#include "precomp.hpp"
using namespace std;
namespace cv
{
namespace optflow
{
class VariationalRefinementImpl : public VariationalRefinement
{
public:
VariationalRefinementImpl();
void calc(InputArray I0, InputArray I1, InputOutputArray flow);
void calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v);
void collectGarbage();
protected: //!< algorithm parameters
    /* Number of outer (fixed-point) iterations and of inner SOR iterations per fixed-point step
     * (see the accessor docs in the public header) */
    int fixedPointIterations, sorIterations;
    float omega; //!< relaxation factor used by the SOR solver (presumably the classic SOR omega -- confirm)
    /* Weights of the terms of the energy functional E(U) described in the header docs:
     * delta - color constancy, gamma - gradient constancy, alpha - smoothness */
    float alpha, delta, gamma;
    /* NOTE(review): epsilon presumably corresponds to the eps in the robust penalizer
     * Psi(s^2) = sqrt(s^2 + eps^2); the role of zeta is not visible here -- confirm in the solver code */
    float zeta, epsilon;

public:
    int getFixedPointIterations() const { return fixedPointIterations; }
    void setFixedPointIterations(int val) { fixedPointIterations = val; }
    int getSorIterations() const { return sorIterations; }
    void setSorIterations(int val) { sorIterations = val; }
    float getOmega() const { return omega; }
    void setOmega(float val) { omega = val; }
    float getAlpha() const { return alpha; }
    void setAlpha(float val) { alpha = val; }
    float getDelta() const { return delta; }
    void setDelta(float val) { delta = val; }
    float getGamma() const { return gamma; }
    void setGamma(float val) { gamma = val; }
protected: //!< internal buffers
/* This struct defines a special data layout for Mat_<float>. Original buffer is split into two: one for "red"
* elements (sum of indices is even) and one for "black" (sum of indices is odd) in a checkerboard pattern. It
* allows for more efficient processing in SOR iterations, more natural SIMD vectorization and parallelization
* (Red-Black SOR). Additionally, it simplifies border handling by adding repeated borders to both red and
* black buffers.
*/
struct RedBlackBuffer
{
    Mat_<float> red;   //!< holds the elements with (i+j)%2==0
    Mat_<float> black; //!< holds the elements with (i+j)%2==1
    /* Width of even and odd rows may be different */
    int red_even_len, red_odd_len;
    int black_even_len, black_odd_len;
    void create(Size s); //!< allocates red/black planes for a source of size s (defined elsewhere in this file)
    void release();      //!< frees both planes (defined elsewhere in this file)
};
Mat_<float> Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz; //!< image derivative buffers
RedBlackBuffer Ix_rb, Iy_rb, Iz_rb, Ixx_rb, Ixy_rb, Iyy_rb, Ixz_rb, Iyz_rb; //!< corresponding red-black buffers
RedBlackBuffer A11, A12, A22, b1, b2; //!< main linear system coefficients
RedBlackBuffer weights; //!< smoothness term weights in the current fixed point iteration
Mat_<float> mapX, mapY; //!< auxiliary buffers for remapping
RedBlackBuffer tempW_u, tempW_v; //!< flow buffers that are modified in each fixed point iteration
RedBlackBuffer dW_u, dW_v; //!< optical flow increment
RedBlackBuffer W_u_rb, W_v_rb; //!< red-black-buffer version of the input flow
private: //!< private methods and parallel sections
    /* Conversion helpers between the ordinary Mat layout and the red-black layout: */
    void splitCheckerboard(RedBlackBuffer &dst, Mat &src);
    void mergeCheckerboard(Mat &dst, RedBlackBuffer &src);
    void updateRepeatedBorders(RedBlackBuffer &dst);
    void warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &flow_v);
    void prepareBuffers(Mat &I0, Mat &I1, Mat &W_u, Mat &W_v);
    /* Parallelizing arbitrary operations with 3 input/output arguments */
    typedef void (VariationalRefinementImpl::*Op)(void *op1, void *op2, void *op3);
    /* Runs an arbitrary list of member-function "ops" in parallel, one op per Range element */
    struct ParallelOp_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        vector<Op> ops;      //!< operations to execute
        vector<void *> op1s; //!< first argument of each op
        vector<void *> op2s; //!< second argument of each op
        vector<void *> op3s; //!< third argument of each op
        ParallelOp_ParBody(VariationalRefinementImpl &_var, vector<Op> _ops, vector<void *> &_op1s,
                           vector<void *> &_op2s, vector<void *> &_op3s);
        void operator()(const Range &range) const;
    };
    /* Computes the horizontal (x) derivative of src into dst and additionally stores it in
     * red-black layout in dst_split. void* arguments are required by the Op mechanism above.
     */
    void gradHorizAndSplitOp(void *src, void *dst, void *dst_split)
    {
        Sobel(*(Mat *)src, *(Mat *)dst, -1, 1, 0, 1, 1, 0.00, BORDER_REPLICATE);
        splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst);
    }
    /* Same as gradHorizAndSplitOp, but for the vertical (y) derivative. */
    void gradVertAndSplitOp(void *src, void *dst, void *dst_split)
    {
        Sobel(*(Mat *)src, *(Mat *)dst, -1, 0, 1, 1, 1, 0.00, BORDER_REPLICATE);
        splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst);
    }
    /* dst = (src1 + src2) / 2, output in CV_32F. Used through the Op mechanism. */
    void averageOp(void *src1, void *src2, void *dst)
    {
        addWeighted(*(Mat *)src1, 0.5, *(Mat *)src2, 0.5, 0.0, *(Mat *)dst, CV_32F);
    }
    /* dst = src1 - src2, output in CV_32F. Used through the Op mechanism. */
    void subtractOp(void *src1, void *src2, void *dst)
    {
        subtract(*(Mat *)src1, *(Mat *)src2, *(Mat *)dst, noArray(), CV_32F);
    }
    /* Parallel body that adds the data-term (color and gradient constancy) contributions
     * to the linear system coefficients. Each pass processes either the red or the black
     * cells only (selected by red_pass), in horizontal stripes of rows.
     */
    struct ComputeDataTerm_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        int nstripes, stripe_sz; //!< number of parallel stripes and rows per stripe
        int h;                   //!< number of image rows
        RedBlackBuffer *dW_u, *dW_v; //!< current flow increments
        bool red_pass;           //!< true - process red cells, false - black cells
        ComputeDataTerm_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_dW_u,
                                RedBlackBuffer &_dW_v, bool _red_pass);
        void operator()(const Range &range) const;
    };
    /* Parallel body for the horizontal pass of the smoothness term: computes the
     * smoothness weights for the current fixed-point iteration and adds the
     * horizontal-neighbor contributions to the linear system coefficients.
     */
    struct ComputeSmoothnessTermHorPass_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        int nstripes, stripe_sz; //!< number of parallel stripes and rows per stripe
        int h;                   //!< number of image rows
        RedBlackBuffer *W_u, *W_v;       //!< initial flow
        RedBlackBuffer *curW_u, *curW_v; //!< flow on the current fixed-point iteration
        bool red_pass;           //!< true - process red cells, false - black cells
        ComputeSmoothnessTermHorPass_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h,
                                             RedBlackBuffer &_W_u, RedBlackBuffer &_W_v, RedBlackBuffer &_tempW_u,
                                             RedBlackBuffer &_tempW_v, bool _red_pass);
        void operator()(const Range &range) const;
    };
struct ComputeSmoothnessTermVertPass_ParBody : public ParallelLoopBody
{
VariationalRefinementImpl *var;
int nstripes, stripe_sz;
int h;
RedBlackBuffer *W_u, *W_v;
bool red_pass;
ComputeSmoothnessTermVertPass_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h,
RedBlackBuffer &W_u, RedBlackBuffer &_W_v, bool _red_pass);
void operator()(const Range &range) const;
};
    /* Parallel body for one red or black half-step of the Red-Black SOR solver that
     * updates the flow increments dW_u, dW_v in place.
     */
    struct RedBlackSOR_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        int nstripes, stripe_sz; //!< number of parallel stripes and rows per stripe
        int h;                   //!< number of image rows
        RedBlackBuffer *dW_u, *dW_v; //!< flow increments being solved for
        bool red_pass;           //!< true - update red cells, false - black cells
        RedBlackSOR_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_dW_u,
                            RedBlackBuffer &_dW_v, bool _red_pass);
        void operator()(const Range &range) const;
    };
};
/* Default parameter values (see the VariationalRefinement interface documentation
 * and @cite Brox2004 for the meaning of each term):
 */
VariationalRefinementImpl::VariationalRefinementImpl()
{
    fixedPointIterations = 5; /* outer (fixed-point) iterations */
    sorIterations = 5;        /* inner SOR iterations per fixed-point iteration */
    alpha = 20.0f;            /* smoothness term weight */
    delta = 5.0f;             /* color constancy term weight */
    gamma = 10.0f;            /* gradient constancy term weight */
    omega = 1.6f;             /* SOR relaxation factor */
    zeta = 0.1f;              /* additive constant in derivative normalization */
    epsilon = 0.001f;         /* robust penalizer constant: Psi(s^2)=sqrt(s^2+epsilon^2) */
}
/* This function converts an input Mat into the RedBlackBuffer format, which involves
 * splitting the input buffer into two and adding repeated borders. Assumes that enough
 * memory in dst is already allocated.
 */
void VariationalRefinementImpl::splitCheckerboard(RedBlackBuffer &dst, Mat &src)
{
    int buf_j, j;
    int buf_w = (int)ceil(src.cols / 2.0) + 2; //!< max width of red/black buffers with borders
    /* Rows of red and black buffers can have different actual width, some extra repeated values are
     * added for padding in such cases.
     */
    for (int i = 0; i < src.rows; i++)
    {
        float *src_buf = src.ptr<float>(i);
        /* +1 skips the top border row of the destination buffers: */
        float *r_buf = dst.red.ptr<float>(i + 1);
        float *b_buf = dst.black.ptr<float>(i + 1);
        r_buf[0] = b_buf[0] = src_buf[0]; /* repeated left border */
        buf_j = 1;
        if (i % 2 == 0)
        {
            /* Even rows: even source columns are red, odd columns are black */
            for (j = 0; j < src.cols - 1; j += 2)
            {
                r_buf[buf_j] = src_buf[j];
                b_buf[buf_j] = src_buf[j + 1];
                buf_j++;
            }
            if (j < src.cols)
                r_buf[buf_j] = b_buf[buf_j] = src_buf[j]; /* odd width: duplicate the last element as padding */
            else
                j--; /* point j back at the last valid source element for the right border below */
        }
        else
        {
            /* Odd rows: even source columns are black, odd columns are red */
            for (j = 0; j < src.cols - 1; j += 2)
            {
                b_buf[buf_j] = src_buf[j];
                r_buf[buf_j] = src_buf[j + 1];
                buf_j++;
            }
            if (j < src.cols)
                r_buf[buf_j] = b_buf[buf_j] = src_buf[j];
            else
                j--;
        }
        r_buf[buf_w - 1] = b_buf[buf_w - 1] = src_buf[j]; /* repeated right border */
    }
    /* Fill top and bottom borders: */
    /* Note the red/black swap: the repeated border row has flipped (i+j) parity,
     * so the red border is copied from the black plane and vice versa. */
    {
        float *r_buf_border = dst.red.ptr<float>(dst.red.rows - 1);
        float *b_buf_border = dst.black.ptr<float>(dst.black.rows - 1);
        float *r_buf = dst.red.ptr<float>(dst.red.rows - 2);
        float *b_buf = dst.black.ptr<float>(dst.black.rows - 2);
        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
    }
    {
        float *r_buf_border = dst.red.ptr<float>(0);
        float *b_buf_border = dst.black.ptr<float>(0);
        float *r_buf = dst.red.ptr<float>(1);
        float *b_buf = dst.black.ptr<float>(1);
        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
    }
}
/* The inverse of splitCheckerboard, i.e. converting the RedBlackBuffer back into Mat.
 * Assumes that enough memory in dst is already allocated. Border elements of src are
 * simply skipped.
 */
void VariationalRefinementImpl::mergeCheckerboard(Mat &dst, RedBlackBuffer &src)
{
    int buf_j, j;
    for (int i = 0; i < dst.rows; i++)
    {
        /* +1 skips the border row/column of the red-black buffers: */
        float *src_r_buf = src.red.ptr<float>(i + 1);
        float *src_b_buf = src.black.ptr<float>(i + 1);
        float *dst_buf = dst.ptr<float>(i);
        buf_j = 1;
        if (i % 2 == 0)
        {
            /* Even rows: red elements go to even destination columns */
            for (j = 0; j < dst.cols - 1; j += 2)
            {
                dst_buf[j] = src_r_buf[buf_j];
                dst_buf[j + 1] = src_b_buf[buf_j];
                buf_j++;
            }
            if (j < dst.cols) /* odd width: one trailing element left */
                dst_buf[j] = src_r_buf[buf_j];
        }
        else
        {
            /* Odd rows: black elements go to even destination columns */
            for (j = 0; j < dst.cols - 1; j += 2)
            {
                dst_buf[j] = src_b_buf[buf_j];
                dst_buf[j + 1] = src_r_buf[buf_j];
                buf_j++;
            }
            if (j < dst.cols)
                dst_buf[j] = src_b_buf[buf_j];
        }
    }
}
/* An auxiliary function that updates the borders. Used to enforce that border values repeat
 * the ones adjacent to the border, after the interior of a buffer has been modified.
 */
void VariationalRefinementImpl::updateRepeatedBorders(RedBlackBuffer &dst)
{
    int buf_w = dst.red.cols;
    /* Left and right borders; which plane holds the first/last interior element
     * depends on the row parity and on whether row lengths differ (odd widths): */
    for (int i = 0; i < dst.red.rows - 2; i++)
    {
        float *r_buf = dst.red.ptr<float>(i + 1);
        float *b_buf = dst.black.ptr<float>(i + 1);
        if (i % 2 == 0)
        {
            /* Even rows start with a red element, so the black left border repeats it: */
            b_buf[0] = r_buf[1];
            if (dst.red_even_len > dst.black_even_len)
                b_buf[dst.black_even_len + 1] = r_buf[dst.red_even_len];
            else
                r_buf[dst.red_even_len + 1] = b_buf[dst.black_even_len];
        }
        else
        {
            /* Odd rows start with a black element: */
            r_buf[0] = b_buf[1];
            if (dst.red_odd_len < dst.black_odd_len)
                r_buf[dst.red_odd_len + 1] = b_buf[dst.black_odd_len];
            else
                b_buf[dst.black_odd_len + 1] = r_buf[dst.red_odd_len];
        }
    }
    /* Top and bottom borders; red and black are swapped because the repeated
     * border row has flipped (i+j) parity: */
    {
        float *r_buf_border = dst.red.ptr<float>(dst.red.rows - 1);
        float *b_buf_border = dst.black.ptr<float>(dst.black.rows - 1);
        float *r_buf = dst.red.ptr<float>(dst.red.rows - 2);
        float *b_buf = dst.black.ptr<float>(dst.black.rows - 2);
        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
    }
    {
        float *r_buf_border = dst.red.ptr<float>(0);
        float *b_buf_border = dst.black.ptr<float>(0);
        float *r_buf = dst.red.ptr<float>(1);
        float *b_buf = dst.black.ptr<float>(1);
        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
    }
}
/* Allocates the red and black planes (with a one-element border on every side) for a
 * source image of size s and records the valid row lengths of both planes.
 */
void VariationalRefinementImpl::RedBlackBuffer::create(Size s)
{
    /* Each plane stores roughly half of the columns plus two border columns: */
    const int stride = (int)ceil(s.width / 2.0) + 2;
    red.create(s.height + 2, stride);
    black.create(s.height + 2, stride);
    /* Red elements of even rows and black elements of odd rows always occupy
     * ceil(width/2) cells: */
    red_even_len = black_odd_len = stride - 2;
    /* Their counterparts hold one element less when the source width is odd: */
    if (s.width % 2 == 0)
        red_odd_len = black_even_len = stride - 2;
    else
        red_odd_len = black_even_len = stride - 3;
}
/* Releases the memory of both checkerboard planes. */
void VariationalRefinementImpl::RedBlackBuffer::release()
{
    red.release();
    black.release();
}
/* Stores the operation list and the three per-operation argument lists; the vectors
 * are expected to have equal lengths (one entry per Range element).
 */
VariationalRefinementImpl::ParallelOp_ParBody::ParallelOp_ParBody(VariationalRefinementImpl &_var, vector<Op> _ops,
                                                                  vector<void *> &_op1s, vector<void *> &_op2s,
                                                                  vector<void *> &_op3s)
    : var(&_var), ops(_ops), op1s(_op1s), op2s(_op2s), op3s(_op3s)
{
}
/* Executes the requested slice of the stored operation list, one member-function
 * pointer call per index.
 */
void VariationalRefinementImpl::ParallelOp_ParBody::operator()(const Range &range) const
{
    for (int idx = range.start; idx < range.end; idx++)
    {
        VariationalRefinementImpl::Op op = ops[idx];
        (var->*op)(op1s[idx], op2s[idx], op3s[idx]);
    }
}
/* Warps src with the given flow field: builds absolute sampling maps (mapX, mapY)
 * from the relative flow components and remaps src into dst with bilinear
 * interpolation and replicated borders.
 */
void VariationalRefinementImpl::warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &flow_v)
{
    const int rows = flow_u.rows;
    const int cols = flow_u.cols;
    for (int y = 0; y < rows; y++)
    {
        const float *u_row = flow_u.ptr<float>(y);
        const float *v_row = flow_v.ptr<float>(y);
        float *map_x_row = mapX.ptr<float>(y);
        float *map_y_row = mapY.ptr<float>(y);
        for (int x = 0; x < cols; x++)
        {
            /* Absolute destination coordinate = pixel coordinate + flow: */
            map_x_row[x] = x + u_row[x];
            map_y_row[x] = y + v_row[x];
        }
    }
    remap(src, dst, mapX, mapY, INTER_LINEAR, BORDER_REPLICATE);
}
/* Allocates all internal buffers for a frame pair of the given size, warps I1 towards I0
 * with the initial flow (W_u, W_v) and computes all required image derivatives (first- and
 * second-order), both in ordinary Mat layout and in red-black layout.
 */
void VariationalRefinementImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &W_u, Mat &W_v)
{
    Size s = I0.size();
    A11.create(s);
    A12.create(s);
    A22.create(s);
    b1.create(s);
    b2.create(s);
    weights.create(s);
    /* Weights must start from zero - they are accumulated by the smoothness passes: */
    weights.red.setTo(0.0f);
    weights.black.setTo(0.0f);
    tempW_u.create(s);
    tempW_v.create(s);
    dW_u.create(s);
    dW_v.create(s);
    W_u_rb.create(s);
    W_v_rb.create(s);
    Ix.create(s);
    Iy.create(s);
    Iz.create(s);
    Ixx.create(s);
    Ixy.create(s);
    Iyy.create(s);
    Ixz.create(s);
    Iyz.create(s);
    Ix_rb.create(s);
    Iy_rb.create(s);
    Iz_rb.create(s);
    Ixx_rb.create(s);
    Ixy_rb.create(s);
    Iyy_rb.create(s);
    Ixz_rb.create(s);
    Iyz_rb.create(s);
    mapX.create(s);
    mapY.create(s);
    /* Floating point warps work significantly better than fixed-point */
    Mat I1flt, warpedI;
    I1.convertTo(I1flt, CV_32F);
    warpImage(warpedI, I1flt, W_u, W_v);
    /* Computing an average of the current and warped next frames (to compute the derivatives on) and
     * temporal derivative Iz
     */
    Mat averagedI;
    {
        /* The two independent operations are executed in parallel via ParallelOp_ParBody: */
        vector<void *> op1s;
        op1s.push_back((void *)&I0);
        op1s.push_back((void *)&warpedI);
        vector<void *> op2s;
        op2s.push_back((void *)&warpedI);
        op2s.push_back((void *)&I0);
        vector<void *> op3s;
        op3s.push_back((void *)&averagedI);
        op3s.push_back((void *)&Iz);
        vector<Op> ops;
        ops.push_back(&VariationalRefinementImpl::averageOp);
        ops.push_back(&VariationalRefinementImpl::subtractOp);
        parallel_for_(Range(0, 2), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
    }
    splitCheckerboard(Iz_rb, Iz);
    /* Computing first-order derivatives */
    {
        /* Ix, Iy from the averaged frame; Ixz, Iyz as spatial derivatives of Iz: */
        vector<void *> op1s;
        op1s.push_back((void *)&averagedI);
        op1s.push_back((void *)&averagedI);
        op1s.push_back((void *)&Iz);
        op1s.push_back((void *)&Iz);
        vector<void *> op2s;
        op2s.push_back((void *)&Ix);
        op2s.push_back((void *)&Iy);
        op2s.push_back((void *)&Ixz);
        op2s.push_back((void *)&Iyz);
        vector<void *> op3s;
        op3s.push_back((void *)&Ix_rb);
        op3s.push_back((void *)&Iy_rb);
        op3s.push_back((void *)&Ixz_rb);
        op3s.push_back((void *)&Iyz_rb);
        vector<Op> ops;
        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
        parallel_for_(Range(0, 4), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
    }
    /* Computing second-order derivatives */
    {
        /* Ixx = d(Ix)/dx, Ixy = d(Ix)/dy, Iyy = d(Iy)/dy: */
        vector<void *> op1s;
        op1s.push_back((void *)&Ix);
        op1s.push_back((void *)&Ix);
        op1s.push_back((void *)&Iy);
        vector<void *> op2s;
        op2s.push_back((void *)&Ixx);
        op2s.push_back((void *)&Ixy);
        op2s.push_back((void *)&Iyy);
        vector<void *> op3s;
        op3s.push_back((void *)&Ixx_rb);
        op3s.push_back((void *)&Ixy_rb);
        op3s.push_back((void *)&Iyy_rb);
        vector<Op> ops;
        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
        parallel_for_(Range(0, 3), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
    }
}
/* Stores the references to the current flow increments and precomputes the number
 * of rows processed by each parallel stripe.
 */
VariationalRefinementImpl::ComputeDataTerm_ParBody::ComputeDataTerm_ParBody(VariationalRefinementImpl &_var,
                                                                            int _nstripes, int _h,
                                                                            RedBlackBuffer &_dW_u,
                                                                            RedBlackBuffer &_dW_v, bool _red_pass)
    : var(&_var), nstripes(_nstripes), h(_h), dW_u(&_dW_u), dW_v(&_dW_v), red_pass(_red_pass)
{
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function computes parts of the main linear system coefficients A11,A12,A22,b1,b2
 * that correspond to the data term, which includes color and gradient constancy assumptions.
 * Only the red or the black cells (depending on red_pass) of the assigned row stripe are written.
 */
void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range &range) const
{
    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);
    float zeta_squared = var->zeta * var->zeta;
    float epsilon_squared = var->epsilon * var->epsilon;
    float gamma2 = var->gamma / 2;
    float delta2 = var->delta / 2;
    float *pIx, *pIy, *pIz;
    float *pIxx, *pIxy, *pIyy, *pIxz, *pIyz;
    float *pdU, *pdV;
    float *pa11, *pa12, *pa22, *pb1, *pb2;
    float derivNorm, derivNorm2;
    float Ik1z, Ik1zx, Ik1zy;
    float weight;
    int len;
    for (int i = start_i; i < end_i; i++)
    {
/* Sets up row pointers into all involved buffers for the selected (red or black) plane;
 * +1 offsets skip the border row/column of the red-black layout. */
#define INIT_ROW_POINTERS(color)                                \
    pIx = var->Ix_rb.color.ptr<float>(i + 1) + 1;               \
    pIy = var->Iy_rb.color.ptr<float>(i + 1) + 1;               \
    pIz = var->Iz_rb.color.ptr<float>(i + 1) + 1;               \
    pIxx = var->Ixx_rb.color.ptr<float>(i + 1) + 1;             \
    pIxy = var->Ixy_rb.color.ptr<float>(i + 1) + 1;             \
    pIyy = var->Iyy_rb.color.ptr<float>(i + 1) + 1;             \
    pIxz = var->Ixz_rb.color.ptr<float>(i + 1) + 1;             \
    pIyz = var->Iyz_rb.color.ptr<float>(i + 1) + 1;             \
    pa11 = var->A11.color.ptr<float>(i + 1) + 1;                \
    pa12 = var->A12.color.ptr<float>(i + 1) + 1;                \
    pa22 = var->A22.color.ptr<float>(i + 1) + 1;                \
    pb1 = var->b1.color.ptr<float>(i + 1) + 1;                  \
    pb2 = var->b2.color.ptr<float>(i + 1) + 1;                  \
    pdU = dW_u->color.ptr<float>(i + 1) + 1;                    \
    pdV = dW_v->color.ptr<float>(i + 1) + 1;                    \
    if (i % 2 == 0)                                             \
        len = var->Ix_rb.color##_even_len;                      \
    else                                                        \
        len = var->Ix_rb.color##_odd_len;
        if (red_pass)
        {
            INIT_ROW_POINTERS(red);
        }
        else
        {
            INIT_ROW_POINTERS(black);
        }
#undef INIT_ROW_POINTERS
        int j = 0;
#ifdef CV_SIMD128
        /* Vectorized version (4 floats at a time) of the scalar loop below: */
        v_float32x4 zeta_vec = v_setall_f32(zeta_squared);
        v_float32x4 eps_vec = v_setall_f32(epsilon_squared);
        v_float32x4 delta_vec = v_setall_f32(delta2);
        v_float32x4 gamma_vec = v_setall_f32(gamma2);
        v_float32x4 zero_vec = v_setall_f32(0.0f);
        v_float32x4 pIx_vec, pIy_vec, pIz_vec, pdU_vec, pdV_vec;
        v_float32x4 pIxx_vec, pIxy_vec, pIyy_vec, pIxz_vec, pIyz_vec;
        v_float32x4 derivNorm_vec, derivNorm2_vec, weight_vec;
        v_float32x4 Ik1z_vec, Ik1zx_vec, Ik1zy_vec;
        v_float32x4 pa11_vec, pa12_vec, pa22_vec, pb1_vec, pb2_vec;
        for (; j < len - 3; j += 4)
        {
            pIx_vec = v_load(pIx + j);
            pIy_vec = v_load(pIy + j);
            pIz_vec = v_load(pIz + j);
            pdU_vec = v_load(pdU + j);
            pdV_vec = v_load(pdV + j);
            derivNorm_vec = pIx_vec * pIx_vec + pIy_vec * pIy_vec + zeta_vec;
            Ik1z_vec = pIz_vec + pIx_vec * pdU_vec + pIy_vec * pdV_vec;
            weight_vec = (delta_vec / v_sqrt(Ik1z_vec * Ik1z_vec / derivNorm_vec + eps_vec)) / derivNorm_vec;
            pa11_vec = weight_vec * (pIx_vec * pIx_vec) + zeta_vec;
            pa12_vec = weight_vec * (pIx_vec * pIy_vec);
            pa22_vec = weight_vec * (pIy_vec * pIy_vec) + zeta_vec;
            pb1_vec = zero_vec - weight_vec * (pIz_vec * pIx_vec);
            pb2_vec = zero_vec - weight_vec * (pIz_vec * pIy_vec);
            pIxx_vec = v_load(pIxx + j);
            pIxy_vec = v_load(pIxy + j);
            pIyy_vec = v_load(pIyy + j);
            pIxz_vec = v_load(pIxz + j);
            pIyz_vec = v_load(pIyz + j);
            derivNorm_vec = pIxx_vec * pIxx_vec + pIxy_vec * pIxy_vec + zeta_vec;
            derivNorm2_vec = pIyy_vec * pIyy_vec + pIxy_vec * pIxy_vec + zeta_vec;
            Ik1zx_vec = pIxz_vec + pIxx_vec * pdU_vec + pIxy_vec * pdV_vec;
            Ik1zy_vec = pIyz_vec + pIxy_vec * pdU_vec + pIyy_vec * pdV_vec;
            weight_vec = gamma_vec / v_sqrt(Ik1zx_vec * Ik1zx_vec / derivNorm_vec +
                                            Ik1zy_vec * Ik1zy_vec / derivNorm2_vec + eps_vec);
            pa11_vec += weight_vec * (pIxx_vec * pIxx_vec / derivNorm_vec + pIxy_vec * pIxy_vec / derivNorm2_vec);
            pa12_vec += weight_vec * (pIxx_vec * pIxy_vec / derivNorm_vec + pIxy_vec * pIyy_vec / derivNorm2_vec);
            pa22_vec += weight_vec * (pIxy_vec * pIxy_vec / derivNorm_vec + pIyy_vec * pIyy_vec / derivNorm2_vec);
            pb1_vec -= weight_vec * (pIxx_vec * pIxz_vec / derivNorm_vec + pIxy_vec * pIyz_vec / derivNorm2_vec);
            pb2_vec -= weight_vec * (pIxy_vec * pIxz_vec / derivNorm_vec + pIyy_vec * pIyz_vec / derivNorm2_vec);
            v_store(pa11 + j, pa11_vec);
            v_store(pa12 + j, pa12_vec);
            v_store(pa22 + j, pa22_vec);
            v_store(pb1 + j, pb1_vec);
            v_store(pb2 + j, pb2_vec);
        }
#endif
        /* Scalar tail (and the full loop when SIMD is unavailable): */
        for (; j < len; j++)
        {
            /* Step 1: Compute color constancy terms */
            /* Normalization factor:*/
            derivNorm = pIx[j] * pIx[j] + pIy[j] * pIy[j] + zeta_squared;
            /* Color constancy penalty (computed by Taylor expansion):*/
            Ik1z = pIz[j] + pIx[j] * pdU[j] + pIy[j] * pdV[j];
            /* Weight of the color constancy term in the current fixed-point iteration divided by derivNorm: */
            weight = (delta2 / sqrt(Ik1z * Ik1z / derivNorm + epsilon_squared)) / derivNorm;
            /* Add respective color constancy terms to the linear system coefficients: */
            pa11[j] = weight * (pIx[j] * pIx[j]) + zeta_squared;
            pa12[j] = weight * (pIx[j] * pIy[j]);
            pa22[j] = weight * (pIy[j] * pIy[j]) + zeta_squared;
            pb1[j] = -weight * (pIz[j] * pIx[j]);
            pb2[j] = -weight * (pIz[j] * pIy[j]);
            /* Step 2: Compute gradient constancy terms */
            /* Normalization factor for x gradient: */
            derivNorm = pIxx[j] * pIxx[j] + pIxy[j] * pIxy[j] + zeta_squared;
            /* Normalization factor for y gradient: */
            derivNorm2 = pIyy[j] * pIyy[j] + pIxy[j] * pIxy[j] + zeta_squared;
            /* Gradient constancy penalties (computed by Taylor expansion): */
            Ik1zx = pIxz[j] + pIxx[j] * pdU[j] + pIxy[j] * pdV[j];
            Ik1zy = pIyz[j] + pIxy[j] * pdU[j] + pIyy[j] * pdV[j];
            /* Weight of the gradient constancy term in the current fixed-point iteration: */
            weight = gamma2 / sqrt(Ik1zx * Ik1zx / derivNorm + Ik1zy * Ik1zy / derivNorm2 + epsilon_squared);
            /* Add respective gradient constancy components to the linear system coefficients: */
            pa11[j] += weight * (pIxx[j] * pIxx[j] / derivNorm + pIxy[j] * pIxy[j] / derivNorm2);
            pa12[j] += weight * (pIxx[j] * pIxy[j] / derivNorm + pIxy[j] * pIyy[j] / derivNorm2);
            pa22[j] += weight * (pIxy[j] * pIxy[j] / derivNorm + pIyy[j] * pIyy[j] / derivNorm2);
            pb1[j] += -weight * (pIxx[j] * pIxz[j] / derivNorm + pIxy[j] * pIyz[j] / derivNorm2);
            pb2[j] += -weight * (pIxy[j] * pIxz[j] / derivNorm + pIyy[j] * pIyz[j] / derivNorm2);
        }
    }
}
/* Stores references to the initial flow and the current fixed-point-iteration flow,
 * and precomputes the number of rows processed by each parallel stripe.
 */
VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::ComputeSmoothnessTermHorPass_ParBody(
  VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_W_u, RedBlackBuffer &_W_v,
  RedBlackBuffer &_tempW_u, RedBlackBuffer &_tempW_v, bool _red_pass)
    : var(&_var), nstripes(_nstripes), h(_h), W_u(&_W_u), W_v(&_W_v), curW_u(&_tempW_u), curW_v(&_tempW_v),
      red_pass(_red_pass)
{
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function updates the linear system coefficients A11,A22,b1,b2 according to the
 * flow smoothness term and computes corresponding weights for the current fixed point iteration.
 * A11,A22,b1,b2 are updated only partially (horizontal pass). Doing both horizontal and vertical
 * passes in one loop complicates parallelization (different threads write to the same elements).
 */
void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator()(const Range &range) const
{
    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);
    float epsilon_squared = var->epsilon * var->epsilon;
    float alpha2 = var->alpha / 2;
    float *pWeight;
    float *pA_u, *pA_u_next, *pA_v, *pA_v_next;
    float *pB_u, *pB_u_next, *pB_v, *pB_v_next;
    float *cW_u, *cW_u_next, *cW_u_next_row;
    float *cW_v, *cW_v_next, *cW_v_next_row;
    float *pW_u, *pW_u_next;
    float *pW_v, *pW_v_next;
    float ux, uy, vx, vy;
    int len;
    bool touches_right_border = true;
/* Sets up row pointers for the current plane and its horizontal/vertical neighbors.
 * The horizontal neighbor of a red element lives in the black plane (and vice versa),
 * at an offset that depends on the row parity - hence next_offs_even/next_offs_odd.
 * touches_right_border tells whether the last element of this row lies on the image
 * border (in which case its horizontal update must be skipped). */
#define INIT_ROW_POINTERS(cur_color, next_color, next_offs_even, next_offs_odd, bool_default)   \
    pWeight = var->weights.cur_color.ptr<float>(i + 1) + 1;                                     \
    pA_u = var->A11.cur_color.ptr<float>(i + 1) + 1;                                            \
    pB_u = var->b1.cur_color.ptr<float>(i + 1) + 1;                                             \
    cW_u = curW_u->cur_color.ptr<float>(i + 1) + 1;                                             \
    pW_u = W_u->cur_color.ptr<float>(i + 1) + 1;                                                \
    pA_v = var->A22.cur_color.ptr<float>(i + 1) + 1;                                            \
    pB_v = var->b2.cur_color.ptr<float>(i + 1) + 1;                                             \
    cW_v = curW_v->cur_color.ptr<float>(i + 1) + 1;                                             \
    pW_v = W_v->cur_color.ptr<float>(i + 1) + 1;                                                \
                                                                                                \
    cW_u_next_row = curW_u->next_color.ptr<float>(i + 2) + 1;                                   \
    cW_v_next_row = curW_v->next_color.ptr<float>(i + 2) + 1;                                   \
                                                                                                \
    if (i % 2 == 0)                                                                             \
    {                                                                                           \
        pA_u_next = var->A11.next_color.ptr<float>(i + 1) + next_offs_even;                     \
        pB_u_next = var->b1.next_color.ptr<float>(i + 1) + next_offs_even;                      \
        cW_u_next = curW_u->next_color.ptr<float>(i + 1) + next_offs_even;                      \
        pW_u_next = W_u->next_color.ptr<float>(i + 1) + next_offs_even;                         \
        pA_v_next = var->A22.next_color.ptr<float>(i + 1) + next_offs_even;                     \
        pB_v_next = var->b2.next_color.ptr<float>(i + 1) + next_offs_even;                      \
        cW_v_next = curW_v->next_color.ptr<float>(i + 1) + next_offs_even;                      \
        pW_v_next = W_v->next_color.ptr<float>(i + 1) + next_offs_even;                         \
        len = var->A11.cur_color##_even_len;                                                    \
        if (var->A11.cur_color##_even_len != var->A11.cur_color##_odd_len)                      \
            touches_right_border = bool_default;                                               \
        else                                                                                    \
            touches_right_border = !bool_default;                                              \
    }                                                                                           \
    else                                                                                        \
    {                                                                                           \
        pA_u_next = var->A11.next_color.ptr<float>(i + 1) + next_offs_odd;                      \
        pB_u_next = var->b1.next_color.ptr<float>(i + 1) + next_offs_odd;                       \
        cW_u_next = curW_u->next_color.ptr<float>(i + 1) + next_offs_odd;                       \
        pW_u_next = W_u->next_color.ptr<float>(i + 1) + next_offs_odd;                          \
        pA_v_next = var->A22.next_color.ptr<float>(i + 1) + next_offs_odd;                      \
        pB_v_next = var->b2.next_color.ptr<float>(i + 1) + next_offs_odd;                       \
        cW_v_next = curW_v->next_color.ptr<float>(i + 1) + next_offs_odd;                       \
        pW_v_next = W_v->next_color.ptr<float>(i + 1) + next_offs_odd;                          \
        len = var->A11.cur_color##_odd_len;                                                     \
        if (var->A11.cur_color##_even_len != var->A11.cur_color##_odd_len)                      \
            touches_right_border = !bool_default;                                              \
        else                                                                                    \
            touches_right_border = bool_default;                                               \
    }
    for (int i = start_i; i < end_i; i++)
    {
        if (red_pass)
        {
            INIT_ROW_POINTERS(red, black, 1, 2, true);
        }
        else
        {
            INIT_ROW_POINTERS(black, red, 2, 1, false);
        }
#undef INIT_ROW_POINTERS
/* Computes the smoothness weight from the current-iteration flow gradients and the
 * weighted horizontal gradients of the initial flow: */
#define COMPUTE                                                                             \
    /* Gradients for the flow on the current fixed-point iteration: */                     \
    ux = cW_u_next[j] - cW_u[j];                                                            \
    vx = cW_v_next[j] - cW_v[j];                                                            \
    uy = cW_u_next_row[j] - cW_u[j];                                                        \
    vy = cW_v_next_row[j] - cW_v[j];                                                        \
    /* Weight of the smoothness term in the current fixed-point iteration: */              \
    pWeight[j] = alpha2 / sqrt(ux * ux + vx * vx + uy * uy + vy * vy + epsilon_squared);    \
    /* Gradients for initial raw flow multiplied by weight:*/                              \
    ux = pWeight[j] * (pW_u_next[j] - pW_u[j]);                                             \
    vx = pWeight[j] * (pW_v_next[j] - pW_v[j]);
/* Adds the horizontal-neighbor contribution to both the current element and its neighbor: */
#define UPDATE                                                                              \
    pB_u[j] += ux;                                                                          \
    pA_u[j] += pWeight[j];                                                                  \
    pB_v[j] += vx;                                                                          \
    pA_v[j] += pWeight[j];                                                                  \
    pB_u_next[j] -= ux;                                                                     \
    pA_u_next[j] += pWeight[j];                                                             \
    pB_v_next[j] -= vx;                                                                     \
    pA_v_next[j] += pWeight[j];
        int j = 0;
#ifdef CV_SIMD128
        /* Vectorized version of COMPUTE + UPDATE; stops one element earlier than the
         * data-term loop (len - 4) so the last element is always handled by the
         * border-aware scalar code below: */
        v_float32x4 alpha2_vec = v_setall_f32(alpha2);
        v_float32x4 eps_vec = v_setall_f32(epsilon_squared);
        v_float32x4 cW_u_vec, cW_v_vec;
        v_float32x4 pWeight_vec, ux_vec, vx_vec, uy_vec, vy_vec;
        for (; j < len - 4; j += 4)
        {
            cW_u_vec = v_load(cW_u + j);
            cW_v_vec = v_load(cW_v + j);
            ux_vec = v_load(cW_u_next + j) - cW_u_vec;
            vx_vec = v_load(cW_v_next + j) - cW_v_vec;
            uy_vec = v_load(cW_u_next_row + j) - cW_u_vec;
            vy_vec = v_load(cW_v_next_row + j) - cW_v_vec;
            pWeight_vec =
              alpha2_vec / v_sqrt(ux_vec * ux_vec + vx_vec * vx_vec + uy_vec * uy_vec + vy_vec * vy_vec + eps_vec);
            v_store(pWeight + j, pWeight_vec);
            ux_vec = pWeight_vec * (v_load(pW_u_next + j) - v_load(pW_u + j));
            vx_vec = pWeight_vec * (v_load(pW_v_next + j) - v_load(pW_v + j));
            v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
            v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
            v_store(pB_u + j, v_load(pB_u + j) + ux_vec);
            v_store(pB_v + j, v_load(pB_v + j) + vx_vec);
            v_store(pA_u_next + j, v_load(pA_u_next + j) + pWeight_vec);
            v_store(pA_v_next + j, v_load(pA_v_next + j) + pWeight_vec);
            v_store(pB_u_next + j, v_load(pB_u_next + j) - ux_vec);
            v_store(pB_v_next + j, v_load(pB_v_next + j) - vx_vec);
        }
#endif
        for (; j < len - 1; j++)
        {
            COMPUTE;
            UPDATE;
        }
        /* Omit the update on the rightmost elements (the weight is still needed
         * by the vertical pass and the SOR iterations): */
        if (touches_right_border)
        {
            COMPUTE;
        }
        else
        {
            COMPUTE;
            UPDATE;
        }
    }
#undef COMPUTE
#undef UPDATE
}
/* Stores the references to the initial flow and precomputes the number of rows
 * processed by each parallel stripe.
 */
VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::ComputeSmoothnessTermVertPass_ParBody(
  VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_W_u, RedBlackBuffer &_W_v, bool _red_pass)
    : var(&_var), nstripes(_nstripes), W_u(&_W_u), W_v(&_W_v), red_pass(_red_pass)
{
    /* Omit the last row in the vertical pass */
    h = _h - 1;
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function adds the last remaining terms to the linear system coefficients A11,A22,b1,b2
 * (vertical pass of the smoothness term, reusing the weights computed by the horizontal pass). */
void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator()(const Range &range) const
{
    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);
    float *pWeight;
    float *pA_u, *pA_u_next_row, *pA_v, *pA_v_next_row;
    float *pB_u, *pB_u_next_row, *pB_v, *pB_v_next_row;
    float *pW_u, *pW_u_next_row, *pW_v, *pW_v_next_row;
    float vy, uy;
    int len;
    for (int i = start_i; i < end_i; i++)
    {
/* Sets up row pointers for the current plane and the row below it; the vertical
 * neighbor of a red element is in the black plane at the same column (and vice versa). */
#define INIT_ROW_POINTERS(cur_color, next_color)                    \
    pWeight = var->weights.cur_color.ptr<float>(i + 1) + 1;         \
    pA_u = var->A11.cur_color.ptr<float>(i + 1) + 1;                \
    pB_u = var->b1.cur_color.ptr<float>(i + 1) + 1;                 \
    pW_u = W_u->cur_color.ptr<float>(i + 1) + 1;                    \
    pA_v = var->A22.cur_color.ptr<float>(i + 1) + 1;                \
    pB_v = var->b2.cur_color.ptr<float>(i + 1) + 1;                 \
    pW_v = W_v->cur_color.ptr<float>(i + 1) + 1;                    \
                                                                    \
    pA_u_next_row = var->A11.next_color.ptr<float>(i + 2) + 1;      \
    pB_u_next_row = var->b1.next_color.ptr<float>(i + 2) + 1;       \
    pW_u_next_row = W_u->next_color.ptr<float>(i + 2) + 1;          \
    pA_v_next_row = var->A22.next_color.ptr<float>(i + 2) + 1;      \
    pB_v_next_row = var->b2.next_color.ptr<float>(i + 2) + 1;       \
    pW_v_next_row = W_v->next_color.ptr<float>(i + 2) + 1;          \
                                                                    \
    if (i % 2 == 0)                                                 \
        len = var->A11.cur_color##_even_len;                        \
    else                                                            \
        len = var->A11.cur_color##_odd_len;
        if (red_pass)
        {
            INIT_ROW_POINTERS(red, black);
        }
        else
        {
            INIT_ROW_POINTERS(black, red);
        }
#undef INIT_ROW_POINTERS
        int j = 0;
#ifdef CV_SIMD128
        /* Vectorized version (4 floats at a time) of the scalar loop below: */
        v_float32x4 pWeight_vec, uy_vec, vy_vec;
        for (; j < len - 3; j += 4)
        {
            pWeight_vec = v_load(pWeight + j);
            uy_vec = pWeight_vec * (v_load(pW_u_next_row + j) - v_load(pW_u + j));
            vy_vec = pWeight_vec * (v_load(pW_v_next_row + j) - v_load(pW_v + j));
            v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
            v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
            v_store(pB_u + j, v_load(pB_u + j) + uy_vec);
            v_store(pB_v + j, v_load(pB_v + j) + vy_vec);
            v_store(pA_u_next_row + j, v_load(pA_u_next_row + j) + pWeight_vec);
            v_store(pA_v_next_row + j, v_load(pA_v_next_row + j) + pWeight_vec);
            v_store(pB_u_next_row + j, v_load(pB_u_next_row + j) - uy_vec);
            v_store(pB_v_next_row + j, v_load(pB_v_next_row + j) - vy_vec);
        }
#endif
        for (; j < len; j++)
        {
            /* Weighted vertical gradients of the initial flow: */
            uy = pWeight[j] * (pW_u_next_row[j] - pW_u[j]);
            vy = pWeight[j] * (pW_v_next_row[j] - pW_v[j]);
            /* Add the vertical-neighbor contribution to both rows: */
            pB_u[j] += uy;
            pA_u[j] += pWeight[j];
            pB_v[j] += vy;
            pA_v[j] += pWeight[j];
            pB_u_next_row[j] -= uy;
            pA_u_next_row[j] += pWeight[j];
            pB_v_next_row[j] -= vy;
            pA_v_next_row[j] += pWeight[j];
        }
    }
}
/* Stores the references to the flow increments being solved for and precomputes the
 * number of rows processed by each parallel stripe.
 */
VariationalRefinementImpl::RedBlackSOR_ParBody::RedBlackSOR_ParBody(VariationalRefinementImpl &_var, int _nstripes,
                                                                    int _h, RedBlackBuffer &_dW_u,
                                                                    RedBlackBuffer &_dW_v, bool _red_pass)
    : var(&_var), nstripes(_nstripes), h(_h), dW_u(&_dW_u), dW_v(&_dW_v), red_pass(_red_pass)
{
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function implements the Red-Black SOR (successive-over relaxation) method for solving the main
 * linear system in the current fixed-point iteration. One call performs a half-step, updating only
 * the red or the black cells (all neighbors of a cell belong to the opposite plane).
 */
void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &range) const
{
    int start = min(range.start * stripe_sz, h);
    int end = min(range.end * stripe_sz, h);
    float *pa11, *pa12, *pa22, *pb1, *pb2, *pW, *pdu, *pdv;
    float *pW_next, *pdu_next, *pdv_next;
    float *pW_prev_row, *pdu_prev_row, *pdv_prev_row;
    float *pdu_next_row, *pdv_next_row;
    float sigmaU, sigmaV;
    int j, len;
    for (int i = start; i < end; i++)
    {
/* Sets up row pointers for the current plane and its 4-neighborhood (all in the
 * opposite plane). The left neighbor is accessed as *_next[j - 1] thanks to the
 * parity-dependent offsets next_offs_even/next_offs_odd. */
#define INIT_ROW_POINTERS(cur_color, next_color, next_offs_even, next_offs_odd)         \
    pW = var->weights.cur_color.ptr<float>(i + 1) + 1;                                  \
    pa11 = var->A11.cur_color.ptr<float>(i + 1) + 1;                                    \
    pa12 = var->A12.cur_color.ptr<float>(i + 1) + 1;                                    \
    pa22 = var->A22.cur_color.ptr<float>(i + 1) + 1;                                    \
    pb1 = var->b1.cur_color.ptr<float>(i + 1) + 1;                                      \
    pb2 = var->b2.cur_color.ptr<float>(i + 1) + 1;                                      \
    pdu = dW_u->cur_color.ptr<float>(i + 1) + 1;                                        \
    pdv = dW_v->cur_color.ptr<float>(i + 1) + 1;                                        \
                                                                                        \
    pdu_next_row = dW_u->next_color.ptr<float>(i + 2) + 1;                              \
    pdv_next_row = dW_v->next_color.ptr<float>(i + 2) + 1;                              \
                                                                                        \
    pW_prev_row = var->weights.next_color.ptr<float>(i) + 1;                            \
    pdu_prev_row = dW_u->next_color.ptr<float>(i) + 1;                                  \
    pdv_prev_row = dW_v->next_color.ptr<float>(i) + 1;                                  \
                                                                                        \
    if (i % 2 == 0)                                                                     \
    {                                                                                   \
        pW_next = var->weights.next_color.ptr<float>(i + 1) + next_offs_even;           \
        pdu_next = dW_u->next_color.ptr<float>(i + 1) + next_offs_even;                 \
        pdv_next = dW_v->next_color.ptr<float>(i + 1) + next_offs_even;                 \
        len = var->A11.cur_color##_even_len;                                            \
    }                                                                                   \
    else                                                                                \
    {                                                                                   \
        pW_next = var->weights.next_color.ptr<float>(i + 1) + next_offs_odd;            \
        pdu_next = dW_u->next_color.ptr<float>(i + 1) + next_offs_odd;                  \
        pdv_next = dW_v->next_color.ptr<float>(i + 1) + next_offs_odd;                  \
        len = var->A11.cur_color##_odd_len;                                             \
    }
        if (red_pass)
        {
            INIT_ROW_POINTERS(red, black, 1, 2);
        }
        else
        {
            INIT_ROW_POINTERS(black, red, 2, 1);
        }
#undef INIT_ROW_POINTERS
        j = 0;
#ifdef CV_SIMD128
        /* Vectorized version of the scalar loop below. The [j - 1] accesses are
         * emulated with v_extract<3>: it concatenates the previous 4-element vector
         * with the current one, yielding the elements shifted left by one position
         * without an extra unaligned load. */
        v_float32x4 pW_prev_vec = v_setall_f32(pW_next[-1]);
        v_float32x4 pdu_prev_vec = v_setall_f32(pdu_next[-1]);
        v_float32x4 pdv_prev_vec = v_setall_f32(pdv_next[-1]);
        v_float32x4 omega_vec = v_setall_f32(var->omega);
        v_float32x4 pW_vec, pW_next_vec, pW_prev_row_vec;
        v_float32x4 pdu_next_vec, pdu_prev_row_vec, pdu_next_row_vec;
        v_float32x4 pdv_next_vec, pdv_prev_row_vec, pdv_next_row_vec;
        v_float32x4 pW_shifted_vec, pdu_shifted_vec, pdv_shifted_vec;
        v_float32x4 pa12_vec, sigmaU_vec, sigmaV_vec, pdu_vec, pdv_vec;
        for (; j < len - 3; j += 4)
        {
            pW_vec = v_load(pW + j);
            pW_next_vec = v_load(pW_next + j);
            pW_prev_row_vec = v_load(pW_prev_row + j);
            pdu_next_vec = v_load(pdu_next + j);
            pdu_prev_row_vec = v_load(pdu_prev_row + j);
            pdu_next_row_vec = v_load(pdu_next_row + j);
            pdv_next_vec = v_load(pdv_next + j);
            pdv_prev_row_vec = v_load(pdv_prev_row + j);
            pdv_next_row_vec = v_load(pdv_next_row + j);
            pa12_vec = v_load(pa12 + j);
            pW_shifted_vec = v_reinterpret_as_f32(
              v_extract<3>(v_reinterpret_as_s32(pW_prev_vec), v_reinterpret_as_s32(pW_next_vec)));
            pdu_shifted_vec = v_reinterpret_as_f32(
              v_extract<3>(v_reinterpret_as_s32(pdu_prev_vec), v_reinterpret_as_s32(pdu_next_vec)));
            pdv_shifted_vec = v_reinterpret_as_f32(
              v_extract<3>(v_reinterpret_as_s32(pdv_prev_vec), v_reinterpret_as_s32(pdv_next_vec)));
            sigmaU_vec = pW_shifted_vec * pdu_shifted_vec + pW_vec * pdu_next_vec + pW_prev_row_vec * pdu_prev_row_vec +
                         pW_vec * pdu_next_row_vec;
            sigmaV_vec = pW_shifted_vec * pdv_shifted_vec + pW_vec * pdv_next_vec + pW_prev_row_vec * pdv_prev_row_vec +
                         pW_vec * pdv_next_row_vec;
            pdu_vec = v_load(pdu + j);
            pdv_vec = v_load(pdv + j);
            /* Gauss-Seidel style: the v update deliberately uses the freshly updated u: */
            pdu_vec += omega_vec * ((sigmaU_vec + v_load(pb1 + j) - pdv_vec * pa12_vec) / v_load(pa11 + j) - pdu_vec);
            pdv_vec += omega_vec * ((sigmaV_vec + v_load(pb2 + j) - pdu_vec * pa12_vec) / v_load(pa22 + j) - pdv_vec);
            v_store(pdu + j, pdu_vec);
            v_store(pdv + j, pdv_vec);
            pW_prev_vec = pW_next_vec;
            pdu_prev_vec = pdu_next_vec;
            pdv_prev_vec = pdv_next_vec;
        }
#endif
        for (; j < len; j++)
        {
            /* Weighted sum over the 4-neighborhood (left, right, up, down): */
            sigmaU = pW_next[j - 1] * pdu_next[j - 1] + pW[j] * pdu_next[j] + pW_prev_row[j] * pdu_prev_row[j] +
                     pW[j] * pdu_next_row[j];
            sigmaV = pW_next[j - 1] * pdv_next[j - 1] + pW[j] * pdv_next[j] + pW_prev_row[j] * pdv_prev_row[j] +
                     pW[j] * pdv_next_row[j];
            /* Over-relaxed update; pdv uses the already-updated pdu (Gauss-Seidel): */
            pdu[j] += var->omega * ((sigmaU + pb1[j] - pdv[j] * pa12[j]) / pa11[j] - pdu[j]);
            pdv[j] += var->omega * ((sigmaV + pb2[j] - pdu[j] * pa12[j]) / pa22[j] - pdv[j]);
        }
    }
}
/* DenseOpticalFlow::calc overload: splits the two-channel flow into separate u/v
 * components, refines them with calcUV and merges the result back in place.
 */
void VariationalRefinementImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
    CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
    CV_Assert(I0.sameSize(I1));
    CV_Assert(!flow.empty() && flow.depth() == CV_32F && flow.channels() == 2);
    CV_Assert(I0.sameSize(flow));
    Mat &flow_mat = flow.getMatRef();
    Mat components[2];
    split(flow_mat, components);
    calcUV(I0, I1, components[0], components[1]);
    merge(components, 2, flow_mat);
}
/** @brief Core refinement routine operating on separate u/v flow planes.
 *
 *  Performs fixedPointIterations outer (fixed-point) iterations; each one
 *  rebuilds the linearized data and smoothness terms on a red-black
 *  checkerboard decomposition and then runs sorIterations red-black SOR
 *  sweeps to solve for the flow increment (dW_u, dW_v).
 *
 *  NOTE(review): the red-then-black ordering of each parallel pass below is
 *  part of the algorithm — do not reorder these calls.
 *
 *  @param I0     first 8-bit single-channel image
 *  @param I1     second 8-bit single-channel image
 *  @param flow_u horizontal flow plane (CV_32FC1), refined in place
 *  @param flow_v vertical flow plane (CV_32FC1), refined in place
 */
void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v)
{
    // Inputs: grayscale 8-bit frames and single-channel float flow planes, all equal size.
    CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
    CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
    CV_Assert(I0.sameSize(I1));
    CV_Assert(!flow_u.empty() && flow_u.depth() == CV_32F && flow_u.channels() == 1);
    CV_Assert(!flow_v.empty() && flow_v.depth() == CV_32F && flow_v.channels() == 1);
    CV_Assert(I0.sameSize(flow_u));
    CV_Assert(flow_u.sameSize(flow_v));
    int num_stripes = getNumThreads(); // one horizontal stripe per worker in parallel_for_
    Mat I0Mat = I0.getMat();
    Mat I1Mat = I1.getMat();
    Mat &W_u = flow_u.getMatRef();
    Mat &W_v = flow_v.getMatRef();
    prepareBuffers(I0Mat, I1Mat, W_u, W_v);
    // Convert the input flow into red-black checkerboard layout; tempW holds the
    // current flow estimate (W + dW), and the increment dW starts from zero.
    splitCheckerboard(W_u_rb, W_u);
    splitCheckerboard(W_v_rb, W_v);
    W_u_rb.red.copyTo(tempW_u.red);
    W_u_rb.black.copyTo(tempW_u.black);
    W_v_rb.red.copyTo(tempW_v.red);
    W_v_rb.black.copyTo(tempW_v.black);
    dW_u.red.setTo(0.0f);
    dW_u.black.setTo(0.0f);
    dW_v.red.setTo(0.0f);
    dW_v.black.setTo(0.0f);
    for (int i = 0; i < fixedPointIterations; i++)
    {
        // Recompute the linearized data term on red cells, then black cells:
        parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true));
        parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false));
        // Smoothness term: horizontal pass (red, black) ...
        parallel_for_(Range(0, num_stripes), ComputeSmoothnessTermHorPass_ParBody(
                                                 *this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, tempW_u, tempW_v, true));
        parallel_for_(Range(0, num_stripes), ComputeSmoothnessTermHorPass_ParBody(
                                                 *this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, tempW_u, tempW_v, false));
        // ... followed by the vertical pass (red, black):
        parallel_for_(Range(0, num_stripes),
                      ComputeSmoothnessTermVertPass_ParBody(*this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, true));
        parallel_for_(Range(0, num_stripes),
                      ComputeSmoothnessTermVertPass_ParBody(*this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, false));
        // Inner red-black SOR sweeps solve the linear system for the increment dW:
        for (int j = 0; j < sorIterations; j++)
        {
            parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true));
            parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false));
        }
        // Update the current flow estimate and refresh the replicated borders:
        tempW_u.red = W_u_rb.red + dW_u.red;
        tempW_u.black = W_u_rb.black + dW_u.black;
        updateRepeatedBorders(tempW_u);
        tempW_v.red = W_v_rb.red + dW_v.red;
        tempW_v.black = W_v_rb.black + dW_v.black;
        updateRepeatedBorders(tempW_v);
    }
    // Convert the refined estimate back from checkerboard layout into the output planes:
    mergeCheckerboard(W_u, tempW_u);
    mergeCheckerboard(W_v, tempW_v);
}
/** @brief Releases all intermediate buffers kept between calc() invocations. */
void VariationalRefinementImpl::collectGarbage()
{
    // Image derivative buffers:
    Ix.release();
    Iy.release();
    Iz.release();
    Ixx.release();
    Ixy.release();
    Iyy.release();
    Ixz.release();
    Iyz.release();
    // Derivative buffers in red-black checkerboard layout:
    Ix_rb.release();
    Iy_rb.release();
    Iz_rb.release();
    Ixx_rb.release();
    Ixy_rb.release();
    Iyy_rb.release();
    Ixz_rb.release();
    Iyz_rb.release();
    // Linear-system coefficients (A * dW = b) and smoothness weights:
    A11.release();
    A12.release();
    A22.release();
    b1.release();
    b2.release();
    weights.release();
    // Remap coordinates used for warping:
    mapX.release();
    mapY.release();
    // Flow estimate, increment, and input-flow checkerboard buffers:
    tempW_u.release();
    tempW_v.release();
    dW_u.release();
    dW_v.release();
    W_u_rb.release();
    W_v_rb.release();
}
Ptr<VariationalRefinement> createVariationalFlowRefinement() { return makePtr<VariationalRefinementImpl>(); }
}
}
......@@ -40,74 +40,39 @@
//M*/
#include "test_precomp.hpp"
#include <string>
#include <fstream>
using namespace std;
using namespace cv;
using namespace cvtest;
using namespace optflow;
/* ///////////////////// simpleflow_test ///////////////////////// */
class CV_SimpleFlowTest : public cvtest::BaseTest
{
public:
CV_SimpleFlowTest();
protected:
void run(int);
};
static string getDataDir() { return TS::ptr()->get_data_path(); }
CV_SimpleFlowTest::CV_SimpleFlowTest() {}
static bool readOpticalFlowFromFile(FILE* file, cv::Mat& flow) {
char header[5];
if (fread(header, 1, 4, file) < 4 && (string)header != "PIEH") {
return false;
}
static bool isFlowCorrect(float u) { return !cvIsNaN(u) && (fabs(u) < 1e9); }
int cols, rows;
if (fread(&cols, sizeof(int), 1, file) != 1||
fread(&rows, sizeof(int), 1, file) != 1) {
return false;
}
flow = cv::Mat::zeros(rows, cols, CV_32FC2);
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
cv::Vec2f flow_at_point;
if (fread(&(flow_at_point[0]), sizeof(float), 1, file) != 1 ||
fread(&(flow_at_point[1]), sizeof(float), 1, file) != 1) {
return false;
}
flow.at<cv::Vec2f>(i, j) = flow_at_point;
}
}
return true;
}
static bool isFlowCorrect(float u) {
return !cvIsNaN(u) && (fabs(u) < 1e9);
}
static float calc_rmse(cv::Mat flow1, cv::Mat flow2) {
static float calcRMSE(Mat flow1, Mat flow2)
{
float sum = 0;
int counter = 0;
const int rows = flow1.rows;
const int cols = flow1.cols;
for (int y = 0; y < rows; ++y) {
for (int x = 0; x < cols; ++x) {
cv::Vec2f flow1_at_point = flow1.at<cv::Vec2f>(y, x);
cv::Vec2f flow2_at_point = flow2.at<cv::Vec2f>(y, x);
for (int y = 0; y < rows; ++y)
{
for (int x = 0; x < cols; ++x)
{
Vec2f flow1_at_point = flow1.at<Vec2f>(y, x);
Vec2f flow2_at_point = flow2.at<Vec2f>(y, x);
float u1 = flow1_at_point[0];
float v1 = flow1_at_point[1];
float u2 = flow2_at_point[0];
float v2 = flow2_at_point[1];
if (isFlowCorrect(u1) && isFlowCorrect(u2) && isFlowCorrect(v1) && isFlowCorrect(v2)) {
sum += (u1-u2)*(u1-u2) + (v1-v2)*(v1-v2);
if (isFlowCorrect(u1) && isFlowCorrect(u2) && isFlowCorrect(v1) && isFlowCorrect(v2))
{
sum += (u1 - u2) * (u1 - u2) + (v1 - v2) * (v1 - v2);
counter++;
}
}
......@@ -115,76 +80,111 @@ static float calc_rmse(cv::Mat flow1, cv::Mat flow2) {
return (float)sqrt(sum / (1e-9 + counter));
}
void CV_SimpleFlowTest::run(int) {
const float MAX_RMSE = 0.6f;
const string frame1_path = ts->get_data_path() + "optflow/RubberWhale1.png";
const string frame2_path = ts->get_data_path() + "optflow/RubberWhale2.png";
const string gt_flow_path = ts->get_data_path() + "optflow/RubberWhale.flo";
cv::Mat frame1 = cv::imread(frame1_path);
cv::Mat frame2 = cv::imread(frame2_path);
if (frame1.empty()) {
ts->printf(cvtest::TS::LOG, "could not read image %s\n", frame2_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
if (frame2.empty()) {
ts->printf(cvtest::TS::LOG, "could not read image %s\n", frame2_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
if (frame1.rows != frame2.rows && frame1.cols != frame2.cols) {
ts->printf(cvtest::TS::LOG, "images should be of equal sizes (%s and %s)",
frame1_path.c_str(), frame2_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
if (frame1.type() != 16 || frame2.type() != 16) {
ts->printf(cvtest::TS::LOG, "images should be of equal type CV_8UC3 (%s and %s)",
frame1_path.c_str(), frame2_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
cv::Mat flow_gt;
bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT)
{
const string frame1_path = getDataDir() + "optflow/RubberWhale1.png";
const string frame2_path = getDataDir() + "optflow/RubberWhale2.png";
const string gt_flow_path = getDataDir() + "optflow/RubberWhale.flo";
FILE* gt_flow_file = fopen(gt_flow_path.c_str(), "rb");
if (gt_flow_file == NULL) {
ts->printf(cvtest::TS::LOG, "could not read ground-thuth flow from file %s",
gt_flow_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
dst_frame_1 = imread(frame1_path);
dst_frame_2 = imread(frame2_path);
dst_GT = readOpticalFlow(gt_flow_path);
if (!readOpticalFlowFromFile(gt_flow_file, flow_gt)) {
ts->printf(cvtest::TS::LOG, "error while reading flow data from file %s",
gt_flow_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
fclose(gt_flow_file);
if (dst_frame_1.empty() || dst_frame_2.empty() || dst_GT.empty())
return false;
else
return true;
}
cv::Mat flow;
cv::optflow::calcOpticalFlowSF(frame1, frame2, flow, 3, 2, 4);
// SimpleFlow against the RubberWhale ground truth: the estimated flow must
// match the GT dimensions and stay within the reference RMSE threshold.
TEST(DenseOpticalFlow_SimpleFlow, ReferenceAccuracy)
{
    Mat frame1, frame2, GT;
    ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));

    const float target_RMSE = 0.37f;
    Ptr<DenseOpticalFlow> algo = createOptFlow_SimpleFlow();
    Mat flow;
    algo->calc(frame1, frame2, flow);

    ASSERT_EQ(GT.rows, flow.rows);
    ASSERT_EQ(GT.cols, flow.cols);
    EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
}
float rmse = calc_rmse(flow_gt, flow);
// DeepFlow against the RubberWhale ground truth (grayscale input).
TEST(DenseOpticalFlow_DeepFlow, ReferenceAccuracy)
{
    Mat frame1, frame2, GT;
    ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));

    const float target_RMSE = 0.35f;
    // DeepFlow consumes single-channel frames:
    cvtColor(frame1, frame1, COLOR_BGR2GRAY);
    cvtColor(frame2, frame2, COLOR_BGR2GRAY);

    Ptr<DenseOpticalFlow> algo = createOptFlow_DeepFlow();
    Mat flow;
    algo->calc(frame1, frame2, flow);

    ASSERT_EQ(GT.rows, flow.rows);
    ASSERT_EQ(GT.cols, flow.cols);
    EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
}
ts->printf(cvtest::TS::LOG, "Optical flow estimation RMSE for SimpleFlow algorithm : %lf\n",
rmse);
// SparseToDense against the RubberWhale ground truth.
TEST(DenseOpticalFlow_SparseToDenseFlow, ReferenceAccuracy)
{
    Mat frame1, frame2, GT;
    ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));

    const float target_RMSE = 0.52f;
    Ptr<DenseOpticalFlow> algo = createOptFlow_SparseToDense();
    Mat flow;
    algo->calc(frame1, frame2, flow);

    ASSERT_EQ(GT.rows, flow.rows);
    ASSERT_EQ(GT.cols, flow.cols);
    EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
}
if (rmse > MAX_RMSE) {
ts->printf( cvtest::TS::LOG,
"Too big rmse error : %lf ( >= %lf )\n", rmse, MAX_RMSE);
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
return;
// DIS at each predefined preset; thresholds are per-preset since accuracy
// improves from ULTRAFAST to MEDIUM.
TEST(DenseOpticalFlow_DIS, ReferenceAccuracy)
{
    Mat frame1, frame2, GT;
    ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));

    // DIS consumes single-channel frames:
    cvtColor(frame1, frame1, COLOR_BGR2GRAY);
    cvtColor(frame2, frame2, COLOR_BGR2GRAY);

    const int presets[] = {DISOpticalFlow::PRESET_ULTRAFAST, DISOpticalFlow::PRESET_FAST,
                           DISOpticalFlow::PRESET_MEDIUM};
    const float target_RMSE[] = {0.86f, 0.74f, 0.49f};

    for (int preset_idx = 0; preset_idx < 3; preset_idx++)
    {
        Ptr<DenseOpticalFlow> algo = createOptFlow_DIS(presets[preset_idx]);
        Mat flow;
        algo->calc(frame1, frame2, flow);
        ASSERT_EQ(GT.rows, flow.rows);
        ASSERT_EQ(GT.cols, flow.cols);
        EXPECT_LE(calcRMSE(GT, flow), target_RMSE[preset_idx]);
    }
}
TEST(Video_OpticalFlowSimpleFlow, accuracy) { CV_SimpleFlowTest test; test.safe_run(); }
/* End of file. */
// Variational refinement started from a zero flow field must still reach the
// reference RMSE on RubberWhale with the parameters fixed below.
TEST(DenseOpticalFlow_VariationalRefinement, ReferenceAccuracy)
{
    Mat frame1, frame2, GT;
    ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));

    const float target_RMSE = 0.86f;
    // The refiner consumes single-channel frames:
    cvtColor(frame1, frame1, COLOR_BGR2GRAY);
    cvtColor(frame2, frame2, COLOR_BGR2GRAY);

    Ptr<VariationalRefinement> refiner = createVariationalFlowRefinement();
    refiner->setAlpha(20.0f);
    refiner->setDelta(5.0f);
    refiner->setGamma(10.0f);
    refiner->setSorIterations(25);
    refiner->setFixedPointIterations(25);

    // Initialize with zero flow:
    Mat flow(frame1.size(), CV_32FC2);
    flow.setTo(0.0f);
    refiner->calc(frame1, frame2, flow);

    ASSERT_EQ(GT.rows, flow.rows);
    ASSERT_EQ(GT.cols, flow.cols);
    EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
}
......@@ -40,107 +40,120 @@
//M*/
#include "test_precomp.hpp"
#include <string>
#include <fstream>
using namespace std;
using namespace std::tr1;
using namespace cv;
using namespace cvtest;
using namespace perf;
using namespace testing;
using namespace optflow;
/* ///////////////////// sparsetodenseflow_test ///////////////////////// */
typedef tuple<Size> OFParams;
typedef TestWithParam<OFParams> DenseOpticalFlow_DIS;
typedef TestWithParam<OFParams> DenseOpticalFlow_VariationalRefinement;
class CV_SparseToDenseFlowTest : public cvtest::BaseTest
TEST_P(DenseOpticalFlow_DIS, MultithreadReproducibility)
{
protected:
void run(int);
};
static bool isFlowCorrect(float u) {
return !cvIsNaN(u) && (fabs(u) < 1e9);
}
static float calc_rmse(Mat flow1, Mat flow2) {
float sum = 0;
int counter = 0;
const int rows = flow1.rows;
const int cols = flow1.cols;
for (int y = 0; y < rows; ++y) {
for (int x = 0; x < cols; ++x) {
Vec2f flow1_at_point = flow1.at<Vec2f>(y, x);
Vec2f flow2_at_point = flow2.at<Vec2f>(y, x);
float u1 = flow1_at_point[0];
float v1 = flow1_at_point[1];
float u2 = flow2_at_point[0];
float v2 = flow2_at_point[1];
if (isFlowCorrect(u1) && isFlowCorrect(u2) && isFlowCorrect(v1) && isFlowCorrect(v2)) {
sum += (u1-u2)*(u1-u2) + (v1-v2)*(v1-v2);
counter++;
}
}
double MAX_DIF = 0.01;
double MAX_MEAN_DIF = 0.001;
int loopsCount = 2;
RNG rng(0);
OFParams params = GetParam();
Size size = get<0>(params);
for (int iter = 0; iter <= loopsCount; iter++)
{
Mat frame1(size, CV_8U);
randu(frame1, 0, 255);
Mat frame2(size, CV_8U);
randu(frame2, 0, 255);
Ptr<DISOpticalFlow> algo = createOptFlow_DIS();
int psz = rng.uniform(4, 16);
int pstr = rng.uniform(1, psz - 1);
int grad_iter = rng.uniform(1, 64);
int var_iter = rng.uniform(0, 10);
bool use_mean_normalization = !!rng.uniform(0, 2);
bool use_spatial_propagation = !!rng.uniform(0, 2);
algo->setFinestScale(0);
algo->setPatchSize(psz);
algo->setPatchStride(pstr);
algo->setGradientDescentIterations(grad_iter);
algo->setVariationalRefinementIterations(var_iter);
algo->setUseMeanNormalization(use_mean_normalization);
algo->setUseSpatialPropagation(use_spatial_propagation);
cv::setNumThreads(cv::getNumberOfCPUs());
Mat resMultiThread;
algo->calc(frame1, frame2, resMultiThread);
cv::setNumThreads(1);
Mat resSingleThread;
algo->calc(frame1, frame2, resSingleThread);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_INF), MAX_DIF);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_L1), MAX_MEAN_DIF * frame1.total());
// resulting flow should be within the frame bounds:
double min_val, max_val;
minMaxLoc(resMultiThread, &min_val, &max_val);
EXPECT_LE(abs(min_val), sqrt(size.height * size.height + size.width * size.width));
EXPECT_LE(abs(max_val), sqrt(size.height * size.height + size.width * size.width));
}
return (float)sqrt(sum / (1e-9 + counter));
}
void CV_SparseToDenseFlowTest::run(int) {
const float MAX_RMSE = 0.6f;
const string frame1_path = ts->get_data_path() + "optflow/RubberWhale1.png";
const string frame2_path = ts->get_data_path() + "optflow/RubberWhale2.png";
const string gt_flow_path = ts->get_data_path() + "optflow/RubberWhale.flo";
Mat frame1 = imread(frame1_path);
Mat frame2 = imread(frame2_path);
if (frame1.empty()) {
ts->printf(cvtest::TS::LOG, "could not read image %s\n", frame2_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
if (frame2.empty()) {
ts->printf(cvtest::TS::LOG, "could not read image %s\n", frame2_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
if (frame1.rows != frame2.rows && frame1.cols != frame2.cols) {
ts->printf(cvtest::TS::LOG, "images should be of equal sizes (%s and %s)",
frame1_path.c_str(), frame2_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
if (frame1.type() != 16 || frame2.type() != 16) {
ts->printf(cvtest::TS::LOG, "images should be of equal type CV_8UC3 (%s and %s)",
frame1_path.c_str(), frame2_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
Mat flow_gt = optflow::readOpticalFlow(gt_flow_path);
if(flow_gt.empty()) {
ts->printf(cvtest::TS::LOG, "error while reading flow data from file %s",
gt_flow_path.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
return;
}
INSTANTIATE_TEST_CASE_P(FullSet, DenseOpticalFlow_DIS, Values(szODD, szQVGA));
Mat flow;
optflow::calcOpticalFlowSparseToDense(frame1, frame2, flow);
float rmse = calc_rmse(flow_gt, flow);
ts->printf(cvtest::TS::LOG, "Optical flow estimation RMSE for SparseToDenseFlow algorithm : %lf\n",
rmse);
if (rmse > MAX_RMSE) {
ts->printf( cvtest::TS::LOG,
"Too big rmse error : %lf ( >= %lf )\n", rmse, MAX_RMSE);
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
return;
// Multi-threaded and single-threaded runs of the refiner must agree almost
// exactly on random inputs with random (but identical) parameters.
// NOTE(review): the rng.uniform() calls below consume the RNG stream in a
// fixed order — reordering them changes the sampled parameters.
TEST_P(DenseOpticalFlow_VariationalRefinement, MultithreadReproducibility)
{
    double MAX_DIF = 0.01;       // max allowed per-element difference
    double MAX_MEAN_DIF = 0.001; // max allowed mean difference
    float input_flow_rad = 5.0;  // initial flow sampled in [-rad, rad]
    int loopsCount = 2;
    RNG rng(0); // fixed seed for reproducibility
    OFParams params = GetParam();
    Size size = get<0>(params);
    for (int iter = 0; iter <= loopsCount; iter++)
    {
        // Random grayscale frames and a random initial flow field:
        Mat frame1(size, CV_8U);
        randu(frame1, 0, 255);
        Mat frame2(size, CV_8U);
        randu(frame2, 0, 255);
        Mat flow(size, CV_32FC2);
        randu(flow, -input_flow_rad, input_flow_rad);
        // Randomized refiner parameters (same for both runs):
        Ptr<VariationalRefinement> var = createVariationalFlowRefinement();
        var->setAlpha(rng.uniform(1.0f, 100.0f));
        var->setGamma(rng.uniform(0.1f, 10.0f));
        var->setDelta(rng.uniform(0.1f, 10.0f));
        var->setSorIterations(rng.uniform(1, 20));
        var->setFixedPointIterations(rng.uniform(1, 20));
        var->setOmega(rng.uniform(1.01f, 1.99f));
        // Run with all available threads, then with a single thread,
        // starting both from a copy of the same initial flow:
        cv::setNumThreads(cv::getNumberOfCPUs());
        Mat resMultiThread;
        flow.copyTo(resMultiThread);
        var->calc(frame1, frame2, resMultiThread);
        cv::setNumThreads(1);
        Mat resSingleThread;
        flow.copyTo(resSingleThread);
        var->calc(frame1, frame2, resSingleThread);
        EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_INF), MAX_DIF);
        EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_L1), MAX_MEAN_DIF * frame1.total());
        // resulting flow should be within the frame bounds:
        double min_val, max_val;
        minMaxLoc(resMultiThread, &min_val, &max_val);
        EXPECT_LE(abs(min_val), sqrt(size.height * size.height + size.width * size.width));
        EXPECT_LE(abs(max_val), sqrt(size.height * size.height + size.width * size.width));
    }
}
TEST(Video_OpticalFlowSparseToDenseFlow, accuracy) { CV_SparseToDenseFlowTest test; test.safe_run(); }
INSTANTIATE_TEST_CASE_P(FullSet, DenseOpticalFlow_VariationalRefinement, Values(szODD, szQVGA));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment