Commit 96bf2661 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky Committed by Alexander Alekhin

Merge pull request #13084 from vpisarev:shuffle_optflow_algos

* moved DIS optical flow from opencv_contrib to opencv, moved TVL1 from opencv to opencv_contrib

* fixed compile warning

* TVL1 optical flow example moved to opencv_contrib
parent 841741aa
......@@ -397,6 +397,29 @@ public:
};
/** @brief Read a .flo file
@param path Path to the file to be loaded
@return The loaded flow field as a 2-channel floating-point matrix (CV_32FC2)
The function readOpticalFlow loads a flow field from a file and returns it as a single matrix.
Resulting Mat has a type CV_32FC2 - floating-point, 2-channel. First channel corresponds to the
flow in the horizontal direction (u), second - vertical (v).
*/
CV_EXPORTS_W Mat readOpticalFlow( const String& path );
/** @brief Write a .flo to disk
@param path Path to the file to be written
@param flow Flow field to be stored
@return true on success, false otherwise
The function stores a flow field in a file, returns true on success, false otherwise.
The flow field must be a 2-channel, floating-point matrix (CV_32FC2). First channel corresponds
to the flow in the horizontal direction (u), second - vertical (v).
*/
CV_EXPORTS_W bool writeOpticalFlow( const String& path, InputArray flow );
/**
Base class for dense optical flow algorithms
*/
class CV_EXPORTS_W DenseOpticalFlow : public Algorithm
{
public:
......@@ -433,131 +456,6 @@ public:
OutputArray err = cv::noArray()) = 0;
};
/** @brief "Dual TV L1" Optical Flow Algorithm.
The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and
@cite Javier2012 .
Here are important members of the class that control the algorithm, which you can set after
constructing the class instance:
- member double tau
Time step of the numerical scheme.
- member double lambda
Weight parameter for the data term, attachment parameter. This is the most relevant
parameter, which determines the smoothness of the output. The smaller this parameter is,
the smoother the solutions we obtain. It depends on the range of motions of the images, so
its value should be adapted to each image sequence.
- member double theta
Weight parameter for (u - v)\^2, tightness parameter. It serves as a link between the
attachment and the regularization terms. In theory, it should have a small value in order
to maintain both parts in correspondence. The method is stable for a large range of values
of this parameter.
- member int nscales
Number of scales used to create the pyramid of images.
- member int warps
Number of warpings per scale. Represents the number of times that I1(x+u0) and grad(
I1(x+u0) ) are computed per scale. This is a parameter that assures the stability of the
method. It also affects the running time, so it is a compromise between speed and
accuracy.
- member double epsilon
Stopping criterion threshold used in the numerical scheme, which is a trade-off between
precision and running time. A small value will yield more accurate solutions at the
expense of a slower convergence.
- member int iterations
Stopping criterion iterations number used in the numerical scheme.
C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
*/
class CV_EXPORTS_W DualTVL1OpticalFlow : public DenseOpticalFlow
{
public:
//! @brief Time step of the numerical scheme
/** @see setTau */
CV_WRAP virtual double getTau() const = 0;
/** @copybrief getTau @see getTau */
CV_WRAP virtual void setTau(double val) = 0;
//! @brief Weight parameter for the data term, attachment parameter
/** @see setLambda */
CV_WRAP virtual double getLambda() const = 0;
/** @copybrief getLambda @see getLambda */
CV_WRAP virtual void setLambda(double val) = 0;
//! @brief Weight parameter for (u - v)^2, tightness parameter
/** @see setTheta */
CV_WRAP virtual double getTheta() const = 0;
/** @copybrief getTheta @see getTheta */
CV_WRAP virtual void setTheta(double val) = 0;
//! @brief coefficient for additional illumination variation term
/** @see setGamma */
CV_WRAP virtual double getGamma() const = 0;
/** @copybrief getGamma @see getGamma */
CV_WRAP virtual void setGamma(double val) = 0;
//! @brief Number of scales used to create the pyramid of images
/** @see setScalesNumber */
CV_WRAP virtual int getScalesNumber() const = 0;
/** @copybrief getScalesNumber @see getScalesNumber */
CV_WRAP virtual void setScalesNumber(int val) = 0;
//! @brief Number of warpings per scale
/** @see setWarpingsNumber */
CV_WRAP virtual int getWarpingsNumber() const = 0;
/** @copybrief getWarpingsNumber @see getWarpingsNumber */
CV_WRAP virtual void setWarpingsNumber(int val) = 0;
//! @brief Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time
/** @see setEpsilon */
CV_WRAP virtual double getEpsilon() const = 0;
/** @copybrief getEpsilon @see getEpsilon */
CV_WRAP virtual void setEpsilon(double val) = 0;
//! @brief Inner iterations (between outlier filtering) used in the numerical scheme
/** @see setInnerIterations */
CV_WRAP virtual int getInnerIterations() const = 0;
/** @copybrief getInnerIterations @see getInnerIterations */
CV_WRAP virtual void setInnerIterations(int val) = 0;
//! @brief Outer iterations (number of inner loops) used in the numerical scheme
/** @see setOuterIterations */
CV_WRAP virtual int getOuterIterations() const = 0;
/** @copybrief getOuterIterations @see getOuterIterations */
CV_WRAP virtual void setOuterIterations(int val) = 0;
//! @brief Use initial flow
/** @see setUseInitialFlow */
CV_WRAP virtual bool getUseInitialFlow() const = 0;
/** @copybrief getUseInitialFlow @see getUseInitialFlow */
CV_WRAP virtual void setUseInitialFlow(bool val) = 0;
//! @brief Step between scales (<1)
/** @see setScaleStep */
CV_WRAP virtual double getScaleStep() const = 0;
/** @copybrief getScaleStep @see getScaleStep */
CV_WRAP virtual void setScaleStep(double val) = 0;
//! @brief Median filter kernel size (1 = no filter) (3 or 5)
/** @see setMedianFiltering */
CV_WRAP virtual int getMedianFiltering() const = 0;
/** @copybrief getMedianFiltering @see getMedianFiltering */
CV_WRAP virtual void setMedianFiltering(int val) = 0;
/** @brief Creates instance of cv::DualTVL1OpticalFlow
@param tau Time step of the numerical scheme
@param lambda Weight parameter for the data term, attachment parameter
@param theta Weight parameter for (u - v)^2, tightness parameter
@param nscales Number of scales used to create the pyramid of images
@param warps Number of warpings per scale
@param epsilon Stopping criterion threshold used in the numerical scheme
@param innnerIterations Inner iterations (between outlier filtering) used in the numerical scheme
@param outerIterations Outer iterations (number of inner loops) used in the numerical scheme
@param scaleStep Step between scales (<1)
@param gamma Coefficient for additional illumination variation term
@param medianFiltering Median filter kernel size (1 = no filter) (3 or 5)
@param useInitialFlow Use initial flow
@note "innnerIterations" contains a long-standing typo ("innner" instead of "inner");
the name is kept as-is because renaming it would break keyword-argument callers through
the generated (CV_WRAP) bindings.
*/
CV_WRAP static Ptr<DualTVL1OpticalFlow> create(
double tau = 0.25,
double lambda = 0.15,
double theta = 0.3,
int nscales = 5,
int warps = 5,
double epsilon = 0.01,
int innnerIterations = 30,
int outerIterations = 10,
double scaleStep = 0.8,
double gamma = 0.0,
int medianFiltering = 5,
bool useInitialFlow = false);
};
/** @brief Creates instance of cv::DualTVL1OpticalFlow (a cv::DenseOpticalFlow implementation)
*/
CV_EXPORTS_W Ptr<DualTVL1OpticalFlow> createOptFlow_DualTVL1();
/** @brief Class computing a dense optical flow using the Gunnar Farneback's algorithm.
*/
......@@ -599,6 +497,166 @@ public:
int flags = 0);
};
/** @brief Variational optical flow refinement
This class implements variational refinement of the input flow field, i.e.
it uses input flow to initialize the minimization of the following functional:
\f$E(U) = \int_{\Omega} \delta \Psi(E_I) + \gamma \Psi(E_G) + \alpha \Psi(E_S) \f$,
where \f$E_I,E_G,E_S\f$ are color constancy, gradient constancy and smoothness terms
respectively. \f$\Psi(s^2)=\sqrt{s^2+\epsilon^2}\f$ is a robust penalizer to limit the
influence of outliers. A complete formulation and a description of the minimization
procedure can be found in @cite Brox2004
*/
class CV_EXPORTS_W VariationalRefinement : public DenseOpticalFlow
{
public:
/** @brief @ref calc function overload to handle separate horizontal (u) and vertical (v) flow components
(to avoid extra splits/merges)
@param I0 first input image
@param I1 second input image (presumably same size/type as I0 — confirm against calc's contract)
@param flow_u input/output horizontal (u) component of the flow field
@param flow_v input/output vertical (v) component of the flow field */
CV_WRAP virtual void calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v) = 0;
/** @brief Number of outer (fixed-point) iterations in the minimization procedure.
@see setFixedPointIterations */
CV_WRAP virtual int getFixedPointIterations() const = 0;
/** @copybrief getFixedPointIterations @see getFixedPointIterations */
CV_WRAP virtual void setFixedPointIterations(int val) = 0;
/** @brief Number of inner successive over-relaxation (SOR) iterations
in the minimization procedure to solve the respective linear system.
@see setSorIterations */
CV_WRAP virtual int getSorIterations() const = 0;
/** @copybrief getSorIterations @see getSorIterations */
CV_WRAP virtual void setSorIterations(int val) = 0;
/** @brief Relaxation factor in SOR
@see setOmega */
CV_WRAP virtual float getOmega() const = 0;
/** @copybrief getOmega @see getOmega */
CV_WRAP virtual void setOmega(float val) = 0;
/** @brief Weight of the smoothness term
@see setAlpha */
CV_WRAP virtual float getAlpha() const = 0;
/** @copybrief getAlpha @see getAlpha */
CV_WRAP virtual void setAlpha(float val) = 0;
/** @brief Weight of the color constancy term
@see setDelta */
CV_WRAP virtual float getDelta() const = 0;
/** @copybrief getDelta @see getDelta */
CV_WRAP virtual void setDelta(float val) = 0;
/** @brief Weight of the gradient constancy term
@see setGamma */
CV_WRAP virtual float getGamma() const = 0;
/** @copybrief getGamma @see getGamma */
CV_WRAP virtual void setGamma(float val) = 0;
/** @brief Creates an instance of VariationalRefinement
*/
CV_WRAP static Ptr<VariationalRefinement> create();
};
/** @brief DIS optical flow algorithm.
This class implements the Dense Inverse Search (DIS) optical flow algorithm. More
details about the algorithm can be found at @cite Kroeger2016 . Includes three presets with preselected
parameters to provide reasonable trade-off between speed and quality. However, even the slowest preset is
still relatively fast, use DeepFlow if you need better quality and don't care about speed.
This implementation includes several additional features compared to the algorithm described in the paper,
including spatial propagation of flow vectors (@ref getUseSpatialPropagation), as well as an option to
utilize an initial flow approximation passed to @ref calc (which is, essentially, temporal propagation,
if the previous frame's flow field is passed).
*/
class CV_EXPORTS_W DISOpticalFlow : public DenseOpticalFlow
{
public:
//! Presets with preselected parameters providing different speed/quality trade-offs (see @ref create)
enum
{
PRESET_ULTRAFAST = 0,
PRESET_FAST = 1,
PRESET_MEDIUM = 2
};
/** @brief Finest level of the Gaussian pyramid on which the flow is computed (zero level
corresponds to the original image resolution). The final flow is obtained by bilinear upscaling.
@see setFinestScale */
CV_WRAP virtual int getFinestScale() const = 0;
/** @copybrief getFinestScale @see getFinestScale */
CV_WRAP virtual void setFinestScale(int val) = 0;
/** @brief Size of an image patch for matching (in pixels). Normally, default 8x8 patches work well
enough in most cases.
@see setPatchSize */
CV_WRAP virtual int getPatchSize() const = 0;
/** @copybrief getPatchSize @see getPatchSize */
CV_WRAP virtual void setPatchSize(int val) = 0;
/** @brief Stride between neighbor patches. Must be less than patch size. Lower values correspond
to higher flow quality.
@see setPatchStride */
CV_WRAP virtual int getPatchStride() const = 0;
/** @copybrief getPatchStride @see getPatchStride */
CV_WRAP virtual void setPatchStride(int val) = 0;
/** @brief Maximum number of gradient descent iterations in the patch inverse search stage. Higher values
may improve quality in some cases.
@see setGradientDescentIterations */
CV_WRAP virtual int getGradientDescentIterations() const = 0;
/** @copybrief getGradientDescentIterations @see getGradientDescentIterations */
CV_WRAP virtual void setGradientDescentIterations(int val) = 0;
/** @brief Number of fixed point iterations of variational refinement per scale. Set to zero to
disable variational refinement completely. Higher values will typically result in more smooth and
high-quality flow.
@see setVariationalRefinementIterations */
CV_WRAP virtual int getVariationalRefinementIterations() const = 0;
/** @copybrief getVariationalRefinementIterations @see getVariationalRefinementIterations */
CV_WRAP virtual void setVariationalRefinementIterations(int val) = 0;
/** @brief Weight of the smoothness term
@see setVariationalRefinementAlpha */
CV_WRAP virtual float getVariationalRefinementAlpha() const = 0;
/** @copybrief getVariationalRefinementAlpha @see getVariationalRefinementAlpha */
CV_WRAP virtual void setVariationalRefinementAlpha(float val) = 0;
/** @brief Weight of the color constancy term
@see setVariationalRefinementDelta */
CV_WRAP virtual float getVariationalRefinementDelta() const = 0;
/** @copybrief getVariationalRefinementDelta @see getVariationalRefinementDelta */
CV_WRAP virtual void setVariationalRefinementDelta(float val) = 0;
/** @brief Weight of the gradient constancy term
@see setVariationalRefinementGamma */
CV_WRAP virtual float getVariationalRefinementGamma() const = 0;
/** @copybrief getVariationalRefinementGamma @see getVariationalRefinementGamma */
CV_WRAP virtual void setVariationalRefinementGamma(float val) = 0;
/** @brief Whether to use mean-normalization of patches when computing patch distance. It is turned on
by default as it typically provides a noticeable quality boost because of increased robustness to
illumination variations. Turn it off if you are certain that your sequence doesn't contain any changes
in illumination.
@see setUseMeanNormalization */
CV_WRAP virtual bool getUseMeanNormalization() const = 0;
/** @copybrief getUseMeanNormalization @see getUseMeanNormalization */
CV_WRAP virtual void setUseMeanNormalization(bool val) = 0;
/** @brief Whether to use spatial propagation of good optical flow vectors. This option is turned on by
default, as it tends to work better on average and can sometimes help recover from major errors
introduced by the coarse-to-fine scheme employed by the DIS optical flow algorithm. Turning this
option off can make the output flow field a bit smoother, however.
@see setUseSpatialPropagation */
CV_WRAP virtual bool getUseSpatialPropagation() const = 0;
/** @copybrief getUseSpatialPropagation @see getUseSpatialPropagation */
CV_WRAP virtual void setUseSpatialPropagation(bool val) = 0;
/** @brief Creates an instance of DISOpticalFlow
@param preset one of PRESET_ULTRAFAST, PRESET_FAST and PRESET_MEDIUM
*/
CV_WRAP static Ptr<DISOpticalFlow> create(int preset = DISOpticalFlow::PRESET_FAST);
};
/** @brief Class used for calculating a sparse optical flow.
......
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../perf_precomp.hpp"
#include "opencv2/ts/ocl_perf.hpp"
namespace opencv_test { namespace {
#ifdef HAVE_OPENCL
void MakeArtificialExample(UMat &dst_frame1, UMat &dst_frame2);
typedef tuple<String, Size> DISParams;
typedef TestBaseWithParam<DISParams> DenseOpticalFlow_DIS;
OCL_PERF_TEST_P(DenseOpticalFlow_DIS, perf,
                Combine(Values("PRESET_ULTRAFAST", "PRESET_FAST", "PRESET_MEDIUM"), Values(szVGA, sz720p, sz1080p)))
{
    // Presets are passed as strings so that their names show up in the perf report.
    const String preset_name = get<0>(GetParam());
    const Size sz = get<1>(GetParam());

    // Map the preset name to the corresponding enum value (default: PRESET_FAST).
    int preset = DISOpticalFlow::PRESET_FAST;
    if (preset_name == "PRESET_ULTRAFAST")
        preset = DISOpticalFlow::PRESET_ULTRAFAST;
    else if (preset_name == "PRESET_MEDIUM")
        preset = DISOpticalFlow::PRESET_MEDIUM;

    UMat frame1(sz, CV_8U), frame2(sz, CV_8U);
    UMat flow;
    MakeArtificialExample(frame1, frame2);

    // The algorithm instance is created once, outside the measured cycle.
    Ptr<DenseOpticalFlow> algo = DISOpticalFlow::create(preset);
    OCL_TEST_CYCLE_N(10)
    {
        algo->calc(frame1, frame2, flow);
    }
    SANITY_CHECK_NOTHING();
}
void MakeArtificialExample(UMat &dst_frame1, UMat &dst_frame2)
{
    /* Fill two pre-allocated frames with a synthetic optical-flow test pair:
       a random low-resolution texture upscaled to full size, with the second
       frame additionally warped by a smooth random displacement field. */
    const int src_scale = 2; // base texture generated at 1/(1<<src_scale) resolution per dimension
    const int OF_scale = 6;  // displacement field generated at 1/(1<<OF_scale) resolution per dimension

    // Scale the displacement magnitude with the frame width. Floating-point
    // division is essential here: the previous integer division truncated
    // sigma (to 0 for frames narrower than 300 px), degenerating the warp.
    const double sigma = dst_frame1.cols / 300.0;

    UMat tmp(Size(dst_frame1.cols / (1 << src_scale), dst_frame1.rows / (1 << src_scale)), CV_8U);
    randu(tmp, 0, 255);
    resize(tmp, dst_frame1, dst_frame1.size(), 0.0, 0.0, INTER_LINEAR_EXACT);
    resize(tmp, dst_frame2, dst_frame2.size(), 0.0, 0.0, INTER_LINEAR_EXACT);

    // Smooth random displacement field, generated coarse and upscaled to full size:
    Mat displacement_field(Size(dst_frame1.cols / (1 << OF_scale), dst_frame1.rows / (1 << OF_scale)),
                           CV_32FC2);
    randn(displacement_field, 0.0, sigma);
    resize(displacement_field, displacement_field, dst_frame2.size(), 0.0, 0.0, INTER_CUBIC);

    // Convert relative displacements into an absolute remap grid (dst(y,x) = src(map(y,x))):
    for (int i = 0; i < displacement_field.rows; i++)
        for (int j = 0; j < displacement_field.cols; j++)
            displacement_field.at<Vec2f>(i, j) += Vec2f((float)j, (float)i);

    remap(dst_frame2, dst_frame2, displacement_field, Mat(), INTER_LINEAR, BORDER_REPLICATE);
}
#endif // HAVE_OPENCL
}} // namespace
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
namespace opencv_test { namespace {
void MakeArtificialExample(Mat &dst_frame1, Mat &dst_frame2);
typedef tuple<String, Size> DISParams;
typedef TestBaseWithParam<DISParams> DenseOpticalFlow_DIS;
PERF_TEST_P(DenseOpticalFlow_DIS, perf,
            Combine(Values("PRESET_ULTRAFAST", "PRESET_FAST", "PRESET_MEDIUM"), Values(szVGA, sz720p, sz1080p)))
{
    // Presets are passed as strings so that their names show up in the perf report.
    const String preset_name = get<0>(GetParam());
    const Size sz = get<1>(GetParam());

    // Map the preset name to the corresponding enum value (default: PRESET_FAST).
    int preset = DISOpticalFlow::PRESET_FAST;
    if (preset_name == "PRESET_ULTRAFAST")
        preset = DISOpticalFlow::PRESET_ULTRAFAST;
    else if (preset_name == "PRESET_MEDIUM")
        preset = DISOpticalFlow::PRESET_MEDIUM;

    Mat frame1(sz, CV_8U), frame2(sz, CV_8U);
    Mat flow;
    MakeArtificialExample(frame1, frame2);

    TEST_CYCLE_N(10)
    {
        // Algorithm construction happens inside the measured cycle, as in the original benchmark.
        Ptr<DenseOpticalFlow> algo = DISOpticalFlow::create(preset);
        algo->calc(frame1, frame2, flow);
    }
    SANITY_CHECK_NOTHING();
}
void MakeArtificialExample(Mat &dst_frame1, Mat &dst_frame2)
{
    /* Fill two pre-allocated frames with a synthetic optical-flow test pair:
       a random low-resolution texture upscaled to full size, with the second
       frame additionally warped by a smooth random displacement field. */
    const int src_scale = 2; // base texture generated at 1/(1<<src_scale) resolution per dimension
    const int OF_scale = 6;  // displacement field generated at 1/(1<<OF_scale) resolution per dimension

    // Scale the displacement magnitude with the frame width. Floating-point
    // division is essential here: the previous integer division truncated
    // sigma (to 0 for frames narrower than 300 px), degenerating the warp.
    const double sigma = dst_frame1.cols / 300.0;

    Mat tmp(Size(dst_frame1.cols / (1 << src_scale), dst_frame1.rows / (1 << src_scale)), CV_8U);
    randu(tmp, 0, 255);
    resize(tmp, dst_frame1, dst_frame1.size(), 0.0, 0.0, INTER_LINEAR_EXACT);
    resize(tmp, dst_frame2, dst_frame2.size(), 0.0, 0.0, INTER_LINEAR_EXACT);

    // Smooth random displacement field, generated coarse and upscaled to full size:
    Mat displacement_field(Size(dst_frame1.cols / (1 << OF_scale), dst_frame1.rows / (1 << OF_scale)),
                           CV_32FC2);
    randn(displacement_field, 0.0, sigma);
    resize(displacement_field, displacement_field, dst_frame2.size(), 0.0, 0.0, INTER_CUBIC);

    // Convert relative displacements into an absolute remap grid (dst(y,x) = src(map(y,x))):
    for (int i = 0; i < displacement_field.rows; i++)
        for (int j = 0; j < displacement_field.cols; j++)
            displacement_field.at<Vec2f>(i, j) += Vec2f((float)j, (float)i);

    remap(dst_frame2, dst_frame2, displacement_field, Mat(), INTER_LINEAR, BORDER_REPLICATE);
}
}} // namespace
......@@ -6,5 +6,11 @@
#include "opencv2/ts.hpp"
#include <opencv2/video.hpp>
#include "opencv2/ts/ts_perf.hpp"
namespace cvtest
{
using namespace perf;
}
#endif
#include "perf_precomp.hpp"
namespace opencv_test { namespace {
using namespace perf;
typedef TestBaseWithParam< std::pair<string, string> > ImagePair;
// Pack two image paths into a pair, for use as a perf-test parameter.
std::pair<std::string, std::string> impair(const char* im1, const char* im2)
{
    return std::make_pair(std::string(im1), std::string(im2));
}
PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1, testing::Values(impair("cv/optflow/RubberWhale1.png", "cv/optflow/RubberWhale2.png")))
{
    declare.time(260);

    // Load the input pair as grayscale and abort early if the test data is missing.
    const Mat img0 = imread(getDataPath(GetParam().first), IMREAD_GRAYSCALE);
    const Mat img1 = imread(getDataPath(GetParam().second), IMREAD_GRAYSCALE);
    ASSERT_FALSE(img0.empty());
    ASSERT_FALSE(img1.empty());

    Ptr<DenseOpticalFlow> algo = createOptFlow_DualTVL1();
    Mat flow;
    TEST_CYCLE() algo->calc(img0, img1, flow);
    SANITY_CHECK_NOTHING();
}
}} // namespace
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
namespace opencv_test { namespace {
typedef tuple<Size, int, int> VarRefParams;
typedef TestBaseWithParam<VarRefParams> DenseOpticalFlow_VariationalRefinement;
PERF_TEST_P(DenseOpticalFlow_VariationalRefinement, perf, Combine(Values(szQVGA, szVGA), Values(5, 10), Values(5, 10)))
{
    const VarRefParams params = GetParam();
    const Size sz = get<0>(params);
    const int num_sor_iter = get<1>(params);
    const int num_fixed_point_iter = get<2>(params);

    // Random input frames and a zero-initialized flow field.
    Mat frame1(sz, CV_8U), frame2(sz, CV_8U);
    randu(frame1, 0, 255);
    randu(frame2, 0, 255);
    Mat flow(sz, CV_32FC2);
    flow.setTo(0.0f);

    TEST_CYCLE_N(10)
    {
        // Instance creation and parameter setup happen inside the measured cycle,
        // as in the original benchmark.
        Ptr<VariationalRefinement> var = VariationalRefinement::create();
        var->setAlpha(20.0f);
        var->setDelta(5.0f);
        var->setGamma(10.0f);
        var->setSorIterations(num_sor_iter);
        var->setFixedPointIterations(num_fixed_point_iter);
        var->calc(frame1, frame2, flow);
    }
    SANITY_CHECK_NOTHING();
}
}} // namespace
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencl_kernels_video.hpp"
using namespace std;
#define EPS 0.001F
#define INF 1E+10F
namespace cv
{
//! Implementation of the Dense Inverse Search (DIS) optical flow algorithm (see DISOpticalFlow).
class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
{
public:
DISOpticalFlowImpl();
void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
void collectGarbage() CV_OVERRIDE;
protected: //!< algorithm parameters
int finest_scale, coarsest_scale; //!< finest and coarsest pyramid levels on which the flow is computed
int patch_size; //!< size of an image patch for matching (in pixels)
int patch_stride; //!< stride between neighbor patches
int grad_descent_iter; //!< maximum number of gradient descent iterations in the patch inverse search stage
int variational_refinement_iter; //!< number of fixed point iterations of variational refinement per scale
float variational_refinement_alpha; //!< weight of the smoothness term
float variational_refinement_gamma; //!< weight of the gradient constancy term
float variational_refinement_delta; //!< weight of the color constancy term
bool use_mean_normalization; //!< whether to use mean-normalization of patches
bool use_spatial_propagation; //!< whether to use spatial propagation of good optical flow vectors
protected: //!< some auxiliary variables
int border_size; //!< border padding added to I1 (see I1s_ext)
int w, h; //!< flow buffer width and height on the current scale
int ws, hs; //!< sparse flow buffer width and height on the current scale
public:
int getFinestScale() const CV_OVERRIDE { return finest_scale; }
void setFinestScale(int val) CV_OVERRIDE { finest_scale = val; }
int getPatchSize() const CV_OVERRIDE { return patch_size; }
void setPatchSize(int val) CV_OVERRIDE { patch_size = val; }
int getPatchStride() const CV_OVERRIDE { return patch_stride; }
void setPatchStride(int val) CV_OVERRIDE { patch_stride = val; }
int getGradientDescentIterations() const CV_OVERRIDE { return grad_descent_iter; }
void setGradientDescentIterations(int val) CV_OVERRIDE { grad_descent_iter = val; }
int getVariationalRefinementIterations() const CV_OVERRIDE { return variational_refinement_iter; }
void setVariationalRefinementIterations(int val) CV_OVERRIDE { variational_refinement_iter = val; }
float getVariationalRefinementAlpha() const CV_OVERRIDE { return variational_refinement_alpha; }
void setVariationalRefinementAlpha(float val) CV_OVERRIDE { variational_refinement_alpha = val; }
float getVariationalRefinementDelta() const CV_OVERRIDE { return variational_refinement_delta; }
void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; }
float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; }
void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; }
bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; }
void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; }
bool getUseSpatialPropagation() const CV_OVERRIDE { return use_spatial_propagation; }
void setUseSpatialPropagation(bool val) CV_OVERRIDE { use_spatial_propagation = val; }
protected: //!< internal buffers
vector<Mat_<uchar> > I0s; //!< Gaussian pyramid for the current frame
vector<Mat_<uchar> > I1s; //!< Gaussian pyramid for the next frame
vector<Mat_<uchar> > I1s_ext; //!< I1s with borders
vector<Mat_<short> > I0xs; //!< Gaussian pyramid for the x gradient of the current frame
vector<Mat_<short> > I0ys; //!< Gaussian pyramid for the y gradient of the current frame
vector<Mat_<float> > Ux; //!< x component of the flow vectors
vector<Mat_<float> > Uy; //!< y component of the flow vectors
vector<Mat_<float> > initial_Ux; //!< x component of the initial flow field, if one was passed as an input
vector<Mat_<float> > initial_Uy; //!< y component of the initial flow field, if one was passed as an input
Mat_<Vec2f> U; //!< a buffer for the merged flow
Mat_<float> Sx; //!< intermediate sparse flow representation (x component)
Mat_<float> Sy; //!< intermediate sparse flow representation (y component)
/* Structure tensor components: */
Mat_<float> I0xx_buf; //!< sum of squares of x gradient values
Mat_<float> I0yy_buf; //!< sum of squares of y gradient values
Mat_<float> I0xy_buf; //!< sum of x and y gradient products
/* Extra buffers that are useful if patch mean-normalization is used: */
Mat_<float> I0x_buf; //!< sum of x gradient values
Mat_<float> I0y_buf; //!< sum of y gradient values
/* Auxiliary buffers used in structure tensor computation: */
Mat_<float> I0xx_buf_aux;
Mat_<float> I0yy_buf_aux;
Mat_<float> I0xy_buf_aux;
Mat_<float> I0x_buf_aux;
Mat_<float> I0y_buf_aux;
//! One variational refinement instance per pyramid level (avoids reallocation across calls)
vector<Ptr<VariationalRefinement> > variational_refinement_processors;
private: //!< private methods and parallel sections
void prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow);
void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, Mat &dst_I0y, Mat &I0x,
Mat &I0y);
//! Parallel body for the patch inverse search stage (computes the sparse flow Sx/Sy in horizontal stripes)
struct PatchInverseSearch_ParBody : public ParallelLoopBody
{
DISOpticalFlowImpl *dis;
int nstripes, stripe_sz;
int hs;
Mat *Sx, *Sy, *Ux, *Uy, *I0, *I1, *I0x, *I0y;
int num_iter, pyr_level;
PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _hs, Mat &dst_Sx, Mat &dst_Sy,
Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1, Mat &_I0x, Mat &_I0y, int _num_iter,
int _pyr_level);
void operator()(const Range &range) const CV_OVERRIDE;
};
//! Parallel body for the densification stage (turns sparse flow Sx/Sy into dense flow Ux/Uy in stripes)
struct Densification_ParBody : public ParallelLoopBody
{
DISOpticalFlowImpl *dis;
int nstripes, stripe_sz;
int h;
Mat *Ux, *Uy, *Sx, *Sy, *I0, *I1;
Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h, Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx,
Mat &src_Sy, Mat &_I0, Mat &_I1);
void operator()(const Range &range) const CV_OVERRIDE;
};
#ifdef HAVE_OPENCL
/* OpenCL (UMat) counterparts of the buffers above, used by the ocl_* code path: */
vector<UMat> u_I0s; //!< Gaussian pyramid for the current frame
vector<UMat> u_I1s; //!< Gaussian pyramid for the next frame
vector<UMat> u_I1s_ext; //!< I1s with borders
vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame
vector<UMat> u_Ux; //!< x component of the flow vectors
vector<UMat> u_Uy; //!< y component of the flow vectors
vector<UMat> u_initial_Ux; //!< x component of the initial flow field, if one was passed as an input
vector<UMat> u_initial_Uy; //!< y component of the initial flow field, if one was passed as an input
UMat u_U; //!< a buffer for the merged flow
UMat u_Sx; //!< intermediate sparse flow representation (x component)
UMat u_Sy; //!< intermediate sparse flow representation (y component)
/* Structure tensor components: */
UMat u_I0xx_buf; //!< sum of squares of x gradient values
UMat u_I0yy_buf; //!< sum of squares of y gradient values
UMat u_I0xy_buf; //!< sum of x and y gradient products
/* Extra buffers that are useful if patch mean-normalization is used: */
UMat u_I0x_buf; //!< sum of x gradient values
UMat u_I0y_buf; //!< sum of y gradient values
/* Auxiliary buffers used in structure tensor computation: */
UMat u_I0xx_buf_aux;
UMat u_I0yy_buf_aux;
UMat u_I0xy_buf_aux;
UMat u_I0x_buf_aux;
UMat u_I0y_buf_aux;
bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
void ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool use_flow);
bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
bool ocl_Densification(UMat &dst_Ux, UMat &dst_Uy, UMat &src_Sx, UMat &src_Sy, UMat &_I0, UMat &_I1);
bool ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
#endif
};
DISOpticalFlowImpl::DISOpticalFlowImpl()
{
    // Patch inverse search defaults:
    patch_size = 8;
    patch_stride = 4;
    grad_descent_iter = 16;
    finest_scale = 2;
    border_size = 16;
    use_mean_normalization = true;
    use_spatial_propagation = true;

    // Variational refinement defaults:
    variational_refinement_iter = 5;
    variational_refinement_alpha = 20.f;
    variational_refinement_gamma = 10.f;
    variational_refinement_delta = 5.f;

    /* Use separate variational refinement instances for different scales to avoid repeated memory allocation: */
    const int max_possible_scales = 10;
    for (int scale = 0; scale < max_possible_scales; scale++)
        variational_refinement_processors.push_back(VariationalRefinement::create());
}
/* Allocates and initializes the image pyramids and all per-level buffers.
 * I0, I1   - input frames
 * flow     - optional initial flow field (2-channel; split into u/v planes)
 * use_flow - when true, the initial flow is downscaled to every used pyramid level
 * Levels are indexed from 0 (full resolution) to coarsest_scale; levels finer than
 * finest_scale are left uninitialized since the algorithm never touches them.
 */
void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow)
{
    I0s.resize(coarsest_scale + 1);
    I1s.resize(coarsest_scale + 1);
    I1s_ext.resize(coarsest_scale + 1);
    I0xs.resize(coarsest_scale + 1);
    I0ys.resize(coarsest_scale + 1);
    Ux.resize(coarsest_scale + 1);
    Uy.resize(coarsest_scale + 1);
    Mat flow_uv[2];
    if (use_flow)
    {
        split(flow, flow_uv);
        initial_Ux.resize(coarsest_scale + 1);
        initial_Uy.resize(coarsest_scale + 1);
    }
    int fraction = 1; /* downscaling factor of the current level w.r.t. the original image */
    int cur_rows = 0, cur_cols = 0;
    for (int i = 0; i <= coarsest_scale; i++)
    {
        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
        if (i == finest_scale)
        {
            cur_rows = I0.rows / fraction;
            cur_cols = I0.cols / fraction;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0, I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1, I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
            /* These buffers are reused in each scale so we initialize them once on the finest scale: */
            /* Sparse-grid buffers: one value per patch location (stride patch_stride): */
            Sx.create(cur_rows / patch_stride, cur_cols / patch_stride);
            Sy.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            /* Aux buffers keep full row resolution - they hold the intermediate results
             * of the separable box filter in precomputeStructureTensor: */
            I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0x_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0y_buf_aux.create(cur_rows, cur_cols / patch_stride);
            U.create(cur_rows, cur_cols);
        }
        else if (i > finest_scale)
        {
            /* Each coarser level halves the previous one: */
            cur_rows = I0s[i - 1].rows / 2;
            cur_cols = I0s[i - 1].cols / 2;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0s[i - 1], I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1s[i - 1], I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
        }
        if (i >= finest_scale)
        {
            /* I1 gets a replicated border so that patch lookups slightly outside the
             * image remain valid during the inverse search: */
            I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size);
            copyMakeBorder(I1s[i], I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
            I0xs[i].create(cur_rows, cur_cols);
            I0ys[i].create(cur_rows, cur_cols);
            spatialGradient(I0s[i], I0xs[i], I0ys[i]);
            Ux[i].create(cur_rows, cur_cols);
            Uy[i].create(cur_rows, cur_cols);
            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
            variational_refinement_processors[i]->setSorIterations(5);
            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
            if (use_flow)
            {
                /* Downscale the external initial flow to this level; flow magnitudes
                 * shrink by the same factor as the image dimensions: */
                resize(flow_uv[0], initial_Ux[i], Size(cur_cols, cur_rows));
                initial_Ux[i] /= fraction;
                resize(flow_uv[1], initial_Uy[i], Size(cur_cols, cur_rows));
                initial_Uy[i] /= fraction;
            }
        }
        fraction *= 2;
    }
}
/* This function computes the structure tensor elements (local sums of I0x^2, I0x*I0y and I0y^2).
 * A plain box filter is not used here because these sums need to be computed on a sparse grid
 * while being stored densely in the output buffers.
 */
/* dst_* buffers receive one value per sparse-grid location; I0x/I0y are the 16-bit
 * spatial gradients of I0. NOTE(review): h/w appear to be the image dimensions and
 * ws the sparse-grid width (members declared outside this view) - confirm against
 * the class declaration. */
void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
                                                   Mat &dst_I0y, Mat &I0x, Mat &I0y)
{
    float *I0xx_ptr = dst_I0xx.ptr<float>();
    float *I0yy_ptr = dst_I0yy.ptr<float>();
    float *I0xy_ptr = dst_I0xy.ptr<float>();
    float *I0x_ptr = dst_I0x.ptr<float>();
    float *I0y_ptr = dst_I0y.ptr<float>();
    float *I0xx_aux_ptr = I0xx_buf_aux.ptr<float>();
    float *I0yy_aux_ptr = I0yy_buf_aux.ptr<float>();
    float *I0xy_aux_ptr = I0xy_buf_aux.ptr<float>();
    float *I0x_aux_ptr = I0x_buf_aux.ptr<float>();
    float *I0y_aux_ptr = I0y_buf_aux.ptr<float>();
    /* Separable box filter: horizontal pass */
    /* For each image row, maintain running sums over a patch_size-wide window and emit
     * them into the aux buffers at every patch_stride-th column position: */
    for (int i = 0; i < h; i++)
    {
        float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
        short *x_row = I0x.ptr<short>(i);
        short *y_row = I0y.ptr<short>(i);
        for (int j = 0; j < patch_size; j++)
        {
            sum_xx += x_row[j] * x_row[j];
            sum_yy += y_row[j] * y_row[j];
            sum_xy += x_row[j] * y_row[j];
            sum_x += x_row[j];
            sum_y += y_row[j];
        }
        I0xx_aux_ptr[i * ws] = sum_xx;
        I0yy_aux_ptr[i * ws] = sum_yy;
        I0xy_aux_ptr[i * ws] = sum_xy;
        I0x_aux_ptr[i * ws] = sum_x;
        I0y_aux_ptr[i * ws] = sum_y;
        int js = 1; /* next sparse column index to fill */
        for (int j = patch_size; j < w; j++)
        {
            /* Slide the window one pixel right: add the entering pixel, drop the leaving one */
            sum_xx += (x_row[j] * x_row[j] - x_row[j - patch_size] * x_row[j - patch_size]);
            sum_yy += (y_row[j] * y_row[j] - y_row[j - patch_size] * y_row[j - patch_size]);
            sum_xy += (x_row[j] * y_row[j] - x_row[j - patch_size] * y_row[j - patch_size]);
            sum_x += (x_row[j] - x_row[j - patch_size]);
            sum_y += (y_row[j] - y_row[j - patch_size]);
            if ((j - patch_size + 1) % patch_stride == 0)
            {
                /* Window start landed on a sparse-grid column - store the sums: */
                I0xx_aux_ptr[i * ws + js] = sum_xx;
                I0yy_aux_ptr[i * ws + js] = sum_yy;
                I0xy_aux_ptr[i * ws + js] = sum_xy;
                I0x_aux_ptr[i * ws + js] = sum_x;
                I0y_aux_ptr[i * ws + js] = sum_y;
                js++;
            }
        }
    }
    /* Per-column accumulators for the vertical pass: */
    AutoBuffer<float> sum_xx(ws), sum_yy(ws), sum_xy(ws), sum_x(ws), sum_y(ws);
    for (int j = 0; j < ws; j++)
    {
        sum_xx[j] = 0.0f;
        sum_yy[j] = 0.0f;
        sum_xy[j] = 0.0f;
        sum_x[j] = 0.0f;
        sum_y[j] = 0.0f;
    }
    /* Separable box filter: vertical pass */
    /* Prime the accumulators with the first patch_size rows of the aux buffers: */
    for (int i = 0; i < patch_size; i++)
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += I0xx_aux_ptr[i * ws + j];
            sum_yy[j] += I0yy_aux_ptr[i * ws + j];
            sum_xy[j] += I0xy_aux_ptr[i * ws + j];
            sum_x[j] += I0x_aux_ptr[i * ws + j];
            sum_y[j] += I0y_aux_ptr[i * ws + j];
        }
    for (int j = 0; j < ws; j++)
    {
        I0xx_ptr[j] = sum_xx[j];
        I0yy_ptr[j] = sum_yy[j];
        I0xy_ptr[j] = sum_xy[j];
        I0x_ptr[j] = sum_x[j];
        I0y_ptr[j] = sum_y[j];
    }
    int is = 1; /* next sparse row index to fill */
    for (int i = patch_size; i < h; i++)
    {
        /* Slide the vertical window one row down for every column: */
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
            sum_yy[j] += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
            sum_xy[j] += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
            sum_x[j] += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
            sum_y[j] += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
        }
        if ((i - patch_size + 1) % patch_stride == 0)
        {
            /* Window start landed on a sparse-grid row - store a full output row: */
            for (int j = 0; j < ws; j++)
            {
                I0xx_ptr[is * ws + j] = sum_xx[j];
                I0yy_ptr[is * ws + j] = sum_yy[j];
                I0xy_ptr[is * ws + j] = sum_xy[j];
                I0x_ptr[is * ws + j] = sum_x[j];
                I0y_ptr[is * ws + j] = sum_y[j];
            }
            is++;
        }
    }
}
/* Parallel body for the patch inverse search stage. The sparse-grid rows (hs of them)
 * are split into nstripes horizontal stripes that are processed independently. */
DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes,
                                                                           int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                                                           Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1,
                                                                           Mat &_I0x, Mat &_I0y, int _num_iter,
                                                                           int _pyr_level)
    : dis(&_dis), nstripes(_nstripes), hs(_hs), Sx(&dst_Sx), Sy(&dst_Sy), Ux(&src_Ux), Uy(&src_Uy), I0(&_I0), I1(&_I1),
      I0x(&_I0x), I0y(&_I0y), num_iter(_num_iter), pyr_level(_pyr_level)
{
    /* Ceiling division so that nstripes stripes always cover all hs rows: */
    stripe_sz = (int)ceil(hs / (double)nstripes);
}
/////////////////////////////////////////////* Patch processing functions */////////////////////////////////////////////
/* Some auxiliary macros */
/* Sets up the bilinear-interpolation weight vectors (w00..w11 broadcast to 4 lanes) and
 * preloads the first row of I1, expanded from 8-bit to 32-bit lanes, along with a copy
 * shifted left by one pixel (the horizontal interpolation neighbor). Leaves I1_ptr
 * pointing at the next row. */
#define HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION                                                                         \
    v_float32x4 w00v = v_setall_f32(w00);                                                                              \
    v_float32x4 w01v = v_setall_f32(w01);                                                                              \
    v_float32x4 w10v = v_setall_f32(w10);                                                                              \
    v_float32x4 w11v = v_setall_f32(w11);                                                                              \
                                                                                                                       \
    v_uint8x16 I0_row_16, I1_row_16, I1_row_shifted_16, I1_row_next_16, I1_row_next_shifted_16;                        \
    v_uint16x8 I0_row_8, I1_row_8, I1_row_shifted_8, I1_row_next_8, I1_row_next_shifted_8, tmp;                        \
    v_uint32x4 I0_row_4_left, I1_row_4_left, I1_row_shifted_4_left, I1_row_next_4_left, I1_row_next_shifted_4_left;    \
    v_uint32x4 I0_row_4_right, I1_row_4_right, I1_row_shifted_4_right, I1_row_next_4_right,                            \
        I1_row_next_shifted_4_right;                                                                                   \
    v_float32x4 I_diff_left, I_diff_right;                                                                             \
                                                                                                                       \
    /* Preload and expand the first row of I1: */                                                                      \
    I1_row_16 = v_load(I1_ptr);                                                                                        \
    I1_row_shifted_16 = v_extract<1>(I1_row_16, I1_row_16);                                                            \
    v_expand(I1_row_16, I1_row_8, tmp);                                                                                \
    v_expand(I1_row_shifted_16, I1_row_shifted_8, tmp);                                                                \
    v_expand(I1_row_8, I1_row_4_left, I1_row_4_right);                                                                 \
    v_expand(I1_row_shifted_8, I1_row_shifted_4_left, I1_row_shifted_4_right);                                         \
    I1_ptr += I1_stride;
/* Computes I_diff_left/I_diff_right - the difference between the current I0 row and the
 * bilinearly interpolated I1 row (8 pixels, processed as two 4-lane halves). Expects the
 * state prepared by HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION. */
#define HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION                                                                      \
    /* Load the next row of I1: */                                                                                     \
    I1_row_next_16 = v_load(I1_ptr);                                                                                   \
    /* Circular shift left by 1 element: */                                                                            \
    I1_row_next_shifted_16 = v_extract<1>(I1_row_next_16, I1_row_next_16);                                             \
    /* Expand to 8 ushorts (we only need the first 8 values): */                                                       \
    v_expand(I1_row_next_16, I1_row_next_8, tmp);                                                                      \
    v_expand(I1_row_next_shifted_16, I1_row_next_shifted_8, tmp);                                                      \
    /* Separate the left and right halves: */                                                                          \
    v_expand(I1_row_next_8, I1_row_next_4_left, I1_row_next_4_right);                                                  \
    v_expand(I1_row_next_shifted_8, I1_row_next_shifted_4_left, I1_row_next_shifted_4_right);                          \
                                                                                                                       \
    /* Load current row of I0: */                                                                                      \
    I0_row_16 = v_load(I0_ptr);                                                                                        \
    v_expand(I0_row_16, I0_row_8, tmp);                                                                                \
    v_expand(I0_row_8, I0_row_4_left, I0_row_4_right);                                                                 \
                                                                                                                       \
    /* Compute diffs between I0 and bilinearly interpolated I1: */                                                     \
    I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) +                                              \
                  w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) +                                      \
                  w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) +                                         \
                  w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) -                                 \
                  v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left));                                                      \
    I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) +                                            \
                   w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) +                                    \
                   w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) +                                       \
                   w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) -                               \
                   v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right));
/* Advances the row pointers and reuses the already-expanded "next" I1 row as the current
 * row for the following loop iteration (avoids reloading it). */
#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW                                                                     \
    I0_ptr += I0_stride;                                                                                               \
    I1_ptr += I1_stride;                                                                                               \
                                                                                                                       \
    I1_row_4_left = I1_row_next_4_left;                                                                                \
    I1_row_4_right = I1_row_next_4_right;                                                                              \
    I1_row_shifted_4_left = I1_row_next_shifted_4_left;                                                                \
    I1_row_shifted_4_right = I1_row_next_shifted_4_right;
/* This function essentially performs one iteration of gradient descent when finding the most similar patch in I1 for a
* given one in I0. It assumes that I0_ptr and I1_ptr already point to the corresponding patches and w00, w01, w10, w11
* are precomputed bilinear interpolation weights. It returns the SSD (sum of squared differences) between these patches
* and computes the values (dst_dUx, dst_dUy) that are used in the flow vector update. HAL acceleration is implemented
* only for the default patch size (8x8). Everything is processed in floats as using fixed-point approximations harms
* the quality significantly.
*/
inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr, short *I0y_ptr,
                          int I0_stride, int I1_stride, float w00, float w01, float w10, float w11, int patch_sz)
{
    /* dst_dUx/dst_dUy accumulate sum(diff * I0x) and sum(diff * I0y), i.e. values
     * proportional to the gradient of the SSD with respect to the flow vector. */
    float SSD = 0.0f;
#ifdef CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 Ux_vec = v_setall_f32(0);
        v_float32x4 Uy_vec = v_setall_f32(0);
        v_float32x4 SSD_vec = v_setall_f32(0);
        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
            /* Update the sums: */
            Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
            Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        /* Final reduce operations: */
        dst_dUx = v_reduce_sum(Ux_vec);
        dst_dUy = v_reduce_sum(Uy_vec);
        SSD = v_reduce_sum(SSD_vec);
    }
    else
    {
#endif
        /* Generic scalar implementation (used for non-8x8 patch sizes): */
        dst_dUx = 0.0f;
        dst_dUy = 0.0f;
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                /* Bilinearly interpolated I1 sample minus the I0 sample: */
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                SSD += diff * diff;
                dst_dUx += diff * I0x_ptr[i * I0_stride + j];
                dst_dUy += diff * I0y_ptr[i * I0_stride + j];
            }
#ifdef CV_SIMD128
    }
#endif
    return SSD;
}
/* Same as processPatch, but with patch mean normalization, which improves robustness under changing
 * lighting conditions. Additionally needs the precomputed sums of the I0 patch gradients
 * (x_grad_sum, y_grad_sum) to remove the mean-difference contribution at the end.
 */
inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr,
                                  short *I0y_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                                  float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
{
    float sum_diff = 0.0, sum_diff_sq = 0.0;
    float n = (float)patch_sz * patch_sz;
#ifdef CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 sum_I0x_mul_vec = v_setall_f32(0);
        v_float32x4 sum_I0y_mul_vec = v_setall_f32(0);
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
            /* Update the sums: */
            sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
            sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
            sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
            sum_diff_vec += I_diff_left + I_diff_right;
            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        /* Final reduce operations: */
        sum_I0x_mul = v_reduce_sum(sum_I0x_mul_vec);
        sum_I0y_mul = v_reduce_sum(sum_I0y_mul_vec);
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
    {
#endif
        /* Generic scalar implementation (used for non-8x8 patch sizes): */
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                sum_diff += diff;
                sum_diff_sq += diff * diff;
                sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
                sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
            }
#ifdef CV_SIMD128
    }
#endif
    /* Subtract the mean-difference contribution; the return value equals the SSD of the
     * mean-subtracted patches: sum((diff - mean_diff)^2) = sum_diff_sq - sum_diff^2/n. */
    dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
    dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
    return sum_diff_sq - sum_diff * sum_diff / n;
}
/* Similar to processPatch, but compute only the sum of squared differences (SSD) between the patches
 * (no gradient-related sums). Used when evaluating candidate flow vectors.
 */
inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                        float w11, int patch_sz)
{
    float SSD = 0.0f;
#ifdef CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 SSD_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        SSD = v_reduce_sum(SSD_vec);
    }
    else
    {
#endif
        /* Generic scalar implementation (used for non-8x8 patch sizes): */
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                SSD += diff * diff;
            }
#ifdef CV_SIMD128
    }
#endif
    return SSD;
}
/* Same as computeSSD, but with patch mean normalization: the mean intensity difference
 * between the patches is removed from the result.
 */
inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01,
                                float w10, float w11, int patch_sz)
{
    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
    float n = (float)patch_sz * patch_sz;
#ifdef CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
            sum_diff_vec += I_diff_left + I_diff_right;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
    {
#endif
        /* Generic scalar implementation (used for non-8x8 patch sizes): */
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                sum_diff += diff;
                sum_diff_sq += diff * diff;
            }
#ifdef CV_SIMD128
    }
#endif
    /* SSD of the mean-subtracted patches: sum((diff - mean)^2) = sum_diff_sq - sum_diff^2/n */
    return sum_diff_sq - sum_diff * sum_diff / n;
}
#undef HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/* Runs the patch inverse search for one or more stripes of the sparse flow grid.
 * For each sparse-grid location it picks the best starting candidate (previous value,
 * temporal candidate, spatial neighbors) by SSD and then refines it with a few
 * gradient-descent iterations using the precomputed (inverse) structure tensor.
 * Even iterations traverse the grid top-left to bottom-right, odd iterations in the
 * reverse direction, so spatial propagation works in both directions. */
void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
{
    // force separate processing of stripes if we are using spatial propagation:
    if (dis->use_spatial_propagation && range.end > range.start + 1)
    {
        for (int n = range.start; n < range.end; n++)
            (*this)(Range(n, n + 1));
        return;
    }
    int psz = dis->patch_size;
    int psz2 = psz / 2;
    int w_ext = dis->w + 2 * dis->border_size; //!< width of I1_ext
    int bsz = dis->border_size;
    /* Input dense flow */
    float *Ux_ptr = Ux->ptr<float>();
    float *Uy_ptr = Uy->ptr<float>();
    /* Output sparse flow */
    float *Sx_ptr = Sx->ptr<float>();
    float *Sy_ptr = Sy->ptr<float>();
    uchar *I0_ptr = I0->ptr<uchar>();
    uchar *I1_ptr = I1->ptr<uchar>();
    short *I0x_ptr = I0x->ptr<short>();
    short *I0y_ptr = I0y->ptr<short>();
    /* Precomputed structure tensor */
    float *xx_ptr = dis->I0xx_buf.ptr<float>();
    float *yy_ptr = dis->I0yy_buf.ptr<float>();
    float *xy_ptr = dis->I0xy_buf.ptr<float>();
    /* And extra buffers for mean-normalization: */
    float *x_ptr = dis->I0x_buf.ptr<float>();
    float *y_ptr = dis->I0y_buf.ptr<float>();
    bool use_temporal_candidates = false;
    float *initial_Ux_ptr = NULL, *initial_Uy_ptr = NULL;
    if (!dis->initial_Ux.empty())
    {
        initial_Ux_ptr = dis->initial_Ux[pyr_level].ptr<float>();
        initial_Uy_ptr = dis->initial_Uy[pyr_level].ptr<float>();
        use_temporal_candidates = true;
    }
    int i, j, dir;
    int start_is, end_is, start_js, end_js;
    int start_i, start_j;
    /* Valid patch top-left corner range inside the border-extended I1 (a patch may
     * stick out of the original image but must stay inside I1_ext): */
    float i_lower_limit = bsz - psz + 1.0f;
    float i_upper_limit = bsz + dis->h - 1.0f;
    float j_lower_limit = bsz - psz + 1.0f;
    float j_upper_limit = bsz + dis->w - 1.0f;
    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
/* Clamps the patch position (i,j) displaced by (Ux,Uy) into the valid range of I1_ext
 * and computes the four bilinear interpolation weights for the fractional part: */
#define INIT_BILINEAR_WEIGHTS(Ux, Uy)                                                                                  \
    i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit);                                                       \
    j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit);                                                       \
                                                                                                                       \
    w11 = (i_I1 - floor(i_I1)) * (j_I1 - floor(j_I1));                                                                 \
    w10 = (i_I1 - floor(i_I1)) * (floor(j_I1) + 1 - j_I1);                                                             \
    w01 = (floor(i_I1) + 1 - i_I1) * (j_I1 - floor(j_I1));                                                             \
    w00 = (floor(i_I1) + 1 - i_I1) * (floor(j_I1) + 1 - j_I1);
/* Evaluates the patch distance for a candidate flow vector (Ux,Uy) into dst: */
#define COMPUTE_SSD(dst, Ux, Uy)                                                                                       \
    INIT_BILINEAR_WEIGHTS(Ux, Uy);                                                                                     \
    if (dis->use_mean_normalization)                                                                                   \
        dst = computeSSDMeanNorm(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00,  \
                                 w01, w10, w11, psz);                                                                  \
    else                                                                                                               \
        dst = computeSSD(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, w01,     \
                         w10, w11, psz);
    /* The total gradient descent budget is split evenly between the passes: */
    int num_inner_iter = (int)floor(dis->grad_descent_iter / (float)num_iter);
    for (int iter = 0; iter < num_iter; iter++)
    {
        if (iter % 2 == 0)
        {
            /* Forward pass: top-left to bottom-right */
            dir = 1;
            start_is = min(range.start * stripe_sz, hs);
            end_is = min(range.end * stripe_sz, hs);
            start_js = 0;
            end_js = dis->ws;
            start_i = start_is * dis->patch_stride;
            start_j = 0;
        }
        else
        {
            /* Backward pass: bottom-right to top-left */
            dir = -1;
            start_is = min(range.end * stripe_sz, hs) - 1;
            end_is = min(range.start * stripe_sz, hs) - 1;
            start_js = dis->ws - 1;
            end_js = -1;
            start_i = start_is * dis->patch_stride;
            start_j = (dis->ws - 1) * dis->patch_stride;
        }
        i = start_i;
        for (int is = start_is; dir * is < dir * end_is; is += dir)
        {
            j = start_j;
            for (int js = start_js; dir * js < dir * end_js; js += dir)
            {
                if (iter == 0)
                {
                    /* Using result form the previous pyramid level as the very first approximation: */
                    Sx_ptr[is * dis->ws + js] = Ux_ptr[(i + psz2) * dis->w + j + psz2];
                    Sy_ptr[is * dis->ws + js] = Uy_ptr[(i + psz2) * dis->w + j + psz2];
                }
                float min_SSD = INF, cur_SSD;
                if (use_temporal_candidates || dis->use_spatial_propagation)
                {
                    /* Score the current best-so-far vector first: */
                    COMPUTE_SSD(min_SSD, Sx_ptr[is * dis->ws + js], Sy_ptr[is * dis->ws + js]);
                }
                if (use_temporal_candidates)
                {
                    /* Try temporal candidates (vectors from the initial flow field that was passed to the function) */
                    COMPUTE_SSD(cur_SSD, initial_Ux_ptr[(i + psz2) * dis->w + j + psz2],
                                initial_Uy_ptr[(i + psz2) * dis->w + j + psz2]);
                    if (cur_SSD < min_SSD)
                    {
                        min_SSD = cur_SSD;
                        Sx_ptr[is * dis->ws + js] = initial_Ux_ptr[(i + psz2) * dis->w + j + psz2];
                        Sy_ptr[is * dis->ws + js] = initial_Uy_ptr[(i + psz2) * dis->w + j + psz2];
                    }
                }
                if (dis->use_spatial_propagation)
                {
                    /* Try spatial candidates: */
                    /* Horizontal neighbor already processed in this pass: */
                    if (dir * js > dir * start_js)
                    {
                        COMPUTE_SSD(cur_SSD, Sx_ptr[is * dis->ws + js - dir], Sy_ptr[is * dis->ws + js - dir]);
                        if (cur_SSD < min_SSD)
                        {
                            min_SSD = cur_SSD;
                            Sx_ptr[is * dis->ws + js] = Sx_ptr[is * dis->ws + js - dir];
                            Sy_ptr[is * dis->ws + js] = Sy_ptr[is * dis->ws + js - dir];
                        }
                    }
                    /* Flow vectors won't actually propagate across different stripes, which is the reason for keeping
                     * the number of stripes constant. It works well enough in practice and doesn't introduce any
                     * visible seams.
                     */
                    /* Vertical neighbor already processed in this pass: */
                    if (dir * is > dir * start_is)
                    {
                        COMPUTE_SSD(cur_SSD, Sx_ptr[(is - dir) * dis->ws + js], Sy_ptr[(is - dir) * dis->ws + js]);
                        if (cur_SSD < min_SSD)
                        {
                            min_SSD = cur_SSD;
                            Sx_ptr[is * dis->ws + js] = Sx_ptr[(is - dir) * dis->ws + js];
                            Sy_ptr[is * dis->ws + js] = Sy_ptr[(is - dir) * dis->ws + js];
                        }
                    }
                }
                /* Use the best candidate as a starting point for the gradient descent: */
                float cur_Ux = Sx_ptr[is * dis->ws + js];
                float cur_Uy = Sy_ptr[is * dis->ws + js];
                /* Computing the inverse of the structure tensor: */
                float detH = xx_ptr[is * dis->ws + js] * yy_ptr[is * dis->ws + js] -
                             xy_ptr[is * dis->ws + js] * xy_ptr[is * dis->ws + js];
                /* Guard against a (nearly) singular tensor: */
                if (abs(detH) < EPS)
                    detH = EPS;
                float invH11 = yy_ptr[is * dis->ws + js] / detH;
                float invH12 = -xy_ptr[is * dis->ws + js] / detH;
                float invH22 = xx_ptr[is * dis->ws + js] / detH;
                float prev_SSD = INF, SSD;
                float x_grad_sum = x_ptr[is * dis->ws + js];
                float y_grad_sum = y_ptr[is * dis->ws + js];
                for (int t = 0; t < num_inner_iter; t++)
                {
                    INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
                    if (dis->use_mean_normalization)
                        SSD = processPatchMeanNorm(dUx, dUy, I0_ptr + i * dis->w + j,
                                                   I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * dis->w + j,
                                                   I0y_ptr + i * dis->w + j, dis->w, w_ext, w00, w01, w10, w11, psz,
                                                   x_grad_sum, y_grad_sum);
                    else
                        SSD = processPatch(dUx, dUy, I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                           I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j, dis->w, w_ext, w00, w01,
                                           w10, w11, psz);
                    /* Newton-like update step: (dUx,dUy) premultiplied by the inverse tensor */
                    dx = invH11 * dUx + invH12 * dUy;
                    dy = invH12 * dUx + invH22 * dUy;
                    cur_Ux -= dx;
                    cur_Uy -= dy;
                    /* Break when patch distance stops decreasing */
                    if (SSD >= prev_SSD)
                        break;
                    prev_SSD = SSD;
                }
                /* If gradient descent converged to a flow vector that is very far from the initial approximation
                 * (more than patch size) then we don't use it. Noticeably improves the robustness.
                 */
                if (norm(Vec2f(cur_Ux - Sx_ptr[is * dis->ws + js], cur_Uy - Sy_ptr[is * dis->ws + js])) <= psz)
                {
                    Sx_ptr[is * dis->ws + js] = cur_Ux;
                    Sy_ptr[is * dis->ws + js] = cur_Uy;
                }
                j += dir * dis->patch_stride;
            }
            i += dir * dis->patch_stride;
        }
    }
#undef INIT_BILINEAR_WEIGHTS
#undef COMPUTE_SSD
}
/* Parallel body for the densification stage: the h rows of the dense output flow are
 * split into nstripes independent horizontal stripes. */
DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h,
                                                                 Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy,
                                                                 Mat &_I0, Mat &_I1)
    : dis(&_dis), nstripes(_nstripes), h(_h), Ux(&dst_Ux), Uy(&dst_Uy), Sx(&src_Sx), Sy(&src_Sy), I0(&_I0), I1(&_I1)
{
    /* Ceiling division so that nstripes stripes always cover all h rows: */
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function transforms a sparse optical flow field obtained by PatchInverseSearch (which computes flow values
* on a sparse grid defined by patch_stride) into a dense optical flow field by weighted averaging of values from the
* overlapping patches.
*/
/* Densifies the sparse flow for one stripe of rows [start_i, end_i). For every dense
 * pixel, all overlapping patches vote with their sparse flow vector; votes are weighted
 * by photometric consistency (small warped-image difference => large weight). */
void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
{
    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);
    /* Input sparse flow */
    float *Sx_ptr = Sx->ptr<float>();
    float *Sy_ptr = Sy->ptr<float>();
    /* Output dense flow */
    float *Ux_ptr = Ux->ptr<float>();
    float *Uy_ptr = Uy->ptr<float>();
    uchar *I0_ptr = I0->ptr<uchar>();
    uchar *I1_ptr = I1->ptr<uchar>();
    int psz = dis->patch_size;
    int pstr = dis->patch_stride;
    int i_l, i_u;
    int j_l, j_u;
    float i_m, j_m, diff;
    /* These values define the set of sparse grid locations that contain patches overlapping with the current dense flow
     * location */
    int start_is, end_is;
    int start_js, end_js;
/* Some helper macros for updating this set of sparse grid locations */
#define UPDATE_SPARSE_I_COORDINATES                                                                                    \
    if (i % pstr == 0 && i + psz <= h)                                                                                 \
        end_is++;                                                                                                      \
    if (i - psz >= 0 && (i - psz) % pstr == 0 && start_is < end_is)                                                    \
        start_is++;
#define UPDATE_SPARSE_J_COORDINATES                                                                                    \
    if (j % pstr == 0 && j + psz <= dis->w)                                                                            \
        end_js++;                                                                                                      \
    if (j - psz >= 0 && (j - psz) % pstr == 0 && start_js < end_js)                                                    \
        start_js++;
    /* Fast-forward the sparse row window [start_is, end_is] to the first row of this stripe: */
    start_is = 0;
    end_is = -1;
    for (int i = 0; i < start_i; i++)
    {
        UPDATE_SPARSE_I_COORDINATES;
    }
    for (int i = start_i; i < end_i; i++)
    {
        UPDATE_SPARSE_I_COORDINATES;
        start_js = 0;
        end_js = -1;
        for (int j = 0; j < dis->w; j++)
        {
            UPDATE_SPARSE_J_COORDINATES;
            float coef, sum_coef = 0.0f;
            float sum_Ux = 0.0f;
            float sum_Uy = 0.0f;
            /* Iterate through all the patches that overlap the current location (i,j) */
            for (int is = start_is; is <= end_is; is++)
                for (int js = start_js; js <= end_js; js++)
                {
                    /* Clamp the warped position inside the image; the EPS margin keeps
                     * the upper bilinear neighbor (i_u, j_u) in bounds: */
                    j_m = min(max(j + Sx_ptr[is * dis->ws + js], 0.0f), dis->w - 1.0f - EPS);
                    i_m = min(max(i + Sy_ptr[is * dis->ws + js], 0.0f), dis->h - 1.0f - EPS);
                    j_l = (int)j_m;
                    j_u = j_l + 1;
                    i_l = (int)i_m;
                    i_u = i_l + 1;
                    /* Bilinearly sample I1 at the warped position and compare to I0: */
                    diff = (j_m - j_l) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_u] +
                           (j_u - j_m) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_l] +
                           (j_m - j_l) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_u] +
                           (j_u - j_m) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_l] - I0_ptr[i * dis->w + j];
                    /* Weight inversely proportional to the photometric error: */
                    coef = 1 / max(1.0f, abs(diff));
                    sum_Ux += coef * Sx_ptr[is * dis->ws + js];
                    sum_Uy += coef * Sy_ptr[is * dis->ws + js];
                    sum_coef += coef;
                }
            /* Normalized weighted average of all patch votes: */
            Ux_ptr[i * dis->w + j] = sum_Ux / sum_coef;
            Uy_ptr[i * dis->w + j] = sum_Uy / sum_coef;
        }
    }
#undef UPDATE_SPARSE_I_COORDINATES
#undef UPDATE_SPARSE_J_COORDINATES
}
#ifdef HAVE_OPENCL
/* OpenCL variant of the patch inverse search. The first iteration runs the "fwd"
 * kernels (propagation + gradient-descent refinement), subsequent iterations run the
 * "bwd" kernels. Each pass is a 1D propagation kernel over sparse rows followed by a
 * 2D per-patch refinement kernel on the ws x hs sparse grid.
 * Returns false if any kernel fails to build or run.
 *
 * Refactored to use ocl::Kernel::args() instead of manual idx = k.set(idx, ...)
 * chains, for consistency with ocl_Densification; the argument order is unchanged
 * and must match the kernel signatures in dis_flow.cl.
 */
bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
                                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level)
{
    size_t globalSize[] = {(size_t)ws, (size_t)hs};
    size_t localSize[] = {16, 16};
    /* Split the total gradient-descent iteration budget evenly between the passes: */
    int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);
    for (int iter = 0; iter < num_iter; iter++)
    {
        if (iter == 0)
        {
            ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc);
            if (k1.empty())
                return false;
            size_t global_sz[] = {(size_t)hs * 8};
            size_t local_sz[] = {8};
            k1.args(ocl::KernelArg::PtrReadOnly(src_Ux), ocl::KernelArg::PtrReadOnly(src_Uy),
                    ocl::KernelArg::PtrReadOnly(I0), ocl::KernelArg::PtrReadOnly(I1),
                    (int)border_size, (int)patch_size, (int)patch_stride,
                    (int)w, (int)h, (int)ws, (int)hs, (int)pyr_level,
                    ocl::KernelArg::PtrWriteOnly(u_Sx), ocl::KernelArg::PtrWriteOnly(u_Sy));
            if (!k1.run(1, global_sz, local_sz, false))
                return false;

            ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::video::dis_flow_oclsrc);
            if (k2.empty())
                return false;
            k2.args(ocl::KernelArg::PtrReadOnly(src_Ux), ocl::KernelArg::PtrReadOnly(src_Uy),
                    ocl::KernelArg::PtrReadOnly(I0), ocl::KernelArg::PtrReadOnly(I1),
                    ocl::KernelArg::PtrReadOnly(I0x), ocl::KernelArg::PtrReadOnly(I0y),
                    ocl::KernelArg::PtrReadOnly(u_I0xx_buf), ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
                    ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
                    ocl::KernelArg::PtrReadOnly(u_I0x_buf), ocl::KernelArg::PtrReadOnly(u_I0y_buf),
                    (int)border_size, (int)patch_size, (int)patch_stride,
                    (int)w, (int)h, (int)ws, (int)hs,
                    (int)num_inner_iter, (int)pyr_level,
                    ocl::KernelArg::PtrReadWrite(u_Sx), ocl::KernelArg::PtrReadWrite(u_Sy));
            if (!k2.run(2, globalSize, localSize, false))
                return false;
        }
        else
        {
            ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc);
            if (k3.empty())
                return false;
            size_t global_sz[] = {(size_t)hs * 8};
            size_t local_sz[] = {8};
            k3.args(ocl::KernelArg::PtrReadOnly(I0), ocl::KernelArg::PtrReadOnly(I1),
                    (int)border_size, (int)patch_size, (int)patch_stride,
                    (int)w, (int)h, (int)ws, (int)hs, (int)pyr_level,
                    ocl::KernelArg::PtrReadWrite(u_Sx), ocl::KernelArg::PtrReadWrite(u_Sy));
            if (!k3.run(1, global_sz, local_sz, false))
                return false;

            ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::video::dis_flow_oclsrc);
            if (k4.empty())
                return false;
            k4.args(ocl::KernelArg::PtrReadOnly(I0), ocl::KernelArg::PtrReadOnly(I1),
                    ocl::KernelArg::PtrReadOnly(I0x), ocl::KernelArg::PtrReadOnly(I0y),
                    ocl::KernelArg::PtrReadOnly(u_I0xx_buf), ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
                    ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
                    ocl::KernelArg::PtrReadOnly(u_I0x_buf), ocl::KernelArg::PtrReadOnly(u_I0y_buf),
                    (int)border_size, (int)patch_size, (int)patch_stride,
                    (int)w, (int)h, (int)ws, (int)hs,
                    (int)num_inner_iter,
                    ocl::KernelArg::PtrReadWrite(u_Sx), ocl::KernelArg::PtrReadWrite(u_Sy));
            if (!k4.run(2, globalSize, localSize, false))
                return false;
        }
    }
    return true;
}
/* OpenCL variant of the densification stage: launches one work-item per dense flow
 * pixel (w x h grid) to turn the sparse flow (src_Sx, src_Sy) into the dense flow
 * (dst_Ux, dst_Uy). Returns false if the kernel fails to build or run. */
bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_Ux, UMat &dst_Uy, UMat &src_Sx, UMat &src_Sy, UMat &_I0, UMat &_I1)
{
    size_t global_sz[2] = {(size_t)w, (size_t)h};
    size_t local_sz[2] = {16, 16};

    ocl::Kernel k("dis_densification", ocl::video::dis_flow_oclsrc);
    k.args(ocl::KernelArg::PtrReadOnly(src_Sx), ocl::KernelArg::PtrReadOnly(src_Sy),
           ocl::KernelArg::PtrReadOnly(_I0), ocl::KernelArg::PtrReadOnly(_I1),
           (int)patch_size, (int)patch_stride,
           (int)w, (int)h, (int)ws,
           ocl::KernelArg::PtrWriteOnly(dst_Ux), ocl::KernelArg::PtrWriteOnly(dst_Uy));
    return k.run(2, global_sz, local_sz, false);
}
/* Allocate and fill the OpenCL-side image pyramids and scratch buffers.
 * I0, I1  - the two 8-bit single-channel input frames;
 * flow    - CV_32FC2 initial flow, read only when use_flow is true;
 * Pyramid levels [finest_scale .. coarsest_scale] are populated; levels below
 * finest_scale are left empty since they are never used. */
void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool use_flow)
{
    u_I0s.resize(coarsest_scale + 1);
    u_I1s.resize(coarsest_scale + 1);
    u_I1s_ext.resize(coarsest_scale + 1);
    u_I0xs.resize(coarsest_scale + 1);
    u_I0ys.resize(coarsest_scale + 1);
    u_Ux.resize(coarsest_scale + 1);
    u_Uy.resize(coarsest_scale + 1);
    vector<UMat> flow_uv(2);
    if (use_flow)
    {
        /* Split the caller-provided initial flow into separate U and V planes */
        split(flow, flow_uv);
        u_initial_Ux.resize(coarsest_scale + 1);
        u_initial_Uy.resize(coarsest_scale + 1);
    }
    int fraction = 1;              /* downscale factor of the current pyramid level (2^i) */
    int cur_rows = 0, cur_cols = 0;
    for (int i = 0; i <= coarsest_scale; i++)
    {
        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
        if (i == finest_scale)
        {
            /* Finest used level: downscale directly from the full-resolution inputs */
            cur_rows = I0.rows / fraction;
            cur_cols = I0.cols / fraction;
            u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
            resize(I0, u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
            u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
            resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
            /* These buffers are reused in each scale so we initialize them once on the finest scale: */
            u_Sx.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_Sy.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            /* _aux buffers hold horizontally-aggregated sums (one value per image row) */
            u_I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_I0x_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_I0y_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_U.create(cur_rows, cur_cols, CV_32FC2);
        }
        else if (i > finest_scale)
        {
            /* Coarser levels: halve the previous level */
            cur_rows = u_I0s[i - 1].rows / 2;
            cur_cols = u_I0s[i - 1].cols / 2;
            u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
            resize(u_I0s[i - 1], u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
            u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
            resize(u_I1s[i - 1], u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
        }
        if (i >= finest_scale)
        {
            /* Border-extended copy of I1 for out-of-image patch lookups during inverse search */
            u_I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size, CV_8UC1);
            copyMakeBorder(u_I1s[i], u_I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
            u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1);
            u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1);
            spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]);
            u_Ux[i].create(cur_rows, cur_cols, CV_32FC1);
            u_Uy[i].create(cur_rows, cur_cols, CV_32FC1);
            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
            variational_refinement_processors[i]->setSorIterations(5);
            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
            if (use_flow)
            {
                /* Rescale the initial flow to this level; flow magnitudes shrink with the image */
                resize(flow_uv[0], u_initial_Ux[i], Size(cur_cols, cur_rows));
                divide(u_initial_Ux[i], static_cast<float>(fraction), u_initial_Ux[i]);
                resize(flow_uv[1], u_initial_Uy[i], Size(cur_cols, cur_rows));
                divide(u_initial_Uy[i], static_cast<float>(fraction), u_initial_Uy[i]);
            }
        }
        fraction *= 2;
    }
}
/* Two-pass aggregation of gradient products into per-patch structure tensor
 * entries. I0x/I0y are the CV_16SC1 spatial gradients of the first frame at
 * the current scale; the dst_* buffers receive one value per patch.
 * Returns true iff both kernels were enqueued successfully. */
bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
                                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
{
    /* Pass 1: sliding-window sums along every image row -> *_aux buffers */
    ocl::Kernel k_hor("dis_precomputeStructureTensor_hor", ocl::video::dis_flow_oclsrc);
    k_hor.args(ocl::KernelArg::PtrReadOnly(I0x),
               ocl::KernelArg::PtrReadOnly(I0y),
               (int)patch_size, (int)patch_stride,
               (int)w, (int)h, (int)ws,
               ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
               ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
               ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
               ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
               ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux));
    size_t global_hor[1] = {(size_t)h}; // one work-item per image row
    size_t local_hor[1] = {16};
    if (!k_hor.run(1, global_hor, local_hor, false))
        return false;

    /* Pass 2: sliding-window sums down each aux-buffer column -> dst buffers */
    ocl::Kernel k_ver("dis_precomputeStructureTensor_ver", ocl::video::dis_flow_oclsrc);
    k_ver.args(ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
               ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
               ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
               ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
               ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
               (int)patch_size, (int)patch_stride,
               (int)w, (int)h, (int)ws,
               ocl::KernelArg::PtrWriteOnly(dst_I0xx),
               ocl::KernelArg::PtrWriteOnly(dst_I0yy),
               ocl::KernelArg::PtrWriteOnly(dst_I0xy),
               ocl::KernelArg::PtrWriteOnly(dst_I0x),
               ocl::KernelArg::PtrWriteOnly(dst_I0y));
    size_t global_ver[1] = {(size_t)ws}; // one work-item per patch column
    size_t local_ver[1] = {16};
    return k_ver.run(1, global_ver, local_ver, false);
}
/* OpenCL path of the DIS coarse-to-fine pipeline.
 * I0, I1 - 8-bit single-channel frames; flow - output CV_32FC2 flow, also
 * used as initialization when it already has the right size/type.
 * Returns false if any kernel launch fails (the caller then falls back to
 * the CPU implementation via CV_OCL_RUN). */
bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    UMat I0Mat = I0.getUMat();
    UMat I1Mat = I1.getUMat();
    bool use_input_flow = false;
    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
        use_input_flow = true;
    else
        flow.create(I1Mat.size(), CV_32FC2);
    UMat &u_flowMat = flow.getUMatRef();
    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code searches for maximal movement of width/4 */
                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0))); /* Deepest pyramid level greater or equal than patch */
    ocl_prepareBuffers(I0Mat, I1Mat, u_flowMat, use_input_flow);
    /* Start from zero flow on the coarsest level */
    u_Ux[coarsest_scale].setTo(0.0f);
    u_Uy[coarsest_scale].setTo(0.0f);
    for (int i = coarsest_scale; i >= finest_scale; i--)
    {
        /* Per-level dimensions: image size and the patch grid size */
        w = u_I0s[i].cols;
        h = u_I0s[i].rows;
        ws = 1 + (w - patch_size) / patch_stride;
        hs = 1 + (h - patch_size) / patch_stride;
        if (!ocl_precomputeStructureTensor(u_I0xx_buf, u_I0yy_buf, u_I0xy_buf,
                                           u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
            return false;
        if (!ocl_PatchInverseSearch(u_Ux[i], u_Uy[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
            return false;
        if (!ocl_Densification(u_Ux[i], u_Uy[i], u_Sx, u_Sy, u_I0s[i], u_I1s[i]))
            return false;
        if (variational_refinement_iter > 0)
            variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
                                                         u_Ux[i].getMat(ACCESS_WRITE), u_Uy[i].getMat(ACCESS_WRITE));
        if (i > finest_scale)
        {
            /* Upscale the flow to the next (finer) level; vectors double in length */
            resize(u_Ux[i], u_Ux[i - 1], u_Ux[i - 1].size());
            resize(u_Uy[i], u_Uy[i - 1], u_Uy[i - 1].size());
            multiply(u_Ux[i - 1], 2, u_Ux[i - 1]);
            multiply(u_Uy[i - 1], 2, u_Uy[i - 1]);
        }
    }
    /* Merge U/V planes, upscale to full resolution and rescale magnitudes */
    vector<UMat> uxy(2);
    uxy[0] = u_Ux[finest_scale];
    uxy[1] = u_Uy[finest_scale];
    merge(uxy, u_U);
    resize(u_U, u_flowMat, u_flowMat.size());
    multiply(u_flowMat, 1 << finest_scale, u_flowMat);
    return true;
}
#endif
/* Main entry point: computes dense optical flow from I0 to I1.
 * I0, I1 - continuous 8-bit single-channel frames of equal size;
 * flow   - output CV_32FC2 flow; if it already matches I0 in size/type it is
 *          also used as the initial flow estimate.
 * Dispatches to the OpenCL path when available (Intel device, patch_size 8,
 * spatial propagation on), otherwise runs the multithreaded CPU pipeline. */
void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
    CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
    CV_Assert(I0.sameSize(I1));
    CV_Assert(I0.isContinuous());
    CV_Assert(I1.isContinuous());
    CV_OCL_RUN(ocl::Device::getDefault().isIntel() && flow.isUMat() &&
               (patch_size == 8) && (use_spatial_propagation == true),
               ocl_calc(I0, I1, flow));
    Mat I0Mat = I0.getMat();
    Mat I1Mat = I1.getMat();
    bool use_input_flow = false;
    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
        use_input_flow = true;
    else
        flow.create(I1Mat.size(), CV_32FC2);
    Mat flowMat = flow.getMat();
    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code searches for maximal movement of width/4 */
                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0))); /* Deepest pyramid level greater or equal than patch */
    int num_stripes = getNumThreads();
    prepareBuffers(I0Mat, I1Mat, flowMat, use_input_flow);
    /* Start from zero flow on the coarsest level */
    Ux[coarsest_scale].setTo(0.0f);
    Uy[coarsest_scale].setTo(0.0f);
    for (int i = coarsest_scale; i >= finest_scale; i--)
    {
        /* Per-level dimensions: image size and the patch grid size */
        w = I0s[i].cols;
        h = I0s[i].rows;
        ws = 1 + (w - patch_size) / patch_stride;
        hs = 1 + (h - patch_size) / patch_stride;
        precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x_buf, I0y_buf, I0xs[i], I0ys[i]);
        if (use_spatial_propagation)
        {
            /* Use a fixed number of stripes regardless the number of threads to make inverse search
             * with spatial propagation reproducible
             */
            parallel_for_(Range(0, 8), PatchInverseSearch_ParBody(*this, 8, hs, Sx, Sy, Ux[i], Uy[i], I0s[i],
                                                                  I1s_ext[i], I0xs[i], I0ys[i], 2, i));
        }
        else
        {
            parallel_for_(Range(0, num_stripes),
                          PatchInverseSearch_ParBody(*this, num_stripes, hs, Sx, Sy, Ux[i], Uy[i], I0s[i], I1s_ext[i],
                                                     I0xs[i], I0ys[i], 1, i));
        }
        parallel_for_(Range(0, num_stripes),
                      Densification_ParBody(*this, num_stripes, I0s[i].rows, Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]));
        if (variational_refinement_iter > 0)
            variational_refinement_processors[i]->calcUV(I0s[i], I1s[i], Ux[i], Uy[i]);
        if (i > finest_scale)
        {
            /* Upscale the flow to the next (finer) level; vectors double in length */
            resize(Ux[i], Ux[i - 1], Ux[i - 1].size());
            resize(Uy[i], Uy[i - 1], Uy[i - 1].size());
            Ux[i - 1] *= 2;
            Uy[i - 1] *= 2;
        }
    }
    /* Merge U/V planes, upscale to full resolution and rescale magnitudes */
    Mat uxy[] = {Ux[finest_scale], Uy[finest_scale]};
    merge(uxy, 2, U);
    resize(U, flowMat, flowMat.size());
    flowMat *= 1 << finest_scale;
}
void DISOpticalFlowImpl::collectGarbage()
{
I0s.clear();
I1s.clear();
I1s_ext.clear();
I0xs.clear();
I0ys.clear();
Ux.clear();
Uy.clear();
U.release();
Sx.release();
Sy.release();
I0xx_buf.release();
I0yy_buf.release();
I0xy_buf.release();
I0xx_buf_aux.release();
I0yy_buf_aux.release();
I0xy_buf_aux.release();
#ifdef HAVE_OPENCL
u_I0s.clear();
u_I1s.clear();
u_I1s_ext.clear();
u_I0xs.clear();
u_I0ys.clear();
u_Ux.clear();
u_Uy.clear();
u_U.release();
u_Sx.release();
u_Sy.release();
u_I0xx_buf.release();
u_I0yy_buf.release();
u_I0xy_buf.release();
u_I0xx_buf_aux.release();
u_I0yy_buf_aux.release();
u_I0xy_buf_aux.release();
#endif
for (int i = finest_scale; i <= coarsest_scale; i++)
variational_refinement_processors[i]->collectGarbage();
variational_refinement_processors.clear();
}
/* Factory: creates a DIS optical flow instance configured according to one
 * of the speed/quality presets (PRESET_ULTRAFAST, PRESET_FAST,
 * PRESET_MEDIUM). An unknown preset value leaves the implementation's
 * default parameters untouched (except the patch size, which is always 8). */
Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset)
{
    Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
    dis->setPatchSize(8);
    switch (preset)
    {
    case DISOpticalFlow::PRESET_ULTRAFAST:
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(12);
        dis->setVariationalRefinementIterations(0);
        break;
    case DISOpticalFlow::PRESET_FAST:
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(16);
        dis->setVariationalRefinementIterations(5);
        break;
    case DISOpticalFlow::PRESET_MEDIUM:
        dis->setFinestScale(1);
        dis->setPatchStride(3);
        dis->setGradientDescentIterations(25);
        dis->setVariationalRefinementIterations(5);
        break;
    default:
        break;
    }
    return dis;
}
}
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#define EPS 0.001f
#define INF 1E+10F
/* Horizontal pass of the structure tensor aggregation.
 * One work-item per image row i: computes sliding-window sums (window width
 * patch_size, advanced by patch_stride) of the gradient products
 * Ix*Ix, Iy*Iy, Ix*Iy and of plain Ix, Iy, and writes one value per patch
 * column into the *_aux buffers (row-major, ws values per row).
 * The initial vload8 covers exactly one window: the host only dispatches the
 * OpenCL path when patch_size == 8. */
__kernel void dis_precomputeStructureTensor_hor(__global const short *I0x,
                                                __global const short *I0y,
                                                int patch_size, int patch_stride,
                                                int w, int h, int ws,
                                                __global float *I0xx_aux_ptr,
                                                __global float *I0yy_aux_ptr,
                                                __global float *I0xy_aux_ptr,
                                                __global float *I0x_aux_ptr,
                                                __global float *I0y_aux_ptr)
{
    int i = get_global_id(0);
    if (i >= h) return;

    const __global short *x_row = I0x + i * w;
    const __global short *y_row = I0y + i * w;

    /* Sums over the first (leftmost) window of this row */
    float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
    float8 x_vec = convert_float8(vload8(0, x_row));
    float8 y_vec = convert_float8(vload8(0, y_row));
    sum_xx = dot(x_vec.lo, x_vec.lo) + dot(x_vec.hi, x_vec.hi);
    sum_yy = dot(y_vec.lo, y_vec.lo) + dot(y_vec.hi, y_vec.hi);
    sum_xy = dot(x_vec.lo, y_vec.lo) + dot(x_vec.hi, y_vec.hi);
    sum_x = dot(x_vec.lo, 1.0f) + dot(x_vec.hi, 1.0f);
    sum_y = dot(y_vec.lo, 1.0f) + dot(y_vec.hi, 1.0f);

    I0xx_aux_ptr[i * ws] = sum_xx;
    I0yy_aux_ptr[i * ws] = sum_yy;
    I0xy_aux_ptr[i * ws] = sum_xy;
    I0x_aux_ptr[i * ws] = sum_x;
    I0y_aux_ptr[i * ws] = sum_y;

    /* Slide the window right one pixel at a time: add the entering column,
       subtract the leaving one; emit a value every patch_stride pixels */
    int js = 1;
    for (int j = patch_size; j < w; j++)
    {
        short x_val1 = x_row[j];
        short x_val2 = x_row[j - patch_size];
        short y_val1 = y_row[j];
        short y_val2 = y_row[j - patch_size];
        sum_xx += (x_val1 * x_val1 - x_val2 * x_val2);
        sum_yy += (y_val1 * y_val1 - y_val2 * y_val2);
        sum_xy += (x_val1 * y_val1 - x_val2 * y_val2);
        sum_x += (x_val1 - x_val2);
        sum_y += (y_val1 - y_val2);
        if ((j - patch_size + 1) % patch_stride == 0)
        {
            int index = i * ws + js;
            I0xx_aux_ptr[index] = sum_xx;
            I0yy_aux_ptr[index] = sum_yy;
            I0xy_aux_ptr[index] = sum_xy;
            I0x_aux_ptr[index] = sum_x;
            I0y_aux_ptr[index] = sum_y;
            js++;
        }
    }
}
/* Vertical pass of the structure tensor aggregation.
 * One work-item per patch column j: runs a sliding window (height patch_size,
 * advanced by patch_stride) down column j of the per-row aggregates produced
 * by the horizontal pass, writing one value per patch into the final
 * structure tensor buffers. */
__kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_ptr,
                                                __global const float *I0yy_aux_ptr,
                                                __global const float *I0xy_aux_ptr,
                                                __global const float *I0x_aux_ptr,
                                                __global const float *I0y_aux_ptr,
                                                int patch_size, int patch_stride,
                                                int w, int h, int ws,
                                                __global float *I0xx_ptr,
                                                __global float *I0yy_ptr,
                                                __global float *I0xy_ptr,
                                                __global float *I0x_ptr,
                                                __global float *I0y_ptr)
{
    int j = get_global_id(0);
    if (j >= ws) return;

    /* Accumulate the topmost window of this column */
    float acc_xx = 0.0f, acc_yy = 0.0f, acc_xy = 0.0f, acc_x = 0.0f, acc_y = 0.0f;
    for (int row = 0; row < patch_size; row++)
    {
        int idx = row * ws + j;
        acc_xx += I0xx_aux_ptr[idx];
        acc_yy += I0yy_aux_ptr[idx];
        acc_xy += I0xy_aux_ptr[idx];
        acc_x += I0x_aux_ptr[idx];
        acc_y += I0y_aux_ptr[idx];
    }
    I0xx_ptr[j] = acc_xx;
    I0yy_ptr[j] = acc_yy;
    I0xy_ptr[j] = acc_xy;
    I0x_ptr[j] = acc_x;
    I0y_ptr[j] = acc_y;

    /* Slide the window down one row at a time: add the entering row,
       subtract the leaving one; emit a value every patch_stride rows */
    int is = 1;
    for (int row = patch_size; row < h; row++)
    {
        int add_idx = row * ws + j;
        int sub_idx = (row - patch_size) * ws + j;
        acc_xx += (I0xx_aux_ptr[add_idx] - I0xx_aux_ptr[sub_idx]);
        acc_yy += (I0yy_aux_ptr[add_idx] - I0yy_aux_ptr[sub_idx]);
        acc_xy += (I0xy_aux_ptr[add_idx] - I0xy_aux_ptr[sub_idx]);
        acc_x += (I0x_aux_ptr[add_idx] - I0x_aux_ptr[sub_idx]);
        acc_y += (I0y_aux_ptr[add_idx] - I0y_aux_ptr[sub_idx]);
        if ((row - patch_size + 1) % patch_stride == 0)
        {
            int out_idx = is * ws + j;
            I0xx_ptr[out_idx] = acc_xx;
            I0yy_ptr[out_idx] = acc_yy;
            I0xy_ptr[out_idx] = acc_xy;
            I0x_ptr[out_idx] = acc_x;
            I0y_ptr[out_idx] = acc_y;
            is++;
        }
    }
}
/* Densification kernel: one work-item per output pixel (x, y).
 * Converts the sparse per-patch flow (sx, sy) into dense per-pixel flow
 * (ux, uy): every patch overlapping the pixel votes with its displacement,
 * weighted by how well it matches — the inverse of the absolute intensity
 * difference between I1 sampled at the displaced position (bilinear) and
 * I0 at the pixel. */
__kernel void dis_densification(__global const float *sx, __global const float *sy,
                                __global const uchar *i0, __global const uchar *i1,
                                int psz, int pstr,
                                int w, int h, int ws,
                                __global float *ux, __global float *uy)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int i, j;

    if (x >= w || y >= h) return;

    /* Range of patch-grid indices whose patches cover pixel (x, y) */
    int start_is, end_is;
    int start_js, end_js;

    end_is = min(y / pstr, (h - psz) / pstr);
    start_is = max(0, y - psz + pstr) / pstr;
    start_is = min(start_is, end_is);

    end_js = min(x / pstr, (w - psz) / pstr);
    start_js = max(0, x - psz + pstr) / pstr;
    start_js = min(start_js, end_js);

    float coef, sum_coef = 0.0f;
    float sum_Ux = 0.0f;
    float sum_Uy = 0.0f;

    int i_l, i_u;
    int j_l, j_u;
    float i_m, j_m, diff;

    i = y;
    j = x;

    /* Iterate through all the patches that overlap the current location (i,j) */
    for (int is = start_is; is <= end_is; is++)
        for (int js = start_js; js <= end_js; js++)
        {
            float sx_val = sx[is * ws + js];
            float sy_val = sy[is * ws + js];
            uchar2 i1_vec1, i1_vec2;

            /* Displaced position in I1, clamped inside the image */
            j_m = min(max(j + sx_val, 0.0f), w - 1.0f - EPS);
            i_m = min(max(i + sy_val, 0.0f), h - 1.0f - EPS);
            j_l = (int)j_m;
            j_u = j_l + 1;
            i_l = (int)i_m;
            i_u = i_l + 1;
            /* Bilinear sample of I1 at (i_m, j_m) minus I0 at (i, j) */
            i1_vec1 = vload2(0, i1 + i_u * w + j_l);
            i1_vec2 = vload2(0, i1 + i_l * w + j_l);
            diff = (j_m - j_l) * (i_m - i_l) * i1_vec1.y +
                   (j_u - j_m) * (i_m - i_l) * i1_vec1.x +
                   (j_m - j_l) * (i_u - i_m) * i1_vec2.y +
                   (j_u - j_m) * (i_u - i_m) * i1_vec2.x - i0[i * w + j];
            /* Better-matching patches get a larger weight */
            coef = 1 / max(1.0f, fabs(diff));
            sum_Ux += coef * sx_val;
            sum_Uy += coef * sy_val;
            sum_coef += coef;
        }
    /* Normalized weighted average over all covering patches */
    ux[i * w + j] = sum_Ux / sum_coef;
    uy[i * w + j] = sum_Uy / sum_coef;
}
/* Clamp the displaced patch origin (i + Uy, j + Ux) into the border-extended
 * I1 image (offset by bsz, limited to [*_lower_limit, *_upper_limit]) and
 * compute the four bilinear interpolation weights w00..w11 from its
 * fractional part. Expects i, j, bsz, the *_limit variables and
 * i_I1, j_I1, w00..w11 to already be in scope at the expansion site. */
#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
    i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \
    j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \
    \
    w11 = (i_I1 - floor(i_I1)) * (j_I1 - floor(j_I1)); \
    w10 = (i_I1 - floor(i_I1)) * (floor(j_I1) + 1 - j_I1); \
    w01 = (floor(i_I1) + 1 - i_I1) * (j_I1 - floor(j_I1)); \
    w00 = (floor(i_I1) + 1 - i_I1) * (floor(j_I1) + 1 - j_I1);
/* Mean-normalized SSD between an 8x8 patch of I0 and a bilinearly-shifted
 * patch of I1 (weights w00..w11 from INIT_BILINEAR_WEIGHTS).
 * Each invocation processes one patch row selected by i; callers pass
 * i = get_sub_group_local_id() so the sub-group lanes cover the rows, and
 * sub_group_reduce_add combines the per-row partial sums. The row width of
 * 8 matches patch_size == 8, the only configuration the host dispatches.
 * Returns sum((I1w - I0)^2) - (sum(I1w - I0))^2 / n, i.e. SSD with the mean
 * difference removed. */
float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
                         int I0_stride, int I1_stride,
                         float w00, float w01, float w10, float w11, int patch_sz, int i)
{
    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
    int n = patch_sz * patch_sz;

    uchar8 I1_vec1, I1_vec2, I0_vec;
    uchar I1_val1, I1_val2;

    I0_vec = vload8(0, I0_ptr + i * I0_stride);
    I1_vec1 = vload8(0, I1_ptr + i * I1_stride);
    I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
    /* 9th element of each I1 row, needed for the 1-pixel-shifted terms */
    I1_val1 = I1_ptr[i * I1_stride + 8];
    I1_val2 = I1_ptr[(i + 1) * I1_stride + 8];

    /* Bilinear combination of the 2x2 neighborhood rows minus the I0 row */
    float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
                 w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
                 convert_float8(I0_vec);

    sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
    sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));

    sum_diff = sub_group_reduce_add(sum_diff);
    sum_diff_sq = sub_group_reduce_add(sum_diff_sq);

    return sum_diff_sq - sum_diff * sum_diff / n;
}
/* Forward spatial-propagation pass of patch inverse search.
 * 8 work-items cooperate on each patch row (one sub-group lane per patch
 * image row, see computeSSDMeanNorm); rows are processed left-to-right so
 * each patch can inherit its left neighbor's displacement when that gives a
 * lower mean-normalized SSD. Results go to Sx_ptr/Sy_ptr. */
__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr,
                                             __global const uchar *I0_ptr, __global const uchar *I1_ptr,
                                             int border_size, int patch_size, int patch_stride,
                                             int w, int h, int ws, int hs, int pyr_level,
                                             __global float *Sx_ptr, __global float *Sy_ptr)
{
    int id = get_global_id(0);
    int is = id / 8;           /* patch-grid row handled by this group of 8 lanes */
    if (id >= (hs * 8)) return;

    int i = is * patch_stride;
    int j = 0;
    int psz = patch_size;
    int psz2 = psz / 2;
    int w_ext = w + 2 * border_size;  /* width of the border-extended I1 */
    int bsz = border_size;

    /* Clamping limits for displaced patch origins (see INIT_BILINEAR_WEIGHTS) */
    float i_lower_limit = bsz - psz + 1.0f;
    float i_upper_limit = bsz + h - 1.0f;
    float j_lower_limit = bsz - psz + 1.0f;
    float j_upper_limit = bsz + w - 1.0f;
    float i_I1, j_I1, w00, w01, w10, w11;

    /* Leftmost patch: seed with the dense flow at the patch center */
    float prev_Ux = Ux_ptr[(i + psz2) * w + j + psz2];
    float prev_Uy = Uy_ptr[(i + psz2) * w + j + psz2];
    Sx_ptr[is * ws] = prev_Ux;
    Sy_ptr[is * ws] = prev_Uy;
    j += patch_stride;

    int sid = get_sub_group_local_id();
    for (int js = 1; js < ws; js++, j += patch_stride)
    {
        float min_SSD, cur_SSD;

        /* Candidate 1: dense flow at this patch's center */
        float Ux = Ux_ptr[(i + psz2) * w + j + psz2];
        float Uy = Uy_ptr[(i + psz2) * w + j + psz2];

        INIT_BILINEAR_WEIGHTS(Ux, Uy);
        min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                     w, w_ext, w00, w01, w10, w11, psz, sid);

        /* Candidate 2: displacement propagated from the left neighbor */
        INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
        cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                     w, w_ext, w00, w01, w10, w11, psz, sid);
        if (cur_SSD < min_SSD)
        {
            Ux = prev_Ux;
            Uy = prev_Uy;
        }

        prev_Ux = Ux;
        prev_Uy = Uy;
        Sx_ptr[is * ws + js] = Ux;
        Sy_ptr[is * ws + js] = Uy;
    }
}
/* Single Gauss-Newton evaluation over an 8x8 patch (loop bound 8 matches
 * patch_size == 8, the only configuration the host dispatches).
 * Computes the mean-normalized SSD between the I0 patch and the bilinearly
 * shifted I1 patch, plus the correlations of the difference with the I0
 * gradients needed for the update step.
 * Returns (SSD, dSSD/dUx-numerator, dSSD/dUy-numerator) as a float3. */
float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
                            const __global short *I0x_ptr, const __global short *I0y_ptr,
                            int I0_stride, int I1_stride, float w00, float w01, float w10,
                            float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
{
    float sum_diff = 0.0, sum_diff_sq = 0.0;
    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
    int n = patch_sz * patch_sz;
    uchar8 I1_vec1, I1_vec2;
    uchar I1_val1, I1_val2;

    for (int i = 0; i < 8; i++)
    {
        uchar8 I0_vec = vload8(0, I0_ptr + i * I0_stride);

        /* Each I1 row is reused as the "upper" row of the next iteration */
        I1_vec1 = (i == 0) ? vload8(0, I1_ptr + i * I1_stride) : I1_vec2;
        I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
        I1_val1 = (i == 0) ? I1_ptr[i * I1_stride + patch_sz] : I1_val2;
        I1_val2 = I1_ptr[(i + 1) * I1_stride + patch_sz];

        /* Bilinear combination of the 2x2 neighborhood minus the I0 row */
        float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
                     w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
                     convert_float8(I0_vec);
        sum_diff += (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
        sum_diff_sq += (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));

        /* Correlate the difference with the I0 spatial gradients */
        short8 I0x_vec = vload8(0, I0x_ptr + i * I0_stride);
        short8 I0y_vec = vload8(0, I0y_ptr + i * I0_stride);

        sum_I0x_mul += dot(vec.lo, convert_float4(I0x_vec.lo));
        sum_I0x_mul += dot(vec.hi, convert_float4(I0x_vec.hi));
        sum_I0y_mul += dot(vec.lo, convert_float4(I0y_vec.lo));
        sum_I0y_mul += dot(vec.hi, convert_float4(I0y_vec.hi));
    }

    /* Mean-normalize: subtract the contribution of the average difference */
    float dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
    float dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
    float SSD = sum_diff_sq - sum_diff * sum_diff / n;

    return (float3)(SSD, dst_dUx, dst_dUy);
}
/* Gauss-Newton refinement pass after forward propagation: one work-item per
 * patch (js, is). Starting from the propagated displacement in Sx/Sy, runs
 * up to num_inner_iter inverse-search iterations using the precomputed
 * structure tensor (xx/yy/xy) and gradient sums (x/y), and writes the result
 * back to Sx/Sy unless the total update exceeds one patch size. */
__kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __global const float *Uy_ptr,
                                             __global const uchar *I0_ptr, __global const uchar *I1_ptr,
                                             __global const short *I0x_ptr, __global const short *I0y_ptr,
                                             __global const float *xx_ptr, __global const float *yy_ptr,
                                             __global const float *xy_ptr,
                                             __global const float *x_ptr, __global const float *y_ptr,
                                             int border_size, int patch_size, int patch_stride,
                                             int w, int h, int ws, int hs, int num_inner_iter, int pyr_level,
                                             __global float *Sx_ptr, __global float *Sy_ptr)
{
    int js = get_global_id(0);
    int is = get_global_id(1);
    int i = is * patch_stride;
    int j = js * patch_stride;
    int psz = patch_size;
    int psz2 = psz / 2;
    int w_ext = w + 2 * border_size;  /* width of the border-extended I1 */
    int bsz = border_size;
    int index = is * ws + js;

    if (js >= ws || is >= hs) return;

    float Ux = Sx_ptr[index];
    float Uy = Sy_ptr[index];
    float cur_Ux = Ux;
    float cur_Uy = Uy;

    /* Invert the 2x2 structure tensor (Hessian approximation) once per patch */
    float cur_xx = xx_ptr[index];
    float cur_yy = yy_ptr[index];
    float cur_xy = xy_ptr[index];
    float detH = cur_xx * cur_yy - cur_xy * cur_xy;

    if (fabs(detH) < EPS) detH = EPS;
    float invH11 = cur_yy / detH;
    float invH12 = -cur_xy / detH;
    float invH22 = cur_xx / detH;
    float prev_SSD = INF, SSD;
    float x_grad_sum = x_ptr[index];
    float y_grad_sum = y_ptr[index];

    /* Clamping limits for displaced patch origins (see INIT_BILINEAR_WEIGHTS) */
    float i_lower_limit = bsz - psz + 1.0f;
    float i_upper_limit = bsz + h - 1.0f;
    float j_lower_limit = bsz - psz + 1.0f;
    float j_upper_limit = bsz + w - 1.0f;
    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
    float3 res;

    /* Gauss-Newton iterations; stop early when the SSD no longer decreases */
    for (int t = 0; t < num_inner_iter; t++)
    {
        INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
        res = processPatchMeanNorm(I0_ptr + i * w + j,
                                   I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * w + j,
                                   I0y_ptr + i * w + j, w, w_ext, w00, w01, w10, w11, psz,
                                   x_grad_sum, y_grad_sum);

        SSD = res.x;
        dUx = res.y;
        dUy = res.z;
        dx = invH11 * dUx + invH12 * dUy;
        dy = invH12 * dUx + invH22 * dUy;

        cur_Ux -= dx;
        cur_Uy -= dy;

        if (SSD >= prev_SSD)
            break;
        prev_SSD = SSD;
    }

    /* Reject diverged updates: keep the result only if the total displacement
       change stays within one patch size */
    float2 vec = (float2)(cur_Ux - Ux, cur_Uy - Uy);
    if (dot(vec, vec) <= (float)(psz * psz))
    {
        Sx_ptr[index] = cur_Ux;
        Sy_ptr[index] = cur_Uy;
    }
}
/* Backward spatial-propagation pass: mirror of the fwd_1 kernel.
 * 8 work-items cooperate on each patch row (one sub-group lane per patch
 * image row); rows are processed bottom-up and patches right-to-left, so
 * each patch can inherit its right neighbor's displacement from Sx/Sy when
 * that gives a lower mean-normalized SSD. Updates Sx/Sy in place. */
__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
                                             int border_size, int patch_size, int patch_stride,
                                             int w, int h, int ws, int hs, int pyr_level,
                                             __global float *Sx_ptr, __global float *Sy_ptr)
{
    int id = get_global_id(0);
    int is = id / 8;           /* patch-grid row handled by this group of 8 lanes */
    if (id >= (hs * 8)) return;

    is = (hs - 1 - is);        /* traverse rows bottom-up */
    int i = is * patch_stride;
    int j = (ws - 2) * patch_stride;  /* start at the second-to-last column */
    int psz = patch_size;
    int psz2 = psz / 2;
    int w_ext = w + 2 * border_size;  /* width of the border-extended I1 */
    int bsz = border_size;

    /* Clamping limits for displaced patch origins (see INIT_BILINEAR_WEIGHTS) */
    float i_lower_limit = bsz - psz + 1.0f;
    float i_upper_limit = bsz + h - 1.0f;
    float j_lower_limit = bsz - psz + 1.0f;
    float j_upper_limit = bsz + w - 1.0f;
    float i_I1, j_I1, w00, w01, w10, w11;

    int sid = get_sub_group_local_id();
    for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
    {
        float min_SSD, cur_SSD;
        /* .x = this patch's current displacement, .y = right neighbor's */
        float2 Ux = vload2(0, Sx_ptr + is * ws + js);
        float2 Uy = vload2(0, Sy_ptr + is * ws + js);

        INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
        min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                     w, w_ext, w00, w01, w10, w11, psz, sid);

        INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
        cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                     w, w_ext, w00, w01, w10, w11, psz, sid);
        if (cur_SSD < min_SSD)
        {
            /* Right neighbor matches better: propagate its displacement */
            Sx_ptr[is * ws + js] = Ux.y;
            Sy_ptr[is * ws + js] = Uy.y;
        }
    }
}
/* Gauss-Newton refinement pass after backward propagation: one work-item per
 * patch, with grid indices mirrored so patches are visited bottom-right to
 * top-left. Identical update rule to fwd_2: refine the displacement from
 * Sx/Sy with up to num_inner_iter iterations and write it back unless the
 * total update exceeds one patch size. */
__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
                                             __global const short *I0x_ptr, __global const short *I0y_ptr,
                                             __global const float *xx_ptr, __global const float *yy_ptr,
                                             __global const float *xy_ptr,
                                             __global const float *x_ptr, __global const float *y_ptr,
                                             int border_size, int patch_size, int patch_stride,
                                             int w, int h, int ws, int hs, int num_inner_iter,
                                             __global float *Sx_ptr, __global float *Sy_ptr)
{
    int js = get_global_id(0);
    int is = get_global_id(1);
    if (js >= ws || is >= hs) return;

    /* Mirror the grid indices: traverse from the bottom-right corner */
    js = (ws - 1 - js);
    is = (hs - 1 - is);

    int j = js * patch_stride;
    int i = is * patch_stride;
    int psz = patch_size;
    int psz2 = psz / 2;
    int w_ext = w + 2 * border_size;  /* width of the border-extended I1 */
    int bsz = border_size;
    int index = is * ws + js;

    float Ux = Sx_ptr[index];
    float Uy = Sy_ptr[index];
    float cur_Ux = Ux;
    float cur_Uy = Uy;

    /* Invert the 2x2 structure tensor (Hessian approximation) once per patch */
    float cur_xx = xx_ptr[index];
    float cur_yy = yy_ptr[index];
    float cur_xy = xy_ptr[index];
    float detH = cur_xx * cur_yy - cur_xy * cur_xy;

    if (fabs(detH) < EPS) detH = EPS;
    float invH11 = cur_yy / detH;
    float invH12 = -cur_xy / detH;
    float invH22 = cur_xx / detH;
    float prev_SSD = INF, SSD;
    float x_grad_sum = x_ptr[index];
    float y_grad_sum = y_ptr[index];

    /* Clamping limits for displaced patch origins (see INIT_BILINEAR_WEIGHTS) */
    float i_lower_limit = bsz - psz + 1.0f;
    float i_upper_limit = bsz + h - 1.0f;
    float j_lower_limit = bsz - psz + 1.0f;
    float j_upper_limit = bsz + w - 1.0f;
    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
    float3 res;

    /* Gauss-Newton iterations; stop early when the SSD no longer decreases */
    for (int t = 0; t < num_inner_iter; t++)
    {
        INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
        res = processPatchMeanNorm(I0_ptr + i * w + j,
                                   I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * w + j,
                                   I0y_ptr + i * w + j, w, w_ext, w00, w01, w10, w11, psz,
                                   x_grad_sum, y_grad_sum);

        SSD = res.x;
        dUx = res.y;
        dUy = res.z;
        dx = invH11 * dUx + invH12 * dUy;
        dy = invH12 * dUx + invH22 * dUy;

        cur_Ux -= dx;
        cur_Uy -= dy;

        if (SSD >= prev_SSD)
            break;
        prev_SSD = SSD;
    }

    /* Reject diverged updates: keep the result only if the total displacement
       change stays within one patch size */
    float2 vec = (float2)(cur_Ux - Ux, cur_Uy - Uy);
    if ((dot(vec, vec)) <= (float)(psz * psz))
    {
        Sx_ptr[index] = cur_Ux;
        Sy_ptr[index] = cur_Uy;
    }
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jin Ma jin@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/* Central-difference image gradient with replicated borders.
 * One work-item per pixel (x, y); writes dI/dx into dx and dI/dy into dy,
 * each scaled by 0.5 as usual for centered differences. Neighbor indices are
 * clamped to the image so border pixels use one-sided differences. */
__kernel void centeredGradientKernel(__global const float* src_ptr, int src_col, int src_row, int src_step,
                                     __global float* dx, __global float* dy, int d_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if (x >= src_col || y >= src_row)
        return;

    int x_right = min(x + 1, src_col - 1);
    int x_left = max(x - 1, 0);
    int y_down = min(y + 1, src_row - 1);
    int y_up = max(y - 1, 0);

    dx[y * d_step + x] = 0.5f * (src_ptr[y * src_step + x_right] - src_ptr[y * src_step + x_left]);
    dy[y * d_step + x] = 0.5f * (src_ptr[y_down * src_step + x] - src_ptr[y_up * src_step + x]);
}
/* Bicubic interpolation weight for a sample at (signed) offset x_ from the
 * interpolation point; nonzero only for |x_| < 2. The two cubic segments use
 * the standard a = -0.5 coefficients. */
inline float bicubicCoeff(float x_)
{
    float x = fabs(x_);
    if (x >= 2.0f)
        return 0.0f;                                      /* outside the support */
    if (x > 1.0f)
        return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f; /* outer segment, 1 < |x| < 2 */
    return x * x * (1.5f * x - 2.5f) + 1.0f;               /* inner segment, |x| <= 1 */
}
/* Warp I1 (and its gradients) backward by the current flow (u1, u2) using
 * bicubic interpolation over a 4x4 neighborhood read from image objects.
 * For each pixel writes the warped intensity I1w, warped gradients
 * I1wx/I1wy, squared gradient magnitude grad, and the constant part of the
 * TV-L1 residual rho. */
__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,
                                 image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,
                                 __global const float* u1, int u1_step,
                                 __global const float* u2,
                                 __global float* I1w,
                                 __global float* I1wx, /*int I1wx_step,*/
                                 __global float* I1wy, /*int I1wy_step,*/
                                 __global float* grad, /*int grad_step,*/
                                 __global float* rho,
                                 int I1w_step,
                                 int u2_step,
                                 int u1_offset_x,
                                 int u1_offset_y,
                                 int u2_offset_x,
                                 int u2_offset_y)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if(x < I0_col&&y < I0_row)
    {
        //float u1Val = u1(y, x);
        float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
        //float u2Val = u2(y, x);
        float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];

        /* Source position in I1 and the 4x4 bicubic footprint around it */
        float wx = x + u1Val;
        float wy = y + u2Val;

        int xmin = ceil(wx - 2.0f);
        int xmax = floor(wx + 2.0f);

        int ymin = ceil(wy - 2.0f);
        int ymax = floor(wy + 2.0f);

        float sum  = 0.0f;
        float sumx = 0.0f;
        float sumy = 0.0f;
        float wsum = 0.0f;
        sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

        /* Accumulate bicubic-weighted samples of I1 and its gradients */
        for (int cy = ymin; cy <= ymax; ++cy)
        {
            for (int cx = xmin; cx <= xmax; ++cx)
            {
                float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);

                //sum += w * tex2D(tex_I1 , cx, cy);
                int2 cood = (int2)(cx, cy);
                sum += w * read_imagef(tex_I1, sampleri, cood).x;
                //sumx += w * tex2D(tex_I1x, cx, cy);
                sumx += w * read_imagef(tex_I1x, sampleri, cood).x;
                //sumy += w * tex2D(tex_I1y, cx, cy);
                sumy += w * read_imagef(tex_I1y, sampleri, cood).x;
                wsum += w;
            }
        }

        float coeff = 1.0f / wsum;

        float I1wVal  = sum  * coeff;
        float I1wxVal = sumx * coeff;
        float I1wyVal = sumy * coeff;

        I1w[y * I1w_step + x]  = I1wVal;
        I1wx[y * I1w_step + x] = I1wxVal;
        I1wy[y * I1w_step + x] = I1wyVal;

        float Ix2 = I1wxVal * I1wxVal;
        float Iy2 = I1wyVal * I1wyVal;

        // store the |Grad(I1)|^2
        grad[y * I1w_step + x] = Ix2 + Iy2;

        // compute the constant part of the rho function
        float I0Val = I0[y * I0_step + x];
        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
    }
}
/* Clamped read from a row-major float buffer: x is the column index (clamped
 * to [0, cols-1]), y the row index (clamped to [0, rows-1]), elemCntPerRow
 * the row stride in elements. Emulates CLK_ADDRESS_CLAMP_TO_EDGE sampling
 * for the no-image2d fallback path. */
inline float readImage(__global const float *image, int x, int y, int rows, int cols, int elemCntPerRow)
{
    int i0 = clamp(x, 0, cols - 1);
    int j0 = clamp(y, 0, rows - 1);

    return image[j0 * elemCntPerRow + i0];
}
/* Fallback version of warpBackwardKernel for devices without image2d
 * support: warps I1 and its gradients backward by the flow (u1, u2) using
 * bicubic interpolation, with edge clamping done by readImage.
 * Fixed: the readImage calls previously passed (I0_col, I0_row) into the
 * (rows, cols) parameters, so for non-square frames the column index was
 * clamped against the row count and vice versa — clamping against the wrong
 * dimension and risking out-of-bounds reads at the borders. Arguments are
 * now passed in the declared (rows, cols) order. */
__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,
                                          __global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,
                                          __global const float* u1, int u1_step,
                                          __global const float* u2,
                                          __global float* I1w,
                                          __global float* I1wx, /*int I1wx_step,*/
                                          __global float* I1wy, /*int I1wy_step,*/
                                          __global float* grad, /*int grad_step,*/
                                          __global float* rho,
                                          int I1w_step,
                                          int u2_step,
                                          int I1_step,
                                          int I1x_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if(x < I0_col&&y < I0_row)
    {
        //float u1Val = u1(y, x);
        float u1Val = u1[y * u1_step + x];
        //float u2Val = u2(y, x);
        float u2Val = u2[y * u2_step + x];

        /* Source position in I1 and the 4x4 bicubic footprint around it */
        float wx = x + u1Val;
        float wy = y + u2Val;

        int xmin = ceil(wx - 2.0f);
        int xmax = floor(wx + 2.0f);

        int ymin = ceil(wy - 2.0f);
        int ymax = floor(wy + 2.0f);

        float sum  = 0.0f;
        float sumx = 0.0f;
        float sumy = 0.0f;
        float wsum = 0.0f;

        /* Accumulate bicubic-weighted, edge-clamped samples of I1 and its gradients */
        for (int cy = ymin; cy <= ymax; ++cy)
        {
            for (int cx = xmin; cx <= xmax; ++cx)
            {
                float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);

                int2 cood = (int2)(cx, cy);
                sum += w * readImage(tex_I1, cood.x, cood.y, I0_row, I0_col, I1_step);
                sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_row, I0_col, I1x_step);
                sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_row, I0_col, I1x_step);
                wsum += w;
            }
        }

        float coeff = 1.0f / wsum;

        float I1wVal  = sum  * coeff;
        float I1wxVal = sumx * coeff;
        float I1wyVal = sumy * coeff;

        I1w[y * I1w_step + x]  = I1wVal;
        I1wx[y * I1w_step + x] = I1wxVal;
        I1wy[y * I1w_step + x] = I1wyVal;

        float Ix2 = I1wxVal * I1wxVal;
        float Iy2 = I1wyVal * I1wyVal;

        // store the |Grad(I1)|^2
        grad[y * I1w_step + x] = Ix2 + Iy2;

        // compute the constant part of the rho function
        float I0Val = I0[y * I0_step + x];
        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
    }
}
// Updates the dual variables (p11, p12) and (p21, p22) from the forward
// differences of u1 and u2 (one work-item per pixel). u1/u2 may be ROIs, hence
// the explicit (offset_x, offset_y) pairs; the p* buffers are addressed with a
// single stride p11_step — assumes all four p buffers share one stride
// (they are allocated identically by the host — TODO confirm).
__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,
    __global const float* u2,
    __global float* p11, int p11_step,
    __global float* p12,
    __global float* p21,
    __global float* p22,
    float taut,
    int u2_step,
    int u1_offset_x,
    int u1_offset_y,
    int u2_offset_x,
    int u2_offset_y)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if(x < u1_col && y < u1_row)
    {
        // forward differences with replicated last column/row
        int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
        float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
        int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);
        float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
        int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
        float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
        int src_y2 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);
        float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
        // gradient magnitudes of u1 and u2
        float g1 = hypot(u1x, u1y);
        float g2 = hypot(u2x, u2y);
        // reprojection denominators (taut = tau / theta)
        float ng1 = 1.0f + taut * g1;
        float ng2 = 1.0f + taut * g2;
        // gradient-ascent step followed by reprojection onto the unit ball
        p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;
        p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;
        p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;
        p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;
    }
}
// Backward-difference divergence of the dual field (v1, v2) at (y, x).
// At the x == 0 / y == 0 borders the missing neighbor term is dropped
// (homogeneous boundary), matching the discretization of the primal update.
inline float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)
{
    if (x > 0 && y > 0)
    {
        const float dv1 = v1[y * v1_step + x] - v1[y * v1_step + x - 1];
        const float dv2 = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];
        return dv1 + dv2;
    }
    if (y > 0) // x == 0: no left neighbor for v1
        return v1[y * v1_step] + v2[y * v2_step] - v2[(y - 1) * v2_step];
    if (x > 0) // y == 0: no upper neighbor for v2
        return v1[x] - v1[x - 1] + v2[x];
    return v1[0] + v2[0]; // top-left corner
}
// One primal update step: applies the thresholding operator TH to obtain the
// auxiliary variable (v1, v2), then updates the flow (u1, u2) from the
// divergence of the dual variables. Optionally writes the squared per-pixel
// change into `error` for the convergence test. All read-only inputs share
// the stride I1wx_step; u1/u2 may be ROIs (hence the offsets).
__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,
    __global const float* I1wy, /*int I1wy_step,*/
    __global const float* grad, /*int grad_step,*/
    __global const float* rho_c, /*int rho_c_step,*/
    __global const float* p11, /*int p11_step,*/
    __global const float* p12, /*int p12_step,*/
    __global const float* p21, /*int p21_step,*/
    __global const float* p22, /*int p22_step,*/
    __global float* u1, int u1_step,
    __global float* u2,
    __global float* error, float l_t, float theta, int u2_step,
    int u1_offset_x,
    int u1_offset_y,
    int u2_offset_x,
    int u2_offset_y,
    char calc_error)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if(x < I1wx_col && y < I1wx_row)
    {
        float I1wxVal = I1wx[y * I1wx_step + x];
        float I1wyVal = I1wy[y * I1wx_step + x];
        float gradVal = grad[y * I1wx_step + x];
        float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
        float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
        // full residual of the linearized data term at the current flow
        float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
        // estimate the values of the variable (v1, v2) (thresholding operator TH)
        float d1 = 0.0f;
        float d2 = 0.0f;
        if (rho < -l_t * gradVal)
        {
            // residual strongly negative: step along the gradient
            d1 = l_t * I1wxVal;
            d2 = l_t * I1wyVal;
        }
        else if (rho > l_t * gradVal)
        {
            // residual strongly positive: step against the gradient
            d1 = -l_t * I1wxVal;
            d2 = -l_t * I1wyVal;
        }
        else if (gradVal > 1.192092896e-07f) // > FLT_EPSILON: avoid div-by-zero
        {
            // small residual: project it to zero exactly
            float fi = -rho / gradVal;
            d1 = fi * I1wxVal;
            d2 = fi * I1wyVal;
        }
        float v1 = u1OldVal + d1;
        float v2 = u2OldVal + d2;
        // compute the divergence of the dual variable (p1, p2)
        float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);
        float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);
        // estimate the values of the optical flow (u1, u2)
        float u1NewVal = v1 + theta * div_p1;
        float u2NewVal = v2 + theta * div_p2;
        u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;
        u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;
        if(calc_error)
        {
            // squared update magnitude, summed on the host for convergence
            float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
            float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
            error[y * I1wx_step + x] = n1 + n2;
        }
    }
}
......@@ -10,14 +10,10 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Fangfang Bai, fangfang@multicorewareinc.com
// Jin Ma, jin@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
......@@ -31,7 +27,7 @@
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
......@@ -43,69 +39,91 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include<iostream>
#include<fstream>
#include "../perf_precomp.hpp"
#include "opencv2/ts/ocl_perf.hpp"
namespace cv {
#ifdef HAVE_OPENCL
const float FLOW_TAG_FLOAT = 202021.25f;
const char *FLOW_TAG_STRING = "PIEH";
CV_EXPORTS_W Mat readOpticalFlow( const String& path )
{
Mat_<Point2f> flow;
std::ifstream file(path.c_str(), std::ios_base::binary);
if ( !file.good() )
return flow; // no file - return empty matrix
namespace opencv_test {
namespace ocl {
float tag;
file.read((char*) &tag, sizeof(float));
if ( tag != FLOW_TAG_FLOAT )
return flow;
///////////// OpticalFlow Dual TVL1 ////////////////////////
typedef tuple< tuple<int, double>, bool> OpticalFlowDualTVL1Params;
typedef TestBaseWithParam<OpticalFlowDualTVL1Params> OpticalFlowDualTVL1Fixture;
int width, height;
OCL_PERF_TEST_P(OpticalFlowDualTVL1Fixture, OpticalFlowDualTVL1,
::testing::Combine(
::testing::Values(make_tuple<int, double>(-1, 0.3),
make_tuple<int, double>(3, 0.5)),
::testing::Bool()
)
)
{
Mat frame0 = imread(getDataPath("cv/optflow/RubberWhale1.png"), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty()) << "can't load RubberWhale1.png";
file.read((char*) &width, 4);
file.read((char*) &height, 4);
Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale2.png"), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty()) << "can't load RubberWhale2.png";
flow.create(height, width);
const Size srcSize = frame0.size();
for ( int i = 0; i < flow.rows; ++i )
{
for ( int j = 0; j < flow.cols; ++j )
{
Point2f u;
file.read((char*) &u.x, sizeof(float));
file.read((char*) &u.y, sizeof(float));
if ( !file.good() )
{
flow.release();
return flow;
}
flow(i, j) = u;
}
}
file.close();
return flow;
}
const OpticalFlowDualTVL1Params params = GetParam();
const tuple<int, double> filteringScale = get<0>(params);
const int medianFiltering = get<0>(filteringScale);
const double scaleStep = get<1>(filteringScale);
const bool useInitFlow = get<1>(params);
double eps = 0.9;
CV_EXPORTS_W bool writeOpticalFlow( const String& path, InputArray flow )
{
const int nChannels = 2;
UMat uFrame0; frame0.copyTo(uFrame0);
UMat uFrame1; frame1.copyTo(uFrame1);
UMat uFlow(srcSize, CV_32FC2);
declare.in(uFrame0, uFrame1, WARMUP_READ).out(uFlow, WARMUP_READ);
Mat input = flow.getMat();
if ( input.channels() != nChannels || input.depth() != CV_32F || path.length() == 0 )
return false;
//create algorithm
cv::Ptr<cv::DualTVL1OpticalFlow> alg = cv::createOptFlow_DualTVL1();
std::ofstream file(path.c_str(), std::ofstream::binary);
if ( !file.good() )
return false;
//set parameters
alg->setScaleStep(scaleStep);
alg->setMedianFiltering(medianFiltering);
int nRows, nCols;
if (useInitFlow)
{
//calculate initial flow as result of optical flow
alg->calc(uFrame0, uFrame1, uFlow);
}
nRows = (int) input.size().height;
nCols = (int) input.size().width;
//set flag to use initial flow
alg->setUseInitialFlow(useInitFlow);
OCL_TEST_CYCLE()
alg->calc(uFrame0, uFrame1, uFlow);
const int headerSize = 12;
char header[headerSize];
memcpy(header, FLOW_TAG_STRING, 4);
// size of ints is known - has been asserted in the current function
memcpy(header + 4, reinterpret_cast<const char*>(&nCols), sizeof(nCols));
memcpy(header + 8, reinterpret_cast<const char*>(&nRows), sizeof(nRows));
file.write(header, headerSize);
if ( !file.good() )
return false;
SANITY_CHECK(uFlow, eps, ERROR_RELATIVE);
int row;
char* p;
for ( row = 0; row < nRows; row++ )
{
p = input.ptr<char>(row);
file.write(p, nCols * nChannels * sizeof(float));
if ( !file.good() )
return false;
}
file.close();
return true;
}
} // namespace opencv_test::ocl
#endif // HAVE_OPENCL
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
//
// This implementation is based on Javier Sánchez Pérez <jsanchez@dis.ulpgc.es> implementation.
// Original BSD license:
//
// Copyright (c) 2011, Javier Sánchez Pérez, Enric Meinhardt Llopis
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
*/
#include "precomp.hpp"
#include "opencl_kernels_video.hpp"
#include <limits>
#include <iomanip>
#include <iostream>
#include "opencv2/core/opencl/ocl_defs.hpp"
using namespace cv;
namespace {
// Concrete implementation of the "Dual TV-L1" optical flow algorithm
// (Zach et al. 2007 / Sanchez et al. 2012 parameterization). Exposes every
// algorithm parameter through the trivial getters/setters declared by the
// DualTVL1OpticalFlow interface.
class OpticalFlowDual_TVL1 : public DualTVL1OpticalFlow
{
public:
    // Fully parameterized constructor: stores the supplied settings verbatim.
    OpticalFlowDual_TVL1(double tau_, double lambda_, double theta_, int nscales_, int warps_,
        double epsilon_, int innerIterations_, int outerIterations_,
        double scaleStep_, double gamma_, int medianFiltering_,
        bool useInitialFlow_) :
        tau(tau_), lambda(lambda_), theta(theta_), gamma(gamma_), nscales(nscales_),
        warps(warps_), epsilon(epsilon_), innerIterations(innerIterations_),
        outerIterations(outerIterations_), useInitialFlow(useInitialFlow_),
        scaleStep(scaleStep_), medianFiltering(medianFiltering_)
    {
    }
    // Default constructor: defined out of line; installs the default parameter set.
    OpticalFlowDual_TVL1();
    // Computes flow from I0 to I1; may dispatch to the OpenCL path.
    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
    // Releases the internal pyramid/buffer storage.
    void collectGarbage() CV_OVERRIDE;
    // Trivial accessors for the algorithm parameters (see member docs below).
    inline double getTau() const CV_OVERRIDE { return tau; }
    inline void setTau(double val) CV_OVERRIDE { tau = val; }
    inline double getLambda() const CV_OVERRIDE { return lambda; }
    inline void setLambda(double val) CV_OVERRIDE { lambda = val; }
    inline double getTheta() const CV_OVERRIDE { return theta; }
    inline void setTheta(double val) CV_OVERRIDE { theta = val; }
    inline double getGamma() const CV_OVERRIDE { return gamma; }
    inline void setGamma(double val) CV_OVERRIDE { gamma = val; }
    inline int getScalesNumber() const CV_OVERRIDE { return nscales; }
    inline void setScalesNumber(int val) CV_OVERRIDE { nscales = val; }
    inline int getWarpingsNumber() const CV_OVERRIDE { return warps; }
    inline void setWarpingsNumber(int val) CV_OVERRIDE { warps = val; }
    inline double getEpsilon() const CV_OVERRIDE { return epsilon; }
    inline void setEpsilon(double val) CV_OVERRIDE { epsilon = val; }
    inline int getInnerIterations() const CV_OVERRIDE { return innerIterations; }
    inline void setInnerIterations(int val) CV_OVERRIDE { innerIterations = val; }
    inline int getOuterIterations() const CV_OVERRIDE { return outerIterations; }
    inline void setOuterIterations(int val) CV_OVERRIDE { outerIterations = val; }
    inline bool getUseInitialFlow() const CV_OVERRIDE { return useInitialFlow; }
    inline void setUseInitialFlow(bool val) CV_OVERRIDE { useInitialFlow = val; }
    inline double getScaleStep() const CV_OVERRIDE { return scaleStep; }
    inline void setScaleStep(double val) CV_OVERRIDE { scaleStep = val; }
    inline int getMedianFiltering() const CV_OVERRIDE { return medianFiltering; }
    inline void setMedianFiltering(int val) CV_OVERRIDE { medianFiltering = val; }
protected:
    double tau;            // time step of the numerical scheme
    double lambda;         // data-term weight (smoothness trade-off)
    double theta;          // coupling between data and smoothness terms
    double gamma;          // illumination-term weight; 0 disables the u3 channel
    int nscales;           // number of pyramid scales
    int warps;             // warpings per scale
    double epsilon;        // convergence threshold
    int innerIterations;   // inner (primal-dual) iterations per warp
    int outerIterations;   // outer iterations per warp
    bool useInitialFlow;   // seed from the caller-supplied flow
    double scaleStep;      // pyramid downscale factor
    int medianFiltering;   // median filter kernel size (applied to the flow)
private:
    // CPU solver for one pyramid level (u3 is the illumination channel).
    void procOneScale(const Mat_<float>& I0, const Mat_<float>& I1, Mat_<float>& u1, Mat_<float>& u2, Mat_<float>& u3);
#ifdef HAVE_OPENCL
    // OpenCL counterparts; return false on kernel failure so the caller
    // can fall back to the CPU path.
    bool procOneScale_ocl(const UMat& I0, const UMat& I1, UMat& u1, UMat& u2);
    bool calc_ocl(InputArray I0, InputArray I1, InputOutputArray flow);
#endif
    // Scratch buffers for the CPU path, reused across calls to avoid
    // repeated allocation.
    struct dataMat
    {
        std::vector<Mat_<float> > I0s;
        std::vector<Mat_<float> > I1s;
        std::vector<Mat_<float> > u1s;
        std::vector<Mat_<float> > u2s;
        std::vector<Mat_<float> > u3s;
        Mat_<float> I1x_buf;
        Mat_<float> I1y_buf;
        Mat_<float> flowMap1_buf;
        Mat_<float> flowMap2_buf;
        Mat_<float> I1w_buf;
        Mat_<float> I1wx_buf;
        Mat_<float> I1wy_buf;
        Mat_<float> grad_buf;
        Mat_<float> rho_c_buf;
        Mat_<float> v1_buf;
        Mat_<float> v2_buf;
        Mat_<float> v3_buf;
        Mat_<float> p11_buf;
        Mat_<float> p12_buf;
        Mat_<float> p21_buf;
        Mat_<float> p22_buf;
        Mat_<float> p31_buf;
        Mat_<float> p32_buf;
        Mat_<float> div_p1_buf;
        Mat_<float> div_p2_buf;
        Mat_<float> div_p3_buf;
        Mat_<float> u1x_buf;
        Mat_<float> u1y_buf;
        Mat_<float> u2x_buf;
        Mat_<float> u2y_buf;
        Mat_<float> u3x_buf;
        Mat_<float> u3y_buf;
    } dm;
#ifdef HAVE_OPENCL
    // Scratch buffers for the OpenCL path (no u3/illumination channel).
    struct dataUMat
    {
        std::vector<UMat> I0s;
        std::vector<UMat> I1s;
        std::vector<UMat> u1s;
        std::vector<UMat> u2s;
        UMat I1x_buf;
        UMat I1y_buf;
        UMat I1w_buf;
        UMat I1wx_buf;
        UMat I1wy_buf;
        UMat grad_buf;
        UMat rho_c_buf;
        UMat p11_buf;
        UMat p12_buf;
        UMat p21_buf;
        UMat p22_buf;
        UMat diff_buf;
        UMat norm_buf;
    } dum;
#endif
};
#ifdef HAVE_OPENCL
// Host-side wrappers around the TV-L1 OpenCL kernels. Each wrapper creates
// the kernel, binds its arguments and enqueues it; all return false if the
// kernel could not be created or launched.
namespace cv_ocl_tvl1flow
{
    bool centeredGradient(const UMat &src, UMat &dx, UMat &dy);
    bool warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UMat &I1y,
                      UMat &u1, UMat &u2, UMat &I1w, UMat &I1wx, UMat &I1wy,
                      UMat &grad, UMat &rho);
    bool estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
                   UMat &rho_c, UMat &p11, UMat &p12,
                   UMat &p21, UMat &p22, UMat &u1,
                   UMat &u2, UMat &error, float l_t, float theta, char calc_error);
    bool estimateDualVariables(UMat &u1, UMat &u2,
                               UMat &p11, UMat &p12, UMat &p21, UMat &p22, float taut);
}
bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
{
size_t globalsize[2] = { (size_t)src.cols, (size_t)src.rows };
ocl::Kernel kernel;
if (!kernel.create("centeredGradientKernel", cv::ocl::video::optical_flow_tvl1_oclsrc, ""))
return false;
int idxArg = 0;
idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));//src mat
idxArg = kernel.set(idxArg, (int)(src.cols));//src mat col
idxArg = kernel.set(idxArg, (int)(src.rows));//src mat rows
idxArg = kernel.set(idxArg, (int)(src.step / src.elemSize()));//src mat step
idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dx));//res mat dx
idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dy));//res mat dy
idxArg = kernel.set(idxArg, (int)(dx.step/dx.elemSize()));//res mat step
return kernel.run(2, globalsize, NULL, false);
}
// Launches warpBackwardKernel (image2d_t variant): warps I1 and its gradients
// by the current flow (u1, u2) and fills I1w/I1wx/I1wy/grad/rho.
// Returns false if the kernel cannot be created or enqueued.
bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UMat &I1y,
    UMat &u1, UMat &u2, UMat &I1w, UMat &I1wx, UMat &I1wy,
    UMat &grad, UMat &rho)
{
    size_t globalsize[2] = { (size_t)I0.cols, (size_t)I0.rows };
    ocl::Kernel kernel;
    if (!kernel.create("warpBackwardKernel", cv::ocl::video::optical_flow_tvl1_oclsrc, ""))
        return false;
    int idxArg = 0;
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I0));//I0 mat
    int I0_step = (int)(I0.step / I0.elemSize());
    idxArg = kernel.set(idxArg, I0_step);//I0_step
    idxArg = kernel.set(idxArg, (int)(I0.cols));//I0_col
    idxArg = kernel.set(idxArg, (int)(I0.rows));//I0_row
    // wrap I1 and its gradients as images so the kernel can sample them
    ocl::Image2D imageI1(I1);
    ocl::Image2D imageI1x(I1x);
    ocl::Image2D imageI1y(I1y);
    idxArg = kernel.set(idxArg, imageI1);//image2d_t tex_I1
    idxArg = kernel.set(idxArg, imageI1x);//image2d_t tex_I1x
    idxArg = kernel.set(idxArg, imageI1y);//image2d_t tex_I1y
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u1));//const float* u1
    idxArg = kernel.set(idxArg, (int)(u1.step / u1.elemSize()));//int u1_step
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u2));//const float* u2
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1w));///float* I1w
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1wx));//float* I1wx
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1wy));//float* I1wy
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(grad));//float* grad
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(rho));//float* rho
    idxArg = kernel.set(idxArg, (int)(I1w.step / I1w.elemSize()));//I1w_step
    idxArg = kernel.set(idxArg, (int)(u2.step / u2.elemSize()));//u2_step
    // u1/u2 may be ROIs: convert byte offset into (x, y) element offsets
    int u1_offset_x = (int)((u1.offset) % (u1.step));
    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
    idxArg = kernel.set(idxArg, (int)u1_offset_x );//u1_offset_x
    idxArg = kernel.set(idxArg, (int)(u1.offset/u1.step));//u1_offset_y
    int u2_offset_x = (int)((u2.offset) % (u2.step));
    u2_offset_x = (int) (u2_offset_x / u2.elemSize());
    idxArg = kernel.set(idxArg, (int)u2_offset_x);//u2_offset_x
    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step));//u2_offset_y
    return kernel.run(2, globalsize, NULL, false);
}
// Launches estimateUKernel: one primal update of the flow (u1, u2) from the
// warped data terms and dual variables; optionally fills `error` with the
// squared per-pixel change when calc_error is non-zero.
// Returns false if the kernel cannot be created or enqueued.
bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
    UMat &rho_c, UMat &p11, UMat &p12,
    UMat &p21, UMat &p22, UMat &u1,
    UMat &u2, UMat &error, float l_t, float theta, char calc_error)
{
    size_t globalsize[2] = { (size_t)I1wx.cols, (size_t)I1wx.rows };
    ocl::Kernel kernel;
    if (!kernel.create("estimateUKernel", cv::ocl::video::optical_flow_tvl1_oclsrc, ""))
        return false;
    int idxArg = 0;
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I1wx)); //const float* I1wx
    idxArg = kernel.set(idxArg, (int)(I1wx.cols)); //int I1wx_col
    idxArg = kernel.set(idxArg, (int)(I1wx.rows)); //int I1wx_row
    idxArg = kernel.set(idxArg, (int)(I1wx.step/I1wx.elemSize())); //int I1wx_step
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I1wy)); //const float* I1wy
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(grad)); //const float* grad
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(rho_c)); //const float* rho_c
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p11)); //const float* p11
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p12)); //const float* p12
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p21)); //const float* p21
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p22)); //const float* p22
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(u1)); //float* u1
    idxArg = kernel.set(idxArg, (int)(u1.step / u1.elemSize())); //int u1_step
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(u2)); //float* u2
    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(error)); //float* error
    idxArg = kernel.set(idxArg, (float)l_t); //float l_t
    idxArg = kernel.set(idxArg, (float)theta); //float theta
    idxArg = kernel.set(idxArg, (int)(u2.step / u2.elemSize()));//int u2_step
    // u1/u2 may be ROIs: convert byte offset into (x, y) element offsets
    int u1_offset_x = (int)(u1.offset % u1.step);
    u1_offset_x = (int) (u1_offset_x / u1.elemSize());
    idxArg = kernel.set(idxArg, (int)u1_offset_x); //int u1_offset_x
    idxArg = kernel.set(idxArg, (int)(u1.offset/u1.step)); //int u1_offset_y
    int u2_offset_x = (int)(u2.offset % u2.step);
    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
    idxArg = kernel.set(idxArg, (int)u2_offset_x ); //int u2_offset_x
    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
    idxArg = kernel.set(idxArg, (char)calc_error); //char calc_error
    return kernel.run(2, globalsize, NULL, false);
}
// Launches estimateDualVariablesKernel: one ascent + reprojection step on the
// dual variables (p11, p12, p21, p22) from the gradients of u1 and u2.
// Returns false if the kernel cannot be created or enqueued.
bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
    UMat &p11, UMat &p12, UMat &p21, UMat &p22, float taut)
{
    ocl::Kernel k;
    if (!k.create("estimateDualVariablesKernel", cv::ocl::video::optical_flow_tvl1_oclsrc, ""))
        return false;

    // row strides in elements, plus the (x, y) element offsets of the
    // u1/u2 ROIs inside their parent buffers
    const int u1Step  = (int)(u1.step / u1.elemSize());
    const int u2Step  = (int)(u2.step / u2.elemSize());
    const int p11Step = (int)(p11.step / p11.elemSize());
    const int u1OffX  = (int)((u1.offset % u1.step) / u1.elemSize());
    const int u1OffY  = (int)(u1.offset / u1.step);
    const int u2OffX  = (int)((u2.offset % u2.step) / u2.elemSize());
    const int u2OffY  = (int)(u2.offset / u2.step);

    // argument order must match the kernel signature exactly
    int arg = 0;
    arg = k.set(arg, ocl::KernelArg::PtrReadOnly(u1));   // const float* u1
    arg = k.set(arg, (int)u1.cols);                      // int u1_col
    arg = k.set(arg, (int)u1.rows);                      // int u1_row
    arg = k.set(arg, u1Step);                            // int u1_step
    arg = k.set(arg, ocl::KernelArg::PtrReadOnly(u2));   // const float* u2
    arg = k.set(arg, ocl::KernelArg::PtrReadWrite(p11)); // float* p11
    arg = k.set(arg, p11Step);                           // int p11_step
    arg = k.set(arg, ocl::KernelArg::PtrReadWrite(p12)); // float* p12
    arg = k.set(arg, ocl::KernelArg::PtrReadWrite(p21)); // float* p21
    arg = k.set(arg, ocl::KernelArg::PtrReadWrite(p22)); // float* p22
    arg = k.set(arg, taut);                              // float taut
    arg = k.set(arg, u2Step);                            // int u2_step
    arg = k.set(arg, u1OffX);                            // int u1_offset_x
    arg = k.set(arg, u1OffY);                            // int u1_offset_y
    arg = k.set(arg, u2OffX);                            // int u2_offset_x
    arg = k.set(arg, u2OffY);                            // int u2_offset_y

    size_t globalsize[2] = { (size_t)u1.cols, (size_t)u1.rows };
    return k.run(2, globalsize, NULL, false);
}
#endif
// Default constructor: installs the default TV-L1 parameter set.
OpticalFlowDual_TVL1::OpticalFlowDual_TVL1()
{
    // primal-dual scheme parameters
    tau             = 0.25;
    lambda          = 0.15;
    theta           = 0.3;
    gamma           = 0.;    // illumination term disabled by default
    epsilon         = 0.01;  // convergence threshold
    // pyramid / iteration structure
    nscales         = 5;
    warps           = 5;
    innerIterations = 30;
    outerIterations = 10;
    scaleStep       = 0.8;
    // post-processing and initialization
    medianFiltering = 5;
    useInitialFlow  = false;
}
// Computes dense optical flow from _I0 to _I1 into _flow (CV_32FC2).
// Dispatches to the OpenCL path when possible (except on Apple platforms),
// otherwise runs the CPU coarse-to-fine pyramid solver.
void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow)
{
    CV_INSTRUMENT_REGION();
#ifndef __APPLE__
    // OpenCL path requires UMat output and single-channel float image support
    CV_OCL_RUN(_flow.isUMat() &&
               ocl::Image2D::isFormatSupported(CV_32F, 1, false),
               calc_ocl(_I0, _I1, _flow))
#endif
    Mat I0 = _I0.getMat();
    Mat I1 = _I1.getMat();
    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
    CV_Assert( I0.size() == I1.size() );
    CV_Assert( I0.type() == I1.type() );
    CV_Assert( !useInitialFlow || (_flow.size() == I0.size() && _flow.type() == CV_32FC2) );
    CV_Assert( nscales > 0 );
    // gamma != 0 enables the third (illumination) channel u3
    bool use_gamma = gamma != 0;
    // allocate memory for the pyramid structure
    dm.I0s.resize(nscales);
    dm.I1s.resize(nscales);
    dm.u1s.resize(nscales);
    dm.u2s.resize(nscales);
    dm.u3s.resize(nscales);
    // 8-bit input is kept as-is; float input is scaled by 255
    // (presumably to normalize a [0,1] float image to [0,255] — the same
    // intensity range as 8-bit input)
    I0.convertTo(dm.I0s[0], dm.I0s[0].depth(), I0.depth() == CV_8U ? 1.0 : 255.0);
    I1.convertTo(dm.I1s[0], dm.I1s[0].depth(), I1.depth() == CV_8U ? 1.0 : 255.0);
    dm.u1s[0].create(I0.size());
    dm.u2s[0].create(I0.size());
    if (use_gamma) dm.u3s[0].create(I0.size());
    if (useInitialFlow)
    {
        // seed finest level from the caller-provided flow
        Mat_<float> mv[] = { dm.u1s[0], dm.u2s[0] };
        split(_flow.getMat(), mv);
    }
    // scratch buffers at the finest resolution (reused by every level)
    dm.I1x_buf.create(I0.size());
    dm.I1y_buf.create(I0.size());
    dm.flowMap1_buf.create(I0.size());
    dm.flowMap2_buf.create(I0.size());
    dm.I1w_buf.create(I0.size());
    dm.I1wx_buf.create(I0.size());
    dm.I1wy_buf.create(I0.size());
    dm.grad_buf.create(I0.size());
    dm.rho_c_buf.create(I0.size());
    dm.v1_buf.create(I0.size());
    dm.v2_buf.create(I0.size());
    dm.v3_buf.create(I0.size());
    dm.p11_buf.create(I0.size());
    dm.p12_buf.create(I0.size());
    dm.p21_buf.create(I0.size());
    dm.p22_buf.create(I0.size());
    dm.p31_buf.create(I0.size());
    dm.p32_buf.create(I0.size());
    dm.div_p1_buf.create(I0.size());
    dm.div_p2_buf.create(I0.size());
    dm.div_p3_buf.create(I0.size());
    dm.u1x_buf.create(I0.size());
    dm.u1y_buf.create(I0.size());
    dm.u2x_buf.create(I0.size());
    dm.u2y_buf.create(I0.size());
    dm.u3x_buf.create(I0.size());
    dm.u3y_buf.create(I0.size());
    // create the scales
    for (int s = 1; s < nscales; ++s)
    {
        resize(dm.I0s[s - 1], dm.I0s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
        resize(dm.I1s[s - 1], dm.I1s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
        // stop building the pyramid once a level gets too small to solve
        if (dm.I0s[s].cols < 16 || dm.I0s[s].rows < 16)
        {
            nscales = s;
            break;
        }
        if (useInitialFlow)
        {
            // downscale the seed flow and shrink its magnitude accordingly
            resize(dm.u1s[s - 1], dm.u1s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
            resize(dm.u2s[s - 1], dm.u2s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
            multiply(dm.u1s[s], Scalar::all(scaleStep), dm.u1s[s]);
            multiply(dm.u2s[s], Scalar::all(scaleStep), dm.u2s[s]);
        }
        else
        {
            dm.u1s[s].create(dm.I0s[s].size());
            dm.u2s[s].create(dm.I0s[s].size());
        }
        if (use_gamma) dm.u3s[s].create(dm.I0s[s].size());
    }
    if (!useInitialFlow)
    {
        // start from zero flow at the coarsest level
        dm.u1s[nscales - 1].setTo(Scalar::all(0));
        dm.u2s[nscales - 1].setTo(Scalar::all(0));
    }
    if (use_gamma) dm.u3s[nscales - 1].setTo(Scalar::all(0));
    // pyramidal structure for computing the optical flow
    for (int s = nscales - 1; s >= 0; --s)
    {
        // compute the optical flow at the current scale
        procOneScale(dm.I0s[s], dm.I1s[s], dm.u1s[s], dm.u2s[s], dm.u3s[s]);
        // if this was the last scale, finish now
        if (s == 0)
            break;
        // otherwise, upsample the optical flow
        // zoom the optical flow for the next finer scale
        resize(dm.u1s[s], dm.u1s[s - 1], dm.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
        resize(dm.u2s[s], dm.u2s[s - 1], dm.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
        if (use_gamma) resize(dm.u3s[s], dm.u3s[s - 1], dm.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
        // scale the optical flow with the appropriate zoom factor (don't scale u3!)
        multiply(dm.u1s[s - 1], Scalar::all(1 / scaleStep), dm.u1s[s - 1]);
        multiply(dm.u2s[s - 1], Scalar::all(1 / scaleStep), dm.u2s[s - 1]);
    }
    // pack (u, v) into the 2-channel output
    Mat uxy[] = { dm.u1s[0], dm.u2s[0] };
    merge(uxy, 2, _flow);
}
#ifdef HAVE_OPENCL
// OpenCL counterpart of calc(): coarse-to-fine pyramid solve on UMats.
// Returns false if any kernel launch fails, letting the caller fall back
// to the CPU path. Note: the OpenCL path has no illumination (u3) channel.
bool OpticalFlowDual_TVL1::calc_ocl(InputArray _I0, InputArray _I1, InputOutputArray _flow)
{
    UMat I0 = _I0.getUMat();
    UMat I1 = _I1.getUMat();
    CV_Assert(I0.type() == CV_8UC1 || I0.type() == CV_32FC1);
    CV_Assert(I0.size() == I1.size());
    CV_Assert(I0.type() == I1.type());
    CV_Assert(!useInitialFlow || (_flow.size() == I0.size() && _flow.type() == CV_32FC2));
    CV_Assert(nscales > 0);
    // allocate memory for the pyramid structure
    dum.I0s.resize(nscales);
    dum.I1s.resize(nscales);
    dum.u1s.resize(nscales);
    dum.u2s.resize(nscales);
    //I0s_step == I1s_step
    // 8-bit input is kept as-is; float input is scaled by 255 (same
    // normalization as the CPU path)
    double alpha = I0.depth() == CV_8U ? 1.0 : 255.0;
    I0.convertTo(dum.I0s[0], CV_32F, alpha);
    I1.convertTo(dum.I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
    dum.u1s[0].create(I0.size(), CV_32FC1);
    dum.u2s[0].create(I0.size(), CV_32FC1);
    if (useInitialFlow)
    {
        // seed finest level from the caller-provided flow
        // NOTE(review): split() may reallocate the vector's UMats rather than
        // write into dum.u1s[0]/u2s[0] in place — verify the seed actually
        // lands in the pyramid buffers
        std::vector<UMat> umv;
        umv.push_back(dum.u1s[0]);
        umv.push_back(dum.u2s[0]);
        cv::split(_flow,umv);
    }
    // scratch buffers at the finest resolution (reused by every level)
    dum.I1x_buf.create(I0.size(), CV_32FC1);
    dum.I1y_buf.create(I0.size(), CV_32FC1);
    dum.I1w_buf.create(I0.size(), CV_32FC1);
    dum.I1wx_buf.create(I0.size(), CV_32FC1);
    dum.I1wy_buf.create(I0.size(), CV_32FC1);
    dum.grad_buf.create(I0.size(), CV_32FC1);
    dum.rho_c_buf.create(I0.size(), CV_32FC1);
    dum.p11_buf.create(I0.size(), CV_32FC1);
    dum.p12_buf.create(I0.size(), CV_32FC1);
    dum.p21_buf.create(I0.size(), CV_32FC1);
    dum.p22_buf.create(I0.size(), CV_32FC1);
    dum.diff_buf.create(I0.size(), CV_32FC1);
    // create the scales
    for (int s = 1; s < nscales; ++s)
    {
        resize(dum.I0s[s - 1], dum.I0s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
        resize(dum.I1s[s - 1], dum.I1s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
        // stop building the pyramid once a level gets too small to solve
        if (dum.I0s[s].cols < 16 || dum.I0s[s].rows < 16)
        {
            nscales = s;
            break;
        }
        if (useInitialFlow)
        {
            resize(dum.u1s[s - 1], dum.u1s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
            resize(dum.u2s[s - 1], dum.u2s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
            //scale by scale factor
            multiply(dum.u1s[s], Scalar::all(scaleStep), dum.u1s[s]);
            multiply(dum.u2s[s], Scalar::all(scaleStep), dum.u2s[s]);
        }
    }
    // pyramidal structure for computing the optical flow
    for (int s = nscales - 1; s >= 0; --s)
    {
        // compute the optical flow at the current scale
        if (!OpticalFlowDual_TVL1::procOneScale_ocl(dum.I0s[s], dum.I1s[s], dum.u1s[s], dum.u2s[s]))
            return false;
        // if this was the last scale, finish now
        if (s == 0)
            break;
        // zoom the optical flow for the next finer scale
        resize(dum.u1s[s], dum.u1s[s - 1], dum.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
        resize(dum.u2s[s], dum.u2s[s - 1], dum.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
        // scale the optical flow with the appropriate zoom factor
        multiply(dum.u1s[s - 1], Scalar::all(1 / scaleStep), dum.u1s[s - 1]);
        multiply(dum.u2s[s - 1], Scalar::all(1 / scaleStep), dum.u2s[s - 1]);
    }
    // pack (u, v) into the 2-channel output
    std::vector<UMat> uxy;
    uxy.push_back(dum.u1s[0]);
    uxy.push_back(dum.u2s[0]);
    merge(uxy, _flow);
    return true;
}
#endif
////////////////////////////////////////////////////////////
// buildFlowMap
// Parallel body that converts a flow field (u1, u2) into absolute remap
// coordinates (map1, map2) suitable for cv::remap. Rows are split across
// threads by parallel_for_.
struct BuildFlowMapBody : ParallelLoopBody
{
    void operator() (const Range& range) const CV_OVERRIDE;
    Mat_<float> u1;           // horizontal flow component (input)
    Mat_<float> u2;           // vertical flow component (input)
    mutable Mat_<float> map1; // output x-coordinates (mutable: written from const op())
    mutable Mat_<float> map2; // output y-coordinates
};
// For each pixel in the assigned row range, the remap target is the pixel's
// own coordinate displaced by the flow at that pixel.
void BuildFlowMapBody::operator() (const Range& range) const
{
    const int cols = u1.cols;
    for (int y = range.start; y < range.end; ++y)
    {
        const float* flowX = u1[y];
        const float* flowY = u2[y];
        float* dstX = map1[y];
        float* dstY = map2[y];
        for (int x = 0; x < cols; ++x)
        {
            dstX[x] = x + flowX[x];
            dstY[x] = y + flowY[x];
        }
    }
}
void buildFlowMap(const Mat_<float>& u1, const Mat_<float>& u2, Mat_<float>& map1, Mat_<float>& map2)
{
CV_DbgAssert( u2.size() == u1.size() );
CV_DbgAssert( map1.size() == u1.size() );
CV_DbgAssert( map2.size() == u1.size() );
BuildFlowMapBody body;
body.u1 = u1;
body.u2 = u2;
body.map1 = map1;
body.map2 = map2;
parallel_for_(Range(0, u1.rows), body);
}
////////////////////////////////////////////////////////////
// centeredGradient
// Parallel body computing central-difference derivatives of the image
// interior; the serial caller handles the border rows/columns.
struct CenteredGradientBody : ParallelLoopBody
{
    void operator() (const Range& range) const CV_OVERRIDE;

    Mat_<float> src;            // input image plane
    mutable Mat_<float> dx;     // output: d/dx (central differences)
    mutable Mat_<float> dy;     // output: d/dy (central differences)
};
void CenteredGradientBody::operator() (const Range& range) const
{
    // Central differences over the image interior only; the serial caller
    // fills in the first/last rows and columns with one-sided differences.
    const int lastCol = src.cols - 1;
    for (int row = range.start; row < range.end; ++row)
    {
        const float* above = src[row - 1];
        const float* cur   = src[row];
        const float* below = src[row + 1];
        float* gx = dx[row];
        float* gy = dy[row];
        for (int col = 1; col < lastCol; ++col)
        {
            gx[col] = 0.5f * (cur[col + 1] - cur[col - 1]);
            gy[col] = 0.5f * (below[col] - above[col]);
        }
    }
}
// Compute the centered (central-difference) gradient of src into dx/dy.
// The interior is processed in parallel; borders fall back to one-sided
// differences so dx/dy have the same size as src.
void centeredGradient(const Mat_<float>& src, Mat_<float>& dx, Mat_<float>& dy)
{
    CV_DbgAssert( src.rows > 2 && src.cols > 2 );
    CV_DbgAssert( dx.size() == src.size() );
    CV_DbgAssert( dy.size() == src.size() );
    const int last_row = src.rows - 1;
    const int last_col = src.cols - 1;
    // compute the gradient on the center body of the image
    {
        CenteredGradientBody body;
        body.src = src;
        body.dx = dx;
        body.dy = dy;
        parallel_for_(Range(1, last_row), body);
    }
    // compute the gradient on the first and last rows
    // (vertical derivative degrades to a one-sided difference there)
    for (int x = 1; x < last_col; ++x)
    {
        dx(0, x) = 0.5f * (src(0, x + 1) - src(0, x - 1));
        dy(0, x) = 0.5f * (src(1, x) - src(0, x));
        dx(last_row, x) = 0.5f * (src(last_row, x + 1) - src(last_row, x - 1));
        dy(last_row, x) = 0.5f * (src(last_row, x) - src(last_row - 1, x));
    }
    // compute the gradient on the first and last columns
    // (horizontal derivative degrades to a one-sided difference there)
    for (int y = 1; y < last_row; ++y)
    {
        dx(y, 0) = 0.5f * (src(y, 1) - src(y, 0));
        dy(y, 0) = 0.5f * (src(y + 1, 0) - src(y - 1, 0));
        dx(y, last_col) = 0.5f * (src(y, last_col) - src(y, last_col - 1));
        dy(y, last_col) = 0.5f * (src(y + 1, last_col) - src(y - 1, last_col));
    }
    // compute the gradient at the four corners (both derivatives one-sided)
    dx(0, 0) = 0.5f * (src(0, 1) - src(0, 0));
    dy(0, 0) = 0.5f * (src(1, 0) - src(0, 0));
    dx(0, last_col) = 0.5f * (src(0, last_col) - src(0, last_col - 1));
    dy(0, last_col) = 0.5f * (src(1, last_col) - src(0, last_col));
    dx(last_row, 0) = 0.5f * (src(last_row, 1) - src(last_row, 0));
    dy(last_row, 0) = 0.5f * (src(last_row, 0) - src(last_row - 1, 0));
    dx(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row, last_col - 1));
    dy(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row - 1, last_col));
}
////////////////////////////////////////////////////////////
// forwardGradient
// Parallel body computing forward-difference derivatives; the serial caller
// zero-fills the last row/column where no forward neighbor exists.
struct ForwardGradientBody : ParallelLoopBody
{
    void operator() (const Range& range) const CV_OVERRIDE;

    Mat_<float> src;            // input image plane
    mutable Mat_<float> dx;     // output: src(x+1,y) - src(x,y)
    mutable Mat_<float> dy;     // output: src(x,y+1) - src(x,y)
};
void ForwardGradientBody::operator() (const Range& range) const
{
    // Forward differences: dx = src(x+1) - src(x), dy = src(y+1) - src(y).
    // The last row and column are handled serially by the caller.
    const int lastCol = src.cols - 1;
    for (int row = range.start; row < range.end; ++row)
    {
        const float* cur  = src[row];
        const float* next = src[row + 1];
        float* gx = dx[row];
        float* gy = dy[row];
        for (int col = 0; col < lastCol; ++col)
        {
            gx[col] = cur[col + 1] - cur[col];
            gy[col] = next[col] - cur[col];
        }
    }
}
void forwardGradient(const Mat_<float>& src, Mat_<float>& dx, Mat_<float>& dy)
{
CV_DbgAssert( src.rows > 2 && src.cols > 2 );
CV_DbgAssert( dx.size() == src.size() );
CV_DbgAssert( dy.size() == src.size() );
const int last_row = src.rows - 1;
const int last_col = src.cols - 1;
// compute the gradient on the central body of the image
{
ForwardGradientBody body;
body.src = src;
body.dx = dx;
body.dy = dy;
parallel_for_(Range(0, last_row), body);
}
// compute the gradient on the last row
for (int x = 0; x < last_col; ++x)
{
dx(last_row, x) = src(last_row, x + 1) - src(last_row, x);
dy(last_row, x) = 0.0f;
}
// compute the gradient on the last column
for (int y = 0; y < last_row; ++y)
{
dx(y, last_col) = 0.0f;
dy(y, last_col) = src(y + 1, last_col) - src(y, last_col);
}
dx(last_row, last_col) = 0.0f;
dy(last_row, last_col) = 0.0f;
}
////////////////////////////////////////////////////////////
// divergence
// Parallel body computing the backward-difference divergence of the dual
// field (v1, v2); the serial caller fills the first row and column.
struct DivergenceBody : ParallelLoopBody
{
    void operator() (const Range& range) const CV_OVERRIDE;

    Mat_<float> v1;             // first component of the dual field
    Mat_<float> v2;             // second component of the dual field
    mutable Mat_<float> div;    // output: div(v1, v2)
};
void DivergenceBody::operator() (const Range& range) const
{
    // Backward-difference divergence for rows >= 1 and columns >= 1; the
    // caller handles the first row and the first column serially.
    const int width = v1.cols;
    for (int row = range.start; row < range.end; ++row)
    {
        const float* v1Cur = v1[row];
        const float* v2Up  = v2[row - 1];
        const float* v2Cur = v2[row];
        float* out = div[row];
        for (int col = 1; col < width; ++col)
        {
            const float dvx = v1Cur[col] - v1Cur[col - 1];
            const float dvy = v2Cur[col] - v2Up[col];
            out[col] = dvx + dvy;
        }
    }
}
// Compute the (backward-difference) divergence of the dual field (v1, v2)
// into div. The interior runs in parallel; the first row/column use the
// adjoint boundary conditions of the forward gradient.
void divergence(const Mat_<float>& v1, const Mat_<float>& v2, Mat_<float>& div)
{
    CV_DbgAssert( v1.rows > 2 && v1.cols > 2 );
    CV_DbgAssert( v2.size() == v1.size() );
    CV_DbgAssert( div.size() == v1.size() );
    {
        DivergenceBody body;
        body.v1 = v1;
        body.v2 = v2;
        body.div = div;
        parallel_for_(Range(1, v1.rows), body);
    }
    // compute the divergence on the first row (no row above: v2 term is v2 itself)
    for(int x = 1; x < v1.cols; ++x)
        div(0, x) = v1(0, x) - v1(0, x - 1) + v2(0, x);
    // compute the divergence on the first column (no column to the left)
    for (int y = 1; y < v1.rows; ++y)
        div(y, 0) = v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
    // top-left corner: both backward neighbors are missing
    div(0, 0) = v1(0, 0) + v2(0, 0);
}
////////////////////////////////////////////////////////////
// calcGradRho
// Parallel body computing, per pixel, the squared gradient magnitude of the
// warped image and the constant part of the data-fidelity term rho.
struct CalcGradRhoBody : ParallelLoopBody
{
    void operator() (const Range& range) const CV_OVERRIDE;

    Mat_<float> I0;             // reference frame
    Mat_<float> I1w;            // second frame warped by the current flow
    Mat_<float> I1wx;           // warped horizontal derivative of I1
    Mat_<float> I1wy;           // warped vertical derivative of I1
    Mat_<float> u1;             // current flow, horizontal component
    Mat_<float> u2;             // current flow, vertical component
    mutable Mat_<float> grad;   // output: |grad I1w|^2
    mutable Mat_<float> rho_c;  // output: constant part of rho(u)
};
void CalcGradRhoBody::operator() (const Range& range) const
{
    const int width = I0.cols;
    for (int row = range.start; row < range.end; ++row)
    {
        const float* refRow  = I0[row];
        const float* warpRow = I1w[row];
        const float* gxRow   = I1wx[row];
        const float* gyRow   = I1wy[row];
        const float* flowX   = u1[row];
        const float* flowY   = u2[row];
        float* gradOut = grad[row];
        float* rhoOut  = rho_c[row];
        for (int col = 0; col < width; ++col)
        {
            const float gx = gxRow[col];
            const float gy = gyRow[col];
            // |grad I1w|^2 = Ix^2 + Iy^2
            gradOut[col] = gx * gx + gy * gy;
            // constant part of the rho function (data-fidelity residual)
            rhoOut[col] = warpRow[col] - gx * flowX[col] - gy * flowY[col] - refRow[col];
        }
    }
}
// Fill grad with |grad I1w|^2 and rho_c with the constant part of the
// data-fidelity term rho, given the reference frame I0, the warped second
// frame I1w, its warped derivatives I1wx/I1wy, and the current flow (u1, u2).
void calcGradRho(const Mat_<float>& I0, const Mat_<float>& I1w, const Mat_<float>& I1wx, const Mat_<float>& I1wy, const Mat_<float>& u1, const Mat_<float>& u2,
    Mat_<float>& grad, Mat_<float>& rho_c)
{
    CV_DbgAssert( I1w.size() == I0.size() );
    CV_DbgAssert( I1wx.size() == I0.size() );
    CV_DbgAssert( I1wy.size() == I0.size() );
    CV_DbgAssert( u1.size() == I0.size() );
    CV_DbgAssert( u2.size() == I0.size() );
    CV_DbgAssert( grad.size() == I0.size() );
    CV_DbgAssert( rho_c.size() == I0.size() );
    CalcGradRhoBody body;
    body.I0 = I0;
    body.I1w = I1w;
    body.I1wx = I1wx;
    body.I1wy = I1wy;
    body.u1 = u1;
    body.u2 = u2;
    body.grad = grad;
    body.rho_c = rho_c;
    parallel_for_(Range(0, I0.rows), body);
}
////////////////////////////////////////////////////////////
// estimateV
// Parallel body applying the TV-L1 thresholding operator TH to obtain the
// auxiliary variables (v1, v2) — and (v3) when the gamma term is active.
struct EstimateVBody : ParallelLoopBody
{
    void operator() (const Range& range) const CV_OVERRIDE;

    Mat_<float> I1wx;           // warped horizontal derivative of I1
    Mat_<float> I1wy;           // warped vertical derivative of I1
    Mat_<float> u1;             // current flow, horizontal component
    Mat_<float> u2;             // current flow, vertical component
    Mat_<float> u3;             // gamma (illumination) channel; valid only when gamma != 0
    Mat_<float> grad;           // |grad I1w|^2 per pixel
    Mat_<float> rho_c;          // constant part of rho(u)
    mutable Mat_<float> v1;     // output auxiliary variable (horizontal)
    mutable Mat_<float> v2;     // output auxiliary variable (vertical)
    mutable Mat_<float> v3;     // output auxiliary variable (gamma); valid only when gamma != 0
    float l_t;                  // lambda * theta
    float gamma;                // weight of the illumination term (0 disables it)
};
// Apply the pointwise thresholding operator TH: depending on where rho(u)
// falls relative to +/- l_t * |grad I1w|^2, step by +/- l_t along the gradient
// or project onto the zero level set of rho.
void EstimateVBody::operator() (const Range& range) const
{
    bool use_gamma = gamma != 0;
    for (int y = range.start; y < range.end; ++y)
    {
        const float* I1wxRow = I1wx[y];
        const float* I1wyRow = I1wy[y];
        const float* u1Row = u1[y];
        const float* u2Row = u2[y];
        const float* u3Row = use_gamma?u3[y]:NULL;
        const float* gradRow = grad[y];
        const float* rhoRow = rho_c[y];
        float* v1Row = v1[y];
        float* v2Row = v2[y];
        float* v3Row = use_gamma ? v3[y]:NULL;
        for (int x = 0; x < I1wx.cols; ++x)
        {
            // full residual rho(u) = rho_c + <grad I1w, u> (+ gamma*u3 if enabled)
            const float rho = use_gamma ? rhoRow[x] + (I1wxRow[x] * u1Row[x] + I1wyRow[x] * u2Row[x]) + gamma * u3Row[x] :
                                          rhoRow[x] + (I1wxRow[x] * u1Row[x] + I1wyRow[x] * u2Row[x]);
            float d1 = 0.0f;
            float d2 = 0.0f;
            float d3 = 0.0f;
            if (rho < -l_t * gradRow[x])
            {
                // residual strongly negative: step +l_t along the gradient
                d1 = l_t * I1wxRow[x];
                d2 = l_t * I1wyRow[x];
                if (use_gamma) d3 = l_t * gamma;
            }
            else if (rho > l_t * gradRow[x])
            {
                // residual strongly positive: step -l_t along the gradient
                d1 = -l_t * I1wxRow[x];
                d2 = -l_t * I1wyRow[x];
                if (use_gamma) d3 = -l_t * gamma;
            }
            else if (gradRow[x] > std::numeric_limits<float>::epsilon())
            {
                // small residual: project onto the zero level set of rho
                float fi = -rho / gradRow[x];
                d1 = fi * I1wxRow[x];
                d2 = fi * I1wyRow[x];
                if (use_gamma) d3 = fi * gamma;
            }
            // (if grad is ~0 the step stays zero and v == u)
            v1Row[x] = u1Row[x] + d1;
            v2Row[x] = u2Row[x] + d2;
            if (use_gamma) v3Row[x] = u3Row[x] + d3;
        }
    }
}
// Estimate the auxiliary variables (v1, v2[, v3]) with the TV-L1 thresholding
// operator TH (see EstimateVBody). The u3/v3 planes are used only when
// gamma != 0, in which case they must match the common size as well.
void estimateV(const Mat_<float>& I1wx, const Mat_<float>& I1wy, const Mat_<float>& u1, const Mat_<float>& u2, const Mat_<float>& u3, const Mat_<float>& grad, const Mat_<float>& rho_c,
    Mat_<float>& v1, Mat_<float>& v2, Mat_<float>& v3, float l_t, float gamma)
{
    CV_DbgAssert( I1wy.size() == I1wx.size() );
    CV_DbgAssert( u1.size() == I1wx.size() );
    CV_DbgAssert( u2.size() == I1wx.size() );
    CV_DbgAssert( grad.size() == I1wx.size() );
    CV_DbgAssert( rho_c.size() == I1wx.size() );
    CV_DbgAssert( v1.size() == I1wx.size() );
    CV_DbgAssert( v2.size() == I1wx.size() );

    bool use_gamma = gamma != 0;
    // The gamma planes are read/written by the body only when gamma != 0;
    // validate them in that case for consistency with the other planes.
    if (use_gamma)
    {
        CV_DbgAssert( u3.size() == I1wx.size() );
        CV_DbgAssert( v3.size() == I1wx.size() );
    }

    EstimateVBody body;
    body.I1wx = I1wx;
    body.I1wy = I1wy;
    body.u1 = u1;
    body.u2 = u2;
    if (use_gamma) body.u3 = u3;
    body.grad = grad;
    body.rho_c = rho_c;
    body.v1 = v1;
    body.v2 = v2;
    if (use_gamma) body.v3 = v3;
    body.l_t = l_t;
    body.gamma = gamma;
    parallel_for_(Range(0, I1wx.rows), body);
}
////////////////////////////////////////////////////////////
// estimateU
// Update the flow estimate u from the auxiliary variables v and the
// divergences of the dual variables: u = v + theta * div(p).
// Returns the sum of squared changes in u, used as the convergence criterion.
// The *3 planes participate only when gamma != 0.
float estimateU(const Mat_<float>& v1, const Mat_<float>& v2, const Mat_<float>& v3,
    const Mat_<float>& div_p1, const Mat_<float>& div_p2, const Mat_<float>& div_p3,
    Mat_<float>& u1, Mat_<float>& u2, Mat_<float>& u3,
    float theta, float gamma)
{
    CV_DbgAssert( v2.size() == v1.size() );
    CV_DbgAssert( div_p1.size() == v1.size() );
    CV_DbgAssert( div_p2.size() == v1.size() );
    CV_DbgAssert( u1.size() == v1.size() );
    CV_DbgAssert( u2.size() == v1.size() );
    float error = 0.0f;
    bool use_gamma = gamma != 0;
    for (int y = 0; y < v1.rows; ++y)
    {
        const float* v1Row = v1[y];
        const float* v2Row = v2[y];
        const float* v3Row = use_gamma?v3[y]:NULL;
        const float* divP1Row = div_p1[y];
        const float* divP2Row = div_p2[y];
        const float* divP3Row = use_gamma?div_p3[y]:NULL;
        float* u1Row = u1[y];
        float* u2Row = u2[y];
        float* u3Row = use_gamma?u3[y]:NULL;
        for (int x = 0; x < v1.cols; ++x)
        {
            // remember the previous values to accumulate the squared change
            const float u1k = u1Row[x];
            const float u2k = u2Row[x];
            const float u3k = use_gamma?u3Row[x]:0;
            u1Row[x] = v1Row[x] + theta * divP1Row[x];
            u2Row[x] = v2Row[x] + theta * divP2Row[x];
            if (use_gamma) u3Row[x] = v3Row[x] + theta * divP3Row[x];
            error += use_gamma?(u1Row[x] - u1k) * (u1Row[x] - u1k) + (u2Row[x] - u2k) * (u2Row[x] - u2k) + (u3Row[x] - u3k) * (u3Row[x] - u3k):
                               (u1Row[x] - u1k) * (u1Row[x] - u1k) + (u2Row[x] - u2k) * (u2Row[x] - u2k);
        }
    }
    return error;
}
////////////////////////////////////////////////////////////
// estimateDualVariables
// Parallel body updating the dual variables p from the flow gradients
// (projected gradient ascent followed by reprojection onto the unit ball).
struct EstimateDualVariablesBody : ParallelLoopBody
{
    void operator() (const Range& range) const CV_OVERRIDE;

    Mat_<float> u1x;            // gradient of u1 (x component)
    Mat_<float> u1y;            // gradient of u1 (y component)
    Mat_<float> u2x;            // gradient of u2 (x component)
    Mat_<float> u2y;            // gradient of u2 (y component)
    Mat_<float> u3x;            // gradient of u3 (x component); used only when use_gamma
    Mat_<float> u3y;            // gradient of u3 (y component); used only when use_gamma
    mutable Mat_<float> p11;    // dual variable for u1, x component
    mutable Mat_<float> p12;    // dual variable for u1, y component
    mutable Mat_<float> p21;    // dual variable for u2, x component
    mutable Mat_<float> p22;    // dual variable for u2, y component
    mutable Mat_<float> p31;    // dual variable for u3, x component (gamma term)
    mutable Mat_<float> p32;    // dual variable for u3, y component (gamma term)
    float taut;                 // tau / theta
    bool use_gamma;             // whether the gamma (illumination) term is active
};
// Per-pixel dual update: p <- (p + taut * grad(u)) / (1 + taut * |grad(u)|),
// which is a gradient-ascent step followed by reprojection.
void EstimateDualVariablesBody::operator() (const Range& range) const
{
    for (int y = range.start; y < range.end; ++y)
    {
        const float* u1xRow = u1x[y];
        const float* u1yRow = u1y[y];
        const float* u2xRow = u2x[y];
        const float* u2yRow = u2y[y];
        // NOTE: these rows are fetched unconditionally; the caller asserts the
        // u3x/u3y buffers always have the right size, so this is safe even
        // when use_gamma is false (they are only dereferenced below when true).
        const float* u3xRow = u3x[y];
        const float* u3yRow = u3y[y];
        float* p11Row = p11[y];
        float* p12Row = p12[y];
        float* p21Row = p21[y];
        float* p22Row = p22[y];
        float* p31Row = p31[y];
        float* p32Row = p32[y];
        for (int x = 0; x < u1x.cols; ++x)
        {
            // magnitudes of the flow gradients
            const float g1 = static_cast<float>(hypot(u1xRow[x], u1yRow[x]));
            const float g2 = static_cast<float>(hypot(u2xRow[x], u2yRow[x]));
            const float ng1 = 1.0f + taut * g1;
            const float ng2 = 1.0f + taut * g2;
            p11Row[x] = (p11Row[x] + taut * u1xRow[x]) / ng1;
            p12Row[x] = (p12Row[x] + taut * u1yRow[x]) / ng1;
            p21Row[x] = (p21Row[x] + taut * u2xRow[x]) / ng2;
            p22Row[x] = (p22Row[x] + taut * u2yRow[x]) / ng2;
            if (use_gamma)
            {
                const float g3 = static_cast<float>(hypot(u3xRow[x], u3yRow[x]));
                const float ng3 = 1.0f + taut * g3;
                p31Row[x] = (p31Row[x] + taut * u3xRow[x]) / ng3;
                p32Row[x] = (p32Row[x] + taut * u3yRow[x]) / ng3;
            }
        }
    }
}
// Update all dual variables p from the flow gradients (u1x..u3y).
// All buffers must share the same size; the p31/p32 planes are only modified
// when use_gamma is true.
void estimateDualVariables(const Mat_<float>& u1x, const Mat_<float>& u1y,
    const Mat_<float>& u2x, const Mat_<float>& u2y,
    const Mat_<float>& u3x, const Mat_<float>& u3y,
    Mat_<float>& p11, Mat_<float>& p12,
    Mat_<float>& p21, Mat_<float>& p22,
    Mat_<float>& p31, Mat_<float>& p32,
    float taut, bool use_gamma)
{
    CV_DbgAssert( u1y.size() == u1x.size() );
    CV_DbgAssert( u2x.size() == u1x.size() );
    CV_DbgAssert( u3x.size() == u1x.size() );
    CV_DbgAssert( u2y.size() == u1x.size() );
    CV_DbgAssert( u3y.size() == u1x.size() );
    CV_DbgAssert( p11.size() == u1x.size() );
    CV_DbgAssert( p12.size() == u1x.size() );
    CV_DbgAssert( p21.size() == u1x.size() );
    CV_DbgAssert( p22.size() == u1x.size() );
    CV_DbgAssert( p31.size() == u1x.size() );
    CV_DbgAssert( p32.size() == u1x.size() );
    EstimateDualVariablesBody body;
    body.u1x = u1x;
    body.u1y = u1y;
    body.u2x = u2x;
    body.u2y = u2y;
    body.u3x = u3x;
    body.u3y = u3y;
    body.p11 = p11;
    body.p12 = p12;
    body.p21 = p21;
    body.p22 = p22;
    body.p31 = p31;
    body.p32 = p32;
    body.taut = taut;
    body.use_gamma = use_gamma;
    parallel_for_(Range(0, u1x.rows), body);
}
#ifdef HAVE_OPENCL
// OpenCL variant of procOneScale: run the TV-L1 warping/iteration loop for a
// single pyramid level on UMats. Returns false if any OpenCL kernel fails,
// letting the caller fall back to the CPU path.
bool OpticalFlowDual_TVL1::procOneScale_ocl(const UMat& I0, const UMat& I1, UMat& u1, UMat& u2)
{
    using namespace cv_ocl_tvl1flow;
    // convergence threshold, scaled by the number of pixels
    const double scaledEpsilon = epsilon * epsilon * I0.size().area();
    CV_DbgAssert(I1.size() == I0.size());
    CV_DbgAssert(I1.type() == I0.type());
    CV_DbgAssert(u1.empty() || u1.size() == I0.size());
    CV_DbgAssert(u2.size() == u1.size());
    if (u1.empty())
    {
        // start from a zero flow field when no initial flow is provided
        u1.create(I0.size(), CV_32FC1);
        u1.setTo(Scalar::all(0));
        u2.create(I0.size(), CV_32FC1);
        u2.setTo(Scalar::all(0));
    }
    // all working images are views into preallocated buffers in dum
    UMat I1x = dum.I1x_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat I1y = dum.I1y_buf(Rect(0, 0, I0.cols, I0.rows));
    if (!centeredGradient(I1, I1x, I1y))
        return false;
    UMat I1w = dum.I1w_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat I1wx = dum.I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat I1wy = dum.I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat grad = dum.grad_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat rho_c = dum.rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat p11 = dum.p11_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat p12 = dum.p12_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat p21 = dum.p21_buf(Rect(0, 0, I0.cols, I0.rows));
    UMat p22 = dum.p22_buf(Rect(0, 0, I0.cols, I0.rows));
    // dual variables start at zero at every scale
    p11.setTo(Scalar::all(0));
    p12.setTo(Scalar::all(0));
    p21.setTo(Scalar::all(0));
    p22.setTo(Scalar::all(0));
    UMat diff = dum.diff_buf(Rect(0, 0, I0.cols, I0.rows));
    const float l_t = static_cast<float>(lambda * theta);
    const float taut = static_cast<float>(tau / theta);
    int n;
    for (int warpings = 0; warpings < warps; ++warpings)
    {
        // warp I1 (and its derivatives) by the current flow estimate
        if (!warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c))
            return false;
        double error = std::numeric_limits<double>::max();
        double prev_error = 0;
        for (int n_outer = 0; error > scaledEpsilon && n_outer < outerIterations; ++n_outer)
        {
            if (medianFiltering > 1) {
                cv::medianBlur(u1, u1, medianFiltering);
                cv::medianBlur(u2, u2, medianFiltering);
            }
            for (int n_inner = 0; error > scaledEpsilon && n_inner < innerIterations; ++n_inner)
            {
                // a tweak to compute the (expensive) error sum less frequently:
                // only on every other iteration, and only once the previous
                // error was already close to the threshold
                n = n_inner + n_outer*innerIterations;
                char calc_error = (n & 0x1) && (prev_error < scaledEpsilon);
                if (!estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22,
                               u1, u2, diff, l_t, static_cast<float>(theta), calc_error))
                    return false;
                if (calc_error)
                {
                    error = cv::sum(diff)[0];
                    prev_error = error;
                }
                else
                {
                    // keep iterating; pretend the error is still large
                    error = std::numeric_limits<double>::max();
                    prev_error -= scaledEpsilon;
                }
                if (!estimateDualVariables(u1, u2, p11, p12, p21, p22, taut))
                    return false;
            }
        }
    }
    return true;
}
#endif
// CPU implementation of one pyramid level of the TV-L1 method: alternate
// between warping I1 by the current flow and running the primal-dual
// iterations until convergence (or the iteration limits are reached).
// u3 is the optional illumination channel, used only when gamma != 0.
void OpticalFlowDual_TVL1::procOneScale(const Mat_<float>& I0, const Mat_<float>& I1, Mat_<float>& u1, Mat_<float>& u2, Mat_<float>& u3)
{
    // convergence threshold, scaled by the number of pixels
    const float scaledEpsilon = static_cast<float>(epsilon * epsilon * I0.size().area());
    CV_DbgAssert( I1.size() == I0.size() );
    CV_DbgAssert( I1.type() == I0.type() );
    CV_DbgAssert( u1.size() == I0.size() );
    CV_DbgAssert( u2.size() == u1.size() );
    // all working images are views into preallocated buffers in dm
    Mat_<float> I1x = dm.I1x_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> I1y = dm.I1y_buf(Rect(0, 0, I0.cols, I0.rows));
    centeredGradient(I1, I1x, I1y);
    Mat_<float> flowMap1 = dm.flowMap1_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> flowMap2 = dm.flowMap2_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> I1w = dm.I1w_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> I1wx = dm.I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> I1wy = dm.I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> grad = dm.grad_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> rho_c = dm.rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> v1 = dm.v1_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> v2 = dm.v2_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> v3 = dm.v3_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> p11 = dm.p11_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> p12 = dm.p12_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> p21 = dm.p21_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> p22 = dm.p22_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> p31 = dm.p31_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> p32 = dm.p32_buf(Rect(0, 0, I0.cols, I0.rows));
    // dual variables start at zero at every scale
    p11.setTo(Scalar::all(0));
    p12.setTo(Scalar::all(0));
    p21.setTo(Scalar::all(0));
    p22.setTo(Scalar::all(0));
    bool use_gamma = gamma != 0.;
    if (use_gamma) p31.setTo(Scalar::all(0));
    if (use_gamma) p32.setTo(Scalar::all(0));
    Mat_<float> div_p1 = dm.div_p1_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> div_p2 = dm.div_p2_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> div_p3 = dm.div_p3_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> u1x = dm.u1x_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> u1y = dm.u1y_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> u2x = dm.u2x_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> u2y = dm.u2y_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> u3x = dm.u3x_buf(Rect(0, 0, I0.cols, I0.rows));
    Mat_<float> u3y = dm.u3y_buf(Rect(0, 0, I0.cols, I0.rows));
    const float l_t = static_cast<float>(lambda * theta);
    const float taut = static_cast<float>(tau / theta);
    for (int warpings = 0; warpings < warps; ++warpings)
    {
        // compute the warping of the target image and its derivatives
        buildFlowMap(u1, u2, flowMap1, flowMap2);
        remap(I1, I1w, flowMap1, flowMap2, INTER_CUBIC);
        remap(I1x, I1wx, flowMap1, flowMap2, INTER_CUBIC);
        remap(I1y, I1wy, flowMap1, flowMap2, INTER_CUBIC);
        //calculate I1(x+u0) and its gradient
        calcGradRho(I0, I1w, I1wx, I1wy, u1, u2, grad, rho_c);
        float error = std::numeric_limits<float>::max();
        for (int n_outer = 0; error > scaledEpsilon && n_outer < outerIterations; ++n_outer)
        {
            if (medianFiltering > 1) {
                cv::medianBlur(u1, u1, medianFiltering);
                cv::medianBlur(u2, u2, medianFiltering);
            }
            for (int n_inner = 0; error > scaledEpsilon && n_inner < innerIterations; ++n_inner)
            {
                // estimate the values of the variable (v1, v2) (thresholding operator TH)
                estimateV(I1wx, I1wy, u1, u2, u3, grad, rho_c, v1, v2, v3, l_t, static_cast<float>(gamma));
                // compute the divergence of the dual variable (p1, p2, p3)
                divergence(p11, p12, div_p1);
                divergence(p21, p22, div_p2);
                if (use_gamma) divergence(p31, p32, div_p3);
                // estimate the values of the optical flow (u1, u2)
                error = estimateU(v1, v2, v3, div_p1, div_p2, div_p3, u1, u2, u3, static_cast<float>(theta), static_cast<float>(gamma));
                // compute the gradient of the optical flow (Du1, Du2)
                forwardGradient(u1, u1x, u1y);
                forwardGradient(u2, u2x, u2y);
                if (use_gamma) forwardGradient(u3, u3x, u3y);
                // estimate the values of the dual variable (p1, p2, p3)
                estimateDualVariables(u1x, u1y, u2x, u2y, u3x, u3y, p11, p12, p21, p22, p31, p32, taut, use_gamma);
            }
        }
    }
}
// Release all internal buffers so repeated calls on large frames do not keep
// peak memory alive between invocations.
void OpticalFlowDual_TVL1::collectGarbage()
{
    //dataMat structure dm
    dm.I0s.clear();
    dm.I1s.clear();
    dm.u1s.clear();
    dm.u2s.clear();
    dm.I1x_buf.release();
    dm.I1y_buf.release();
    dm.flowMap1_buf.release();
    dm.flowMap2_buf.release();
    dm.I1w_buf.release();
    dm.I1wx_buf.release();
    dm.I1wy_buf.release();
    dm.grad_buf.release();
    dm.rho_c_buf.release();
    dm.v1_buf.release();
    dm.v2_buf.release();
    dm.p11_buf.release();
    dm.p12_buf.release();
    dm.p21_buf.release();
    dm.p22_buf.release();
    dm.div_p1_buf.release();
    dm.div_p2_buf.release();
    dm.u1x_buf.release();
    dm.u1y_buf.release();
    dm.u2x_buf.release();
    dm.u2y_buf.release();
    // gamma-term buffers used by procOneScale when gamma != 0; these were
    // previously never released here (memory kept alive after collectGarbage)
    dm.v3_buf.release();
    dm.p31_buf.release();
    dm.p32_buf.release();
    dm.div_p3_buf.release();
    dm.u3x_buf.release();
    dm.u3y_buf.release();
#ifdef HAVE_OPENCL
    //dataUMat structure dum
    dum.I0s.clear();
    dum.I1s.clear();
    dum.u1s.clear();
    dum.u2s.clear();
    dum.I1x_buf.release();
    dum.I1y_buf.release();
    dum.I1w_buf.release();
    dum.I1wx_buf.release();
    dum.I1wy_buf.release();
    dum.grad_buf.release();
    dum.rho_c_buf.release();
    dum.p11_buf.release();
    dum.p12_buf.release();
    dum.p21_buf.release();
    dum.p22_buf.release();
    dum.diff_buf.release();
    dum.norm_buf.release();
#endif
}
} // namespace
// Factory: construct a TV-L1 optical flow solver with default parameters.
Ptr<DualTVL1OpticalFlow> cv::createOptFlow_DualTVL1()
{
    Ptr<DualTVL1OpticalFlow> solver = makePtr<OpticalFlowDual_TVL1>();
    return solver;
}
// Factory: construct a TV-L1 optical flow solver, forwarding every tuning
// parameter straight to the implementation class.
Ptr<DualTVL1OpticalFlow> cv::DualTVL1OpticalFlow::create(
    double tau, double lambda, double theta, int nscales, int warps,
    double epsilon, int innerIterations, int outerIterations, double scaleStep,
    double gamma, int medianFilter, bool useInitialFlow)
{
    return makePtr<OpticalFlowDual_TVL1>(
        tau, lambda, theta, nscales, warps, epsilon,
        innerIterations, outerIterations, scaleStep,
        gamma, medianFilter, useInitialFlow);
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencv2/core/hal/intrin.hpp"
using namespace std;
namespace cv
{
// Implementation of the variational refinement step (used e.g. after DIS
// optical flow): refines an initial flow field by minimizing a data +
// smoothness energy with fixed-point iterations and Red-Black SOR.
class VariationalRefinementImpl CV_FINAL : public VariationalRefinement
{
  public:
    VariationalRefinementImpl();

    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
    void calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v) CV_OVERRIDE;
    void collectGarbage() CV_OVERRIDE;

  protected: //!< algorithm parameters
    int fixedPointIterations, sorIterations;
    float omega;                 //!< SOR relaxation factor
    float alpha, delta, gamma;   //!< smoothness, color and gradient term weights
    float zeta, epsilon;         //!< robust-penalty regularization constants

  public:
    int getFixedPointIterations() const CV_OVERRIDE { return fixedPointIterations; }
    void setFixedPointIterations(int val) CV_OVERRIDE { fixedPointIterations = val; }
    int getSorIterations() const CV_OVERRIDE { return sorIterations; }
    void setSorIterations(int val) CV_OVERRIDE { sorIterations = val; }
    float getOmega() const CV_OVERRIDE { return omega; }
    void setOmega(float val) CV_OVERRIDE { omega = val; }
    float getAlpha() const CV_OVERRIDE { return alpha; }
    void setAlpha(float val) CV_OVERRIDE { alpha = val; }
    float getDelta() const CV_OVERRIDE { return delta; }
    void setDelta(float val) CV_OVERRIDE { delta = val; }
    float getGamma() const CV_OVERRIDE { return gamma; }
    void setGamma(float val) CV_OVERRIDE { gamma = val; }

  protected: //!< internal buffers
    /* This struct defines a special data layout for Mat_<float>. Original buffer is split into two: one for "red"
     * elements (sum of indices is even) and one for "black" (sum of indices is odd) in a checkerboard pattern. It
     * allows for more efficient processing in SOR iterations, more natural SIMD vectorization and parallelization
     * (Red-Black SOR). Additionally, it simplifies border handling by adding repeated borders to both red and
     * black buffers.
     */
    struct RedBlackBuffer
    {
        Mat_<float> red;   //!< (i+j)%2==0
        Mat_<float> black; //!< (i+j)%2==1

        /* Width of even and odd rows may be different */
        int red_even_len, red_odd_len;
        int black_even_len, black_odd_len;

        void create(Size s);
        void release();
    };

    Mat_<float> Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz;                            //!< image derivative buffers
    RedBlackBuffer Ix_rb, Iy_rb, Iz_rb, Ixx_rb, Ixy_rb, Iyy_rb, Ixz_rb, Iyz_rb; //!< corresponding red-black buffers

    RedBlackBuffer A11, A12, A22, b1, b2; //!< main linear system coefficients
    RedBlackBuffer weights;               //!< smoothness term weights in the current fixed point iteration

    Mat_<float> mapX, mapY;           //!< auxiliary buffers for remapping
    RedBlackBuffer tempW_u, tempW_v;  //!< flow buffers that are modified in each fixed point iteration
    RedBlackBuffer dW_u, dW_v;        //!< optical flow increment
    RedBlackBuffer W_u_rb, W_v_rb;    //!< red-black-buffer version of the input flow

  private: //!< private methods and parallel sections
    void splitCheckerboard(RedBlackBuffer &dst, Mat &src);
    void mergeCheckerboard(Mat &dst, RedBlackBuffer &src);
    void updateRepeatedBorders(RedBlackBuffer &dst);
    void warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &flow_v);
    void prepareBuffers(Mat &I0, Mat &I1, Mat &W_u, Mat &W_v);

    /* Parallelizing arbitrary operations with 3 input/output arguments */
    typedef void (VariationalRefinementImpl::*Op)(void *op1, void *op2, void *op3);
    struct ParallelOp_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        vector<Op> ops;
        vector<void *> op1s;
        vector<void *> op2s;
        vector<void *> op3s;

        ParallelOp_ParBody(VariationalRefinementImpl &_var, vector<Op> _ops, vector<void *> &_op1s,
                           vector<void *> &_op2s, vector<void *> &_op3s);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

    // horizontal Sobel derivative followed by checkerboard split
    void gradHorizAndSplitOp(void *src, void *dst, void *dst_split)
    {
        Sobel(*(Mat *)src, *(Mat *)dst, -1, 1, 0, 1, 1, 0.00, BORDER_REPLICATE);
        splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst);
    }
    // vertical Sobel derivative followed by checkerboard split
    void gradVertAndSplitOp(void *src, void *dst, void *dst_split)
    {
        Sobel(*(Mat *)src, *(Mat *)dst, -1, 0, 1, 1, 1, 0.00, BORDER_REPLICATE);
        splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst);
    }
    // dst = (src1 + src2) / 2
    void averageOp(void *src1, void *src2, void *dst)
    {
        addWeighted(*(Mat *)src1, 0.5, *(Mat *)src2, 0.5, 0.0, *(Mat *)dst, CV_32F);
    }
    // dst = src1 - src2
    void subtractOp(void *src1, void *src2, void *dst)
    {
        subtract(*(Mat *)src1, *(Mat *)src2, *(Mat *)dst, noArray(), CV_32F);
    }

    struct ComputeDataTerm_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        int nstripes, stripe_sz;
        int h;
        RedBlackBuffer *dW_u, *dW_v;
        bool red_pass;

        ComputeDataTerm_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_dW_u,
                                RedBlackBuffer &_dW_v, bool _red_pass);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

    struct ComputeSmoothnessTermHorPass_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        int nstripes, stripe_sz;
        int h;
        RedBlackBuffer *W_u, *W_v, *curW_u, *curW_v;
        bool red_pass;

        ComputeSmoothnessTermHorPass_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h,
                                             RedBlackBuffer &_W_u, RedBlackBuffer &_W_v, RedBlackBuffer &_tempW_u,
                                             RedBlackBuffer &_tempW_v, bool _red_pass);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

    struct ComputeSmoothnessTermVertPass_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        int nstripes, stripe_sz;
        int h;
        RedBlackBuffer *W_u, *W_v;
        bool red_pass;

        ComputeSmoothnessTermVertPass_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h,
                                              RedBlackBuffer &W_u, RedBlackBuffer &_W_v, bool _red_pass);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

    struct RedBlackSOR_ParBody : public ParallelLoopBody
    {
        VariationalRefinementImpl *var;
        int nstripes, stripe_sz;
        int h;
        RedBlackBuffer *dW_u, *dW_v;
        bool red_pass;

        RedBlackSOR_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_dW_u,
                            RedBlackBuffer &_dW_v, bool _red_pass);
        void operator()(const Range &range) const CV_OVERRIDE;
    };
};
VariationalRefinementImpl::VariationalRefinementImpl()
{
    // Default tuning parameters for the variational refinement.
    fixedPointIterations = 5;   // outer fixed-point iterations
    sorIterations = 5;          // SOR sweeps per fixed-point iteration
    omega = 1.6f;               // SOR relaxation factor
    alpha = 20.0f;              // smoothness term weight
    delta = 5.0f;               // color constancy term weight
    gamma = 10.0f;              // gradient constancy term weight
    zeta = 0.1f;                // regularization constant
    epsilon = 0.001f;           // robust penalty constant
}
/* This function converts an input Mat into the RedBlackBuffer format, which involves
* splitting the input buffer into two and adding repeated borders. Assumes that enough
* memory in dst is already allocated.
*/
// Split src into the red/black checkerboard layout of dst, adding one pixel
// of repeated border on every side. dst must already be allocated (via
// RedBlackBuffer::create). Row i of src maps to row i+1 of dst.red/dst.black;
// column 0 and column buf_w-1 of each buffer row hold repeated border values.
void VariationalRefinementImpl::splitCheckerboard(RedBlackBuffer &dst, Mat &src)
{
    int buf_j, j;
    int buf_w = (int)ceil(src.cols / 2.0) + 2; //!< max width of red/black buffers with borders

    /* Rows of red and black buffers can have different actual width, some extra repeated values are
     * added for padding in such cases.
     */
    for (int i = 0; i < src.rows; i++)
    {
        float *src_buf = src.ptr<float>(i);
        float *r_buf = dst.red.ptr<float>(i + 1);
        float *b_buf = dst.black.ptr<float>(i + 1);
        // left border: repeat the first source value into both buffers
        r_buf[0] = b_buf[0] = src_buf[0];
        buf_j = 1;
        if (i % 2 == 0)
        {
            // even source row: columns 0,2,4,... are red; 1,3,5,... are black
            for (j = 0; j < src.cols - 1; j += 2)
            {
                r_buf[buf_j] = src_buf[j];
                b_buf[buf_j] = src_buf[j + 1];
                buf_j++;
            }
            if (j < src.cols)
                // odd width: the trailing column goes to red, duplicated into
                // black as padding so both rows have the same length
                r_buf[buf_j] = b_buf[buf_j] = src_buf[j];
            else
                j--;
        }
        else
        {
            // odd source row: the red/black roles of the columns swap
            for (j = 0; j < src.cols - 1; j += 2)
            {
                b_buf[buf_j] = src_buf[j];
                r_buf[buf_j] = src_buf[j + 1];
                buf_j++;
            }
            if (j < src.cols)
                r_buf[buf_j] = b_buf[buf_j] = src_buf[j];
            else
                j--;
        }
        // right border: repeat the last consumed source value
        r_buf[buf_w - 1] = b_buf[buf_w - 1] = src_buf[j];
    }

    /* Fill top and bottom borders: */
    // note the red/black swap: the row adjacent to the border has the
    // opposite parity, so its red values border the black buffer and vice versa
    {
        float *r_buf_border = dst.red.ptr<float>(dst.red.rows - 1);
        float *b_buf_border = dst.black.ptr<float>(dst.black.rows - 1);
        float *r_buf = dst.red.ptr<float>(dst.red.rows - 2);
        float *b_buf = dst.black.ptr<float>(dst.black.rows - 2);
        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
    }
    {
        float *r_buf_border = dst.red.ptr<float>(0);
        float *b_buf_border = dst.black.ptr<float>(0);
        float *r_buf = dst.red.ptr<float>(1);
        float *b_buf = dst.black.ptr<float>(1);
        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
    }
}
/* The inverse of splitCheckerboard: interleaves the red and black half-buffers back
 * into a single Mat, discarding the border padding. Assumes that enough memory in dst
 * is already allocated.
 */
void VariationalRefinementImpl::mergeCheckerboard(Mat &dst, RedBlackBuffer &src)
{
    for (int row = 0; row < dst.rows; row++)
    {
        float *red_row = src.red.ptr<float>(row + 1);   /* +1 skips the top border row */
        float *black_row = src.black.ptr<float>(row + 1);
        /* On even rows red elements occupy even columns; on odd rows the colors swap. */
        float *first = (row % 2 == 0) ? red_row : black_row;
        float *second = (row % 2 == 0) ? black_row : red_row;
        float *out = dst.ptr<float>(row);
        int src_pos = 1; /* skip the left border element */
        int col = 0;
        while (col < dst.cols - 1)
        {
            out[col] = first[src_pos];
            out[col + 1] = second[src_pos];
            col += 2;
            src_pos++;
        }
        if (col < dst.cols) /* odd width: one trailing element left */
            out[col] = first[src_pos];
    }
}
/* An auxiliary function that updates the borders. Used to enforce that border values repeat
 * the ones adjacent to the border.
 */
void VariationalRefinementImpl::updateRepeatedBorders(RedBlackBuffer &dst)
{
    int buf_w = dst.red.cols;
    for (int i = 0; i < dst.red.rows - 2; i++)
    {
        float *r_buf = dst.red.ptr<float>(i + 1);
        float *b_buf = dst.black.ptr<float>(i + 1);
        if (i % 2 == 0)
        {
            /* Even rows start with a red element, so the black row's left border
             * repeats the first red element. The right border is refreshed only in
             * the half whose row is shorter (possible for odd-width images). */
            b_buf[0] = r_buf[1];
            if (dst.red_even_len > dst.black_even_len)
                b_buf[dst.black_even_len + 1] = r_buf[dst.red_even_len];
            else
                r_buf[dst.red_even_len + 1] = b_buf[dst.black_even_len];
        }
        else
        {
            /* On odd rows the roles of red and black are swapped. */
            r_buf[0] = b_buf[1];
            if (dst.red_odd_len < dst.black_odd_len)
                r_buf[dst.red_odd_len + 1] = b_buf[dst.black_odd_len];
            else
                b_buf[dst.black_odd_len + 1] = r_buf[dst.red_odd_len];
        }
    }
    {
        /* Bottom border row: copy from the adjacent interior row, swapping red/black
         * because the checkerboard parity flips between adjacent rows. */
        float *r_buf_border = dst.red.ptr<float>(dst.red.rows - 1);
        float *b_buf_border = dst.black.ptr<float>(dst.black.rows - 1);
        float *r_buf = dst.red.ptr<float>(dst.red.rows - 2);
        float *b_buf = dst.black.ptr<float>(dst.black.rows - 2);
        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
    }
    {
        /* Top border row: same swap, copying from the first interior row. */
        float *r_buf_border = dst.red.ptr<float>(0);
        float *b_buf_border = dst.black.ptr<float>(0);
        float *r_buf = dst.red.ptr<float>(1);
        float *b_buf = dst.black.ptr<float>(1);
        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
    }
}
/* Allocates the two checkerboard halves for an image of size s, including a
 * one-element border on every side, and records the valid row lengths.
 */
void VariationalRefinementImpl::RedBlackBuffer::create(Size s)
{
    /* Allocate enough memory to include borders */
    const int w = (int)ceil(s.width / 2.0) + 2;
    red.create(s.height + 2, w);
    black.create(s.height + 2, w);
    const int full_len = w - 2;
    const int short_len = (s.width % 2 == 0) ? full_len : w - 3;
    /* For odd widths, rows that start with the "other" color hold one element less. */
    red_even_len = full_len;
    black_odd_len = full_len;
    red_odd_len = short_len;
    black_even_len = short_len;
}
/* Frees both checkerboard halves. */
void VariationalRefinementImpl::RedBlackBuffer::release()
{
    black.release();
    red.release();
}
/* Binds a list of member-function operations with their argument triples for parallel
 * execution: ops[i] will be invoked with (op1s[i], op2s[i], op3s[i]).
 */
VariationalRefinementImpl::ParallelOp_ParBody::ParallelOp_ParBody(VariationalRefinementImpl &_var, vector<Op> _ops,
                                                                  vector<void *> &_op1s, vector<void *> &_op2s,
                                                                  vector<void *> &_op3s)
    : var(&_var), ops(_ops), op1s(_op1s), op2s(_op2s), op3s(_op3s)
{
}
/* Executes each scheduled member-function op in [range.start, range.end) with its
 * three bound arguments. */
void VariationalRefinementImpl::ParallelOp_ParBody::operator()(const Range &range) const
{
    for (int idx = range.start; idx < range.end; idx++)
    {
        const Op op = ops[idx];
        (var->*op)(op1s[idx], op2s[idx], op3s[idx]);
    }
}
/* Warps src towards dst with the given flow field: builds absolute sampling maps
 * (pixel coordinate + flow) in the member buffers mapX/mapY and remaps with
 * bilinear interpolation, replicating the border.
 */
void VariationalRefinementImpl::warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &flow_v)
{
    const int rows = flow_u.rows, cols = flow_u.cols;
    for (int y = 0; y < rows; y++)
    {
        const float *u_row = flow_u.ptr<float>(y);
        const float *v_row = flow_v.ptr<float>(y);
        float *map_x_row = mapX.ptr<float>(y);
        float *map_y_row = mapY.ptr<float>(y);
        for (int x = 0; x < cols; x++)
        {
            map_x_row[x] = x + u_row[x];
            map_y_row[x] = y + v_row[x];
        }
    }
    remap(src, dst, mapX, mapY, INTER_LINEAR, BORDER_REPLICATE);
}
/* Allocates all intermediate buffers for the current frame size and precomputes the
 * spatial and temporal image derivatives (plus their red-black-split copies) that the
 * data term needs. I1 is first warped towards I0 with the current flow (W_u, W_v).
 * Derivative computations are dispatched as batches of member-function ops via
 * ParallelOp_ParBody: ops[i] reads op1s[i] and writes op2s[i]/op3s[i].
 */
void VariationalRefinementImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &W_u, Mat &W_v)
{
    Size s = I0.size();
    /* Linear system coefficients and smoothness weights: */
    A11.create(s);
    A12.create(s);
    A22.create(s);
    b1.create(s);
    b2.create(s);
    weights.create(s);
    weights.red.setTo(0.0f);
    weights.black.setTo(0.0f);
    /* Flow buffers (current estimate, increment, red-black copies): */
    tempW_u.create(s);
    tempW_v.create(s);
    dW_u.create(s);
    dW_v.create(s);
    W_u_rb.create(s);
    W_v_rb.create(s);
    /* Image derivatives and their red-black-split versions: */
    Ix.create(s);
    Iy.create(s);
    Iz.create(s);
    Ixx.create(s);
    Ixy.create(s);
    Iyy.create(s);
    Ixz.create(s);
    Iyz.create(s);
    Ix_rb.create(s);
    Iy_rb.create(s);
    Iz_rb.create(s);
    Ixx_rb.create(s);
    Ixy_rb.create(s);
    Iyy_rb.create(s);
    Ixz_rb.create(s);
    Iyz_rb.create(s);
    mapX.create(s);
    mapY.create(s);
    /* Floating point warps work significantly better than fixed-point */
    Mat I1flt, warpedI;
    I1.convertTo(I1flt, CV_32F);
    warpImage(warpedI, I1flt, W_u, W_v);
    /* Computing an average of the current and warped next frames (to compute the derivatives on) and
     * temporal derivative Iz
     */
    Mat averagedI;
    {
        /* averagedI = avg(I0, warpedI);  Iz = warpedI - I0 */
        vector<void *> op1s;
        op1s.push_back((void *)&I0);
        op1s.push_back((void *)&warpedI);
        vector<void *> op2s;
        op2s.push_back((void *)&warpedI);
        op2s.push_back((void *)&I0);
        vector<void *> op3s;
        op3s.push_back((void *)&averagedI);
        op3s.push_back((void *)&Iz);
        vector<Op> ops;
        ops.push_back(&VariationalRefinementImpl::averageOp);
        ops.push_back(&VariationalRefinementImpl::subtractOp);
        parallel_for_(Range(0, 2), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
    }
    splitCheckerboard(Iz_rb, Iz);
    /* Computing first-order derivatives */
    {
        /* Ix, Iy from averagedI; Ixz, Iyz from Iz. Each op also writes the
         * red-black-split version (third argument). */
        vector<void *> op1s;
        op1s.push_back((void *)&averagedI);
        op1s.push_back((void *)&averagedI);
        op1s.push_back((void *)&Iz);
        op1s.push_back((void *)&Iz);
        vector<void *> op2s;
        op2s.push_back((void *)&Ix);
        op2s.push_back((void *)&Iy);
        op2s.push_back((void *)&Ixz);
        op2s.push_back((void *)&Iyz);
        vector<void *> op3s;
        op3s.push_back((void *)&Ix_rb);
        op3s.push_back((void *)&Iy_rb);
        op3s.push_back((void *)&Ixz_rb);
        op3s.push_back((void *)&Iyz_rb);
        vector<Op> ops;
        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
        parallel_for_(Range(0, 4), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
    }
    /* Computing second-order derivatives */
    {
        /* Ixx, Ixy from Ix; Iyy from Iy, again with split copies. */
        vector<void *> op1s;
        op1s.push_back((void *)&Ix);
        op1s.push_back((void *)&Ix);
        op1s.push_back((void *)&Iy);
        vector<void *> op2s;
        op2s.push_back((void *)&Ixx);
        op2s.push_back((void *)&Ixy);
        op2s.push_back((void *)&Iyy);
        vector<void *> op3s;
        op3s.push_back((void *)&Ixx_rb);
        op3s.push_back((void *)&Ixy_rb);
        op3s.push_back((void *)&Iyy_rb);
        vector<Op> ops;
        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
        parallel_for_(Range(0, 3), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
    }
}
/* Parallel body for the data-term computation over one checkerboard color;
 * _h is the number of image rows, _red_pass selects the red or black half. */
VariationalRefinementImpl::ComputeDataTerm_ParBody::ComputeDataTerm_ParBody(VariationalRefinementImpl &_var,
                                                                            int _nstripes, int _h,
                                                                            RedBlackBuffer &_dW_u,
                                                                            RedBlackBuffer &_dW_v, bool _red_pass)
    : var(&_var), nstripes(_nstripes), h(_h), dW_u(&_dW_u), dW_v(&_dW_v), red_pass(_red_pass)
{
    /* Rows handled by one parallel job (the last stripe may be smaller). */
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function computes parts of the main linear system coefficients A11,A12,A22,b1,b1
 * that correspond to the data term, which includes color and gradient constancy assumptions.
 */
void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range &range) const
{
    /* Each job covers a horizontal stripe of rows of one color (red or black). */
    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);
    float zeta_squared = var->zeta * var->zeta;
    float epsilon_squared = var->epsilon * var->epsilon;
    float gamma2 = var->gamma / 2;
    float delta2 = var->delta / 2;
    float *pIx, *pIy, *pIz;
    float *pIxx, *pIxy, *pIyy, *pIxz, *pIyz;
    float *pdU, *pdV;
    float *pa11, *pa12, *pa22, *pb1, *pb2;
    float derivNorm, derivNorm2;
    float Ik1z, Ik1zx, Ik1zy;
    float weight;
    int len;
    for (int i = start_i; i < end_i; i++)
    {
/* Sets up row pointers into one half-buffer; "+1" offsets skip the top border row
 * and the left border element. len is the number of valid elements in the row,
 * which depends on the row parity for odd-width images. */
#define INIT_ROW_POINTERS(color) \
    pIx = var->Ix_rb.color.ptr<float>(i + 1) + 1; \
    pIy = var->Iy_rb.color.ptr<float>(i + 1) + 1; \
    pIz = var->Iz_rb.color.ptr<float>(i + 1) + 1; \
    pIxx = var->Ixx_rb.color.ptr<float>(i + 1) + 1; \
    pIxy = var->Ixy_rb.color.ptr<float>(i + 1) + 1; \
    pIyy = var->Iyy_rb.color.ptr<float>(i + 1) + 1; \
    pIxz = var->Ixz_rb.color.ptr<float>(i + 1) + 1; \
    pIyz = var->Iyz_rb.color.ptr<float>(i + 1) + 1; \
    pa11 = var->A11.color.ptr<float>(i + 1) + 1; \
    pa12 = var->A12.color.ptr<float>(i + 1) + 1; \
    pa22 = var->A22.color.ptr<float>(i + 1) + 1; \
    pb1 = var->b1.color.ptr<float>(i + 1) + 1; \
    pb2 = var->b2.color.ptr<float>(i + 1) + 1; \
    pdU = dW_u->color.ptr<float>(i + 1) + 1; \
    pdV = dW_v->color.ptr<float>(i + 1) + 1; \
    if (i % 2 == 0) \
        len = var->Ix_rb.color##_even_len; \
    else \
        len = var->Ix_rb.color##_odd_len;
        if (red_pass)
        {
            INIT_ROW_POINTERS(red);
        }
        else
        {
            INIT_ROW_POINTERS(black);
        }
#undef INIT_ROW_POINTERS
        int j = 0;
#ifdef CV_SIMD128
        /* Vectorized version of the scalar loop below: processes four elements at a time. */
        v_float32x4 zeta_vec = v_setall_f32(zeta_squared);
        v_float32x4 eps_vec = v_setall_f32(epsilon_squared);
        v_float32x4 delta_vec = v_setall_f32(delta2);
        v_float32x4 gamma_vec = v_setall_f32(gamma2);
        v_float32x4 zero_vec = v_setall_f32(0.0f);
        v_float32x4 pIx_vec, pIy_vec, pIz_vec, pdU_vec, pdV_vec;
        v_float32x4 pIxx_vec, pIxy_vec, pIyy_vec, pIxz_vec, pIyz_vec;
        v_float32x4 derivNorm_vec, derivNorm2_vec, weight_vec;
        v_float32x4 Ik1z_vec, Ik1zx_vec, Ik1zy_vec;
        v_float32x4 pa11_vec, pa12_vec, pa22_vec, pb1_vec, pb2_vec;
        for (; j < len - 3; j += 4)
        {
            pIx_vec = v_load(pIx + j);
            pIy_vec = v_load(pIy + j);
            pIz_vec = v_load(pIz + j);
            pdU_vec = v_load(pdU + j);
            pdV_vec = v_load(pdV + j);
            derivNorm_vec = pIx_vec * pIx_vec + pIy_vec * pIy_vec + zeta_vec;
            Ik1z_vec = pIz_vec + pIx_vec * pdU_vec + pIy_vec * pdV_vec;
            weight_vec = (delta_vec / v_sqrt(Ik1z_vec * Ik1z_vec / derivNorm_vec + eps_vec)) / derivNorm_vec;
            pa11_vec = weight_vec * (pIx_vec * pIx_vec) + zeta_vec;
            pa12_vec = weight_vec * (pIx_vec * pIy_vec);
            pa22_vec = weight_vec * (pIy_vec * pIy_vec) + zeta_vec;
            pb1_vec = zero_vec - weight_vec * (pIz_vec * pIx_vec);
            pb2_vec = zero_vec - weight_vec * (pIz_vec * pIy_vec);
            pIxx_vec = v_load(pIxx + j);
            pIxy_vec = v_load(pIxy + j);
            pIyy_vec = v_load(pIyy + j);
            pIxz_vec = v_load(pIxz + j);
            pIyz_vec = v_load(pIyz + j);
            derivNorm_vec = pIxx_vec * pIxx_vec + pIxy_vec * pIxy_vec + zeta_vec;
            derivNorm2_vec = pIyy_vec * pIyy_vec + pIxy_vec * pIxy_vec + zeta_vec;
            Ik1zx_vec = pIxz_vec + pIxx_vec * pdU_vec + pIxy_vec * pdV_vec;
            Ik1zy_vec = pIyz_vec + pIxy_vec * pdU_vec + pIyy_vec * pdV_vec;
            weight_vec = gamma_vec / v_sqrt(Ik1zx_vec * Ik1zx_vec / derivNorm_vec +
                                            Ik1zy_vec * Ik1zy_vec / derivNorm2_vec + eps_vec);
            pa11_vec += weight_vec * (pIxx_vec * pIxx_vec / derivNorm_vec + pIxy_vec * pIxy_vec / derivNorm2_vec);
            pa12_vec += weight_vec * (pIxx_vec * pIxy_vec / derivNorm_vec + pIxy_vec * pIyy_vec / derivNorm2_vec);
            pa22_vec += weight_vec * (pIxy_vec * pIxy_vec / derivNorm_vec + pIyy_vec * pIyy_vec / derivNorm2_vec);
            pb1_vec -= weight_vec * (pIxx_vec * pIxz_vec / derivNorm_vec + pIxy_vec * pIyz_vec / derivNorm2_vec);
            pb2_vec -= weight_vec * (pIxy_vec * pIxz_vec / derivNorm_vec + pIyy_vec * pIyz_vec / derivNorm2_vec);
            v_store(pa11 + j, pa11_vec);
            v_store(pa12 + j, pa12_vec);
            v_store(pa22 + j, pa22_vec);
            v_store(pb1 + j, pb1_vec);
            v_store(pb2 + j, pb2_vec);
        }
#endif
        /* Scalar tail (the whole row when SIMD is unavailable): */
        for (; j < len; j++)
        {
            /* Step 1: Compute color constancy terms */
            /* Normalization factor:*/
            derivNorm = pIx[j] * pIx[j] + pIy[j] * pIy[j] + zeta_squared;
            /* Color constancy penalty (computed by Taylor expansion):*/
            Ik1z = pIz[j] + pIx[j] * pdU[j] + pIy[j] * pdV[j];
            /* Weight of the color constancy term in the current fixed-point iteration divided by derivNorm: */
            weight = (delta2 / sqrt(Ik1z * Ik1z / derivNorm + epsilon_squared)) / derivNorm;
            /* Add respective color constancy terms to the linear system coefficients: */
            pa11[j] = weight * (pIx[j] * pIx[j]) + zeta_squared;
            pa12[j] = weight * (pIx[j] * pIy[j]);
            pa22[j] = weight * (pIy[j] * pIy[j]) + zeta_squared;
            pb1[j] = -weight * (pIz[j] * pIx[j]);
            pb2[j] = -weight * (pIz[j] * pIy[j]);
            /* Step 2: Compute gradient constancy terms */
            /* Normalization factor for x gradient: */
            derivNorm = pIxx[j] * pIxx[j] + pIxy[j] * pIxy[j] + zeta_squared;
            /* Normalization factor for y gradient: */
            derivNorm2 = pIyy[j] * pIyy[j] + pIxy[j] * pIxy[j] + zeta_squared;
            /* Gradient constancy penalties (computed by Taylor expansion): */
            Ik1zx = pIxz[j] + pIxx[j] * pdU[j] + pIxy[j] * pdV[j];
            Ik1zy = pIyz[j] + pIxy[j] * pdU[j] + pIyy[j] * pdV[j];
            /* Weight of the gradient constancy term in the current fixed-point iteration: */
            weight = gamma2 / sqrt(Ik1zx * Ik1zx / derivNorm + Ik1zy * Ik1zy / derivNorm2 + epsilon_squared);
            /* Add respective gradient constancy components to the linear system coefficients: */
            pa11[j] += weight * (pIxx[j] * pIxx[j] / derivNorm + pIxy[j] * pIxy[j] / derivNorm2);
            pa12[j] += weight * (pIxx[j] * pIxy[j] / derivNorm + pIxy[j] * pIyy[j] / derivNorm2);
            pa22[j] += weight * (pIxy[j] * pIxy[j] / derivNorm + pIyy[j] * pIyy[j] / derivNorm2);
            pb1[j] += -weight * (pIxx[j] * pIxz[j] / derivNorm + pIxy[j] * pIyz[j] / derivNorm2);
            pb2[j] += -weight * (pIxy[j] * pIxz[j] / derivNorm + pIyy[j] * pIyz[j] / derivNorm2);
        }
    }
}
/* Parallel body for the horizontal smoothness-term pass; _W_u/_W_v hold the initial
 * flow, _tempW_u/_tempW_v the flow of the current fixed-point iteration. */
VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::ComputeSmoothnessTermHorPass_ParBody(
  VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_W_u, RedBlackBuffer &_W_v,
  RedBlackBuffer &_tempW_u, RedBlackBuffer &_tempW_v, bool _red_pass)
    : var(&_var), nstripes(_nstripes), h(_h), W_u(&_W_u), W_v(&_W_v), curW_u(&_tempW_u), curW_v(&_tempW_v),
      red_pass(_red_pass)
{
    /* Rows handled by one parallel job (the last stripe may be smaller). */
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function updates the linear system coefficients A11,A22,b1,b1 according to the
 * flow smoothness term and computes corresponding weights for the current fixed point iteration.
 * A11,A22,b1,b1 are updated only partially (horizontal pass). Doing both horizontal and vertical
 * passes in one loop complicates parallelization (different threads write to the same elements).
 */
void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator()(const Range &range) const
{
    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);
    float epsilon_squared = var->epsilon * var->epsilon;
    float alpha2 = var->alpha / 2;
    float *pWeight;
    float *pA_u, *pA_u_next, *pA_v, *pA_v_next;
    float *pB_u, *pB_u_next, *pB_v, *pB_v_next;
    float *cW_u, *cW_u_next, *cW_u_next_row;
    float *cW_v, *cW_v_next, *cW_v_next_row;
    float *pW_u, *pW_u_next;
    float *pW_v, *pW_v_next;
    float ux, uy, vx, vy;
    int len;
    bool touches_right_border = true;
/* NOTE: the macro references the loop variable i of the for-loop below. Because
 * horizontally/vertically adjacent elements always have the opposite checkerboard
 * color, the (j+1) and next-row neighbors come from the other ("next") color buffer;
 * next_offs_even/next_offs_odd select the offset that aligns the neighbor rows. */
#define INIT_ROW_POINTERS(cur_color, next_color, next_offs_even, next_offs_odd, bool_default) \
    pWeight = var->weights.cur_color.ptr<float>(i + 1) + 1; \
    pA_u = var->A11.cur_color.ptr<float>(i + 1) + 1; \
    pB_u = var->b1.cur_color.ptr<float>(i + 1) + 1; \
    cW_u = curW_u->cur_color.ptr<float>(i + 1) + 1; \
    pW_u = W_u->cur_color.ptr<float>(i + 1) + 1; \
    pA_v = var->A22.cur_color.ptr<float>(i + 1) + 1; \
    pB_v = var->b2.cur_color.ptr<float>(i + 1) + 1; \
    cW_v = curW_v->cur_color.ptr<float>(i + 1) + 1; \
    pW_v = W_v->cur_color.ptr<float>(i + 1) + 1; \
    \
    cW_u_next_row = curW_u->next_color.ptr<float>(i + 2) + 1; \
    cW_v_next_row = curW_v->next_color.ptr<float>(i + 2) + 1; \
    \
    if (i % 2 == 0) \
    { \
        pA_u_next = var->A11.next_color.ptr<float>(i + 1) + next_offs_even; \
        pB_u_next = var->b1.next_color.ptr<float>(i + 1) + next_offs_even; \
        cW_u_next = curW_u->next_color.ptr<float>(i + 1) + next_offs_even; \
        pW_u_next = W_u->next_color.ptr<float>(i + 1) + next_offs_even; \
        pA_v_next = var->A22.next_color.ptr<float>(i + 1) + next_offs_even; \
        pB_v_next = var->b2.next_color.ptr<float>(i + 1) + next_offs_even; \
        cW_v_next = curW_v->next_color.ptr<float>(i + 1) + next_offs_even; \
        pW_v_next = W_v->next_color.ptr<float>(i + 1) + next_offs_even; \
        len = var->A11.cur_color##_even_len; \
        if (var->A11.cur_color##_even_len != var->A11.cur_color##_odd_len) \
            touches_right_border = bool_default; \
        else \
            touches_right_border = !bool_default; \
    } \
    else \
    { \
        pA_u_next = var->A11.next_color.ptr<float>(i + 1) + next_offs_odd; \
        pB_u_next = var->b1.next_color.ptr<float>(i + 1) + next_offs_odd; \
        cW_u_next = curW_u->next_color.ptr<float>(i + 1) + next_offs_odd; \
        pW_u_next = W_u->next_color.ptr<float>(i + 1) + next_offs_odd; \
        pA_v_next = var->A22.next_color.ptr<float>(i + 1) + next_offs_odd; \
        pB_v_next = var->b2.next_color.ptr<float>(i + 1) + next_offs_odd; \
        cW_v_next = curW_v->next_color.ptr<float>(i + 1) + next_offs_odd; \
        pW_v_next = W_v->next_color.ptr<float>(i + 1) + next_offs_odd; \
        len = var->A11.cur_color##_odd_len; \
        if (var->A11.cur_color##_even_len != var->A11.cur_color##_odd_len) \
            touches_right_border = !bool_default; \
        else \
            touches_right_border = bool_default; \
    }
    for (int i = start_i; i < end_i; i++)
    {
        if (red_pass)
        {
            INIT_ROW_POINTERS(red, black, 1, 2, true);
        }
        else
        {
            INIT_ROW_POINTERS(black, red, 2, 1, false);
        }
#undef INIT_ROW_POINTERS
#define COMPUTE \
    /* Gradients for the flow on the current fixed-point iteration: */ \
    ux = cW_u_next[j] - cW_u[j]; \
    vx = cW_v_next[j] - cW_v[j]; \
    uy = cW_u_next_row[j] - cW_u[j]; \
    vy = cW_v_next_row[j] - cW_v[j]; \
    /* Weight of the smoothness term in the current fixed-point iteration: */ \
    pWeight[j] = alpha2 / sqrt(ux * ux + vx * vx + uy * uy + vy * vy + epsilon_squared); \
    /* Gradients for initial raw flow multiplied by weight:*/ \
    ux = pWeight[j] * (pW_u_next[j] - pW_u[j]); \
    vx = pWeight[j] * (pW_v_next[j] - pW_v[j]);
#define UPDATE \
    pB_u[j] += ux; \
    pA_u[j] += pWeight[j]; \
    pB_v[j] += vx; \
    pA_v[j] += pWeight[j]; \
    pB_u_next[j] -= ux; \
    pA_u_next[j] += pWeight[j]; \
    pB_v_next[j] -= vx; \
    pA_v_next[j] += pWeight[j];
        int j = 0;
#ifdef CV_SIMD128
        /* Vectorized COMPUTE+UPDATE; the "len - 4" bound (one element tighter than the
         * usual "len - 3") leaves the final element for the border-aware scalar code below. */
        v_float32x4 alpha2_vec = v_setall_f32(alpha2);
        v_float32x4 eps_vec = v_setall_f32(epsilon_squared);
        v_float32x4 cW_u_vec, cW_v_vec;
        v_float32x4 pWeight_vec, ux_vec, vx_vec, uy_vec, vy_vec;
        for (; j < len - 4; j += 4)
        {
            cW_u_vec = v_load(cW_u + j);
            cW_v_vec = v_load(cW_v + j);
            ux_vec = v_load(cW_u_next + j) - cW_u_vec;
            vx_vec = v_load(cW_v_next + j) - cW_v_vec;
            uy_vec = v_load(cW_u_next_row + j) - cW_u_vec;
            vy_vec = v_load(cW_v_next_row + j) - cW_v_vec;
            pWeight_vec =
              alpha2_vec / v_sqrt(ux_vec * ux_vec + vx_vec * vx_vec + uy_vec * uy_vec + vy_vec * vy_vec + eps_vec);
            v_store(pWeight + j, pWeight_vec);
            ux_vec = pWeight_vec * (v_load(pW_u_next + j) - v_load(pW_u + j));
            vx_vec = pWeight_vec * (v_load(pW_v_next + j) - v_load(pW_v + j));
            v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
            v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
            v_store(pB_u + j, v_load(pB_u + j) + ux_vec);
            v_store(pB_v + j, v_load(pB_v + j) + vx_vec);
            v_store(pA_u_next + j, v_load(pA_u_next + j) + pWeight_vec);
            v_store(pA_v_next + j, v_load(pA_v_next + j) + pWeight_vec);
            v_store(pB_u_next + j, v_load(pB_u_next + j) - ux_vec);
            v_store(pB_v_next + j, v_load(pB_v_next + j) - vx_vec);
        }
#endif
        for (; j < len - 1; j++)
        {
            COMPUTE;
            UPDATE;
        }
        /* Omit the update on the rightmost elements */
        if (touches_right_border)
        {
            COMPUTE;
        }
        else
        {
            COMPUTE;
            UPDATE;
        }
    }
#undef COMPUTE
#undef UPDATE
}
/* Parallel body for the vertical smoothness-term pass (weights were already
 * computed by the horizontal pass). */
VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::ComputeSmoothnessTermVertPass_ParBody(
  VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_W_u, RedBlackBuffer &_W_v, bool _red_pass)
    : var(&_var), nstripes(_nstripes), W_u(&_W_u), W_v(&_W_v), red_pass(_red_pass)
{
    /* Omit the last row in the vertical pass */
    h = _h - 1;
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function adds the last remaining terms to the linear system coefficients A11,A22,b1,b1. */
void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator()(const Range &range) const
{
    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);
    float *pWeight;
    float *pA_u, *pA_u_next_row, *pA_v, *pA_v_next_row;
    float *pB_u, *pB_u_next_row, *pB_v, *pB_v_next_row;
    float *pW_u, *pW_u_next_row, *pW_v, *pW_v_next_row;
    float vy, uy;
    int len;
    for (int i = start_i; i < end_i; i++)
    {
/* Row pointers for the current color plus the vertically adjacent row, which always
 * lives in the other ("next") color buffer. */
#define INIT_ROW_POINTERS(cur_color, next_color) \
    pWeight = var->weights.cur_color.ptr<float>(i + 1) + 1; \
    pA_u = var->A11.cur_color.ptr<float>(i + 1) + 1; \
    pB_u = var->b1.cur_color.ptr<float>(i + 1) + 1; \
    pW_u = W_u->cur_color.ptr<float>(i + 1) + 1; \
    pA_v = var->A22.cur_color.ptr<float>(i + 1) + 1; \
    pB_v = var->b2.cur_color.ptr<float>(i + 1) + 1; \
    pW_v = W_v->cur_color.ptr<float>(i + 1) + 1; \
    \
    pA_u_next_row = var->A11.next_color.ptr<float>(i + 2) + 1; \
    pB_u_next_row = var->b1.next_color.ptr<float>(i + 2) + 1; \
    pW_u_next_row = W_u->next_color.ptr<float>(i + 2) + 1; \
    pA_v_next_row = var->A22.next_color.ptr<float>(i + 2) + 1; \
    pB_v_next_row = var->b2.next_color.ptr<float>(i + 2) + 1; \
    pW_v_next_row = W_v->next_color.ptr<float>(i + 2) + 1; \
    \
    if (i % 2 == 0) \
        len = var->A11.cur_color##_even_len; \
    else \
        len = var->A11.cur_color##_odd_len;
        if (red_pass)
        {
            INIT_ROW_POINTERS(red, black);
        }
        else
        {
            INIT_ROW_POINTERS(black, red);
        }
#undef INIT_ROW_POINTERS
        int j = 0;
#ifdef CV_SIMD128
        /* Vectorized version of the scalar loop below: */
        v_float32x4 pWeight_vec, uy_vec, vy_vec;
        for (; j < len - 3; j += 4)
        {
            pWeight_vec = v_load(pWeight + j);
            uy_vec = pWeight_vec * (v_load(pW_u_next_row + j) - v_load(pW_u + j));
            vy_vec = pWeight_vec * (v_load(pW_v_next_row + j) - v_load(pW_v + j));
            v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
            v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
            v_store(pB_u + j, v_load(pB_u + j) + uy_vec);
            v_store(pB_v + j, v_load(pB_v + j) + vy_vec);
            v_store(pA_u_next_row + j, v_load(pA_u_next_row + j) + pWeight_vec);
            v_store(pA_v_next_row + j, v_load(pA_v_next_row + j) + pWeight_vec);
            v_store(pB_u_next_row + j, v_load(pB_u_next_row + j) - uy_vec);
            v_store(pB_v_next_row + j, v_load(pB_v_next_row + j) - vy_vec);
        }
#endif
        for (; j < len; j++)
        {
            /* Vertical smoothness contribution to both the current row and the row below: */
            uy = pWeight[j] * (pW_u_next_row[j] - pW_u[j]);
            vy = pWeight[j] * (pW_v_next_row[j] - pW_v[j]);
            pB_u[j] += uy;
            pA_u[j] += pWeight[j];
            pB_v[j] += vy;
            pA_v[j] += pWeight[j];
            pB_u_next_row[j] -= uy;
            pA_u_next_row[j] += pWeight[j];
            pB_v_next_row[j] -= vy;
            pA_v_next_row[j] += pWeight[j];
        }
    }
}
/* Parallel body for one red-black SOR half-iteration updating the flow increment
 * (_dW_u, _dW_v) for the color selected by _red_pass. */
VariationalRefinementImpl::RedBlackSOR_ParBody::RedBlackSOR_ParBody(VariationalRefinementImpl &_var, int _nstripes,
                                                                    int _h, RedBlackBuffer &_dW_u,
                                                                    RedBlackBuffer &_dW_v, bool _red_pass)
    : var(&_var), nstripes(_nstripes), h(_h), dW_u(&_dW_u), dW_v(&_dW_v), red_pass(_red_pass)
{
    /* Rows handled by one parallel job (the last stripe may be smaller). */
    stripe_sz = (int)ceil(h / (double)nstripes);
}
/* This function implements the Red-Black SOR (successive-over relaxation) method for solving the main
 * linear system in the current fixed-point iteration.
 */
void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &range) const
{
    int start = min(range.start * stripe_sz, h);
    int end = min(range.end * stripe_sz, h);
    float *pa11, *pa12, *pa22, *pb1, *pb2, *pW, *pdu, *pdv;
    float *pW_next, *pdu_next, *pdv_next;
    float *pW_prev_row, *pdu_prev_row, *pdv_prev_row;
    float *pdu_next_row, *pdv_next_row;
    float sigmaU, sigmaV;
    int j, len;
    for (int i = start; i < end; i++)
    {
/* All four neighbors of an element live in the other ("next") color buffer; the
 * left neighbor is addressed as pd*_next[j - 1], which is why the next-color rows
 * are offset so that index j lines up with the right neighbor. */
#define INIT_ROW_POINTERS(cur_color, next_color, next_offs_even, next_offs_odd) \
    pW = var->weights.cur_color.ptr<float>(i + 1) + 1; \
    pa11 = var->A11.cur_color.ptr<float>(i + 1) + 1; \
    pa12 = var->A12.cur_color.ptr<float>(i + 1) + 1; \
    pa22 = var->A22.cur_color.ptr<float>(i + 1) + 1; \
    pb1 = var->b1.cur_color.ptr<float>(i + 1) + 1; \
    pb2 = var->b2.cur_color.ptr<float>(i + 1) + 1; \
    pdu = dW_u->cur_color.ptr<float>(i + 1) + 1; \
    pdv = dW_v->cur_color.ptr<float>(i + 1) + 1; \
    \
    pdu_next_row = dW_u->next_color.ptr<float>(i + 2) + 1; \
    pdv_next_row = dW_v->next_color.ptr<float>(i + 2) + 1; \
    \
    pW_prev_row = var->weights.next_color.ptr<float>(i) + 1; \
    pdu_prev_row = dW_u->next_color.ptr<float>(i) + 1; \
    pdv_prev_row = dW_v->next_color.ptr<float>(i) + 1; \
    \
    if (i % 2 == 0) \
    { \
        pW_next = var->weights.next_color.ptr<float>(i + 1) + next_offs_even; \
        pdu_next = dW_u->next_color.ptr<float>(i + 1) + next_offs_even; \
        pdv_next = dW_v->next_color.ptr<float>(i + 1) + next_offs_even; \
        len = var->A11.cur_color##_even_len; \
    } \
    else \
    { \
        pW_next = var->weights.next_color.ptr<float>(i + 1) + next_offs_odd; \
        pdu_next = dW_u->next_color.ptr<float>(i + 1) + next_offs_odd; \
        pdv_next = dW_v->next_color.ptr<float>(i + 1) + next_offs_odd; \
        len = var->A11.cur_color##_odd_len; \
    }
        if (red_pass)
        {
            INIT_ROW_POINTERS(red, black, 1, 2);
        }
        else
        {
            INIT_ROW_POINTERS(black, red, 2, 1);
        }
#undef INIT_ROW_POINTERS
        j = 0;
#ifdef CV_SIMD128
        /* Vectorized version of the scalar loop below. The [j - 1] left-neighbor access
         * is emulated with v_extract, carrying the previous vector across iterations.
         * Note that pdu_vec is updated before pdv_vec uses it, matching the scalar
         * in-place update order. */
        v_float32x4 pW_prev_vec = v_setall_f32(pW_next[-1]);
        v_float32x4 pdu_prev_vec = v_setall_f32(pdu_next[-1]);
        v_float32x4 pdv_prev_vec = v_setall_f32(pdv_next[-1]);
        v_float32x4 omega_vec = v_setall_f32(var->omega);
        v_float32x4 pW_vec, pW_next_vec, pW_prev_row_vec;
        v_float32x4 pdu_next_vec, pdu_prev_row_vec, pdu_next_row_vec;
        v_float32x4 pdv_next_vec, pdv_prev_row_vec, pdv_next_row_vec;
        v_float32x4 pW_shifted_vec, pdu_shifted_vec, pdv_shifted_vec;
        v_float32x4 pa12_vec, sigmaU_vec, sigmaV_vec, pdu_vec, pdv_vec;
        for (; j < len - 3; j += 4)
        {
            pW_vec = v_load(pW + j);
            pW_next_vec = v_load(pW_next + j);
            pW_prev_row_vec = v_load(pW_prev_row + j);
            pdu_next_vec = v_load(pdu_next + j);
            pdu_prev_row_vec = v_load(pdu_prev_row + j);
            pdu_next_row_vec = v_load(pdu_next_row + j);
            pdv_next_vec = v_load(pdv_next + j);
            pdv_prev_row_vec = v_load(pdv_prev_row + j);
            pdv_next_row_vec = v_load(pdv_next_row + j);
            pa12_vec = v_load(pa12 + j);
            pW_shifted_vec = v_reinterpret_as_f32(
              v_extract<3>(v_reinterpret_as_s32(pW_prev_vec), v_reinterpret_as_s32(pW_next_vec)));
            pdu_shifted_vec = v_reinterpret_as_f32(
              v_extract<3>(v_reinterpret_as_s32(pdu_prev_vec), v_reinterpret_as_s32(pdu_next_vec)));
            pdv_shifted_vec = v_reinterpret_as_f32(
              v_extract<3>(v_reinterpret_as_s32(pdv_prev_vec), v_reinterpret_as_s32(pdv_next_vec)));
            sigmaU_vec = pW_shifted_vec * pdu_shifted_vec + pW_vec * pdu_next_vec + pW_prev_row_vec * pdu_prev_row_vec +
                         pW_vec * pdu_next_row_vec;
            sigmaV_vec = pW_shifted_vec * pdv_shifted_vec + pW_vec * pdv_next_vec + pW_prev_row_vec * pdv_prev_row_vec +
                         pW_vec * pdv_next_row_vec;
            pdu_vec = v_load(pdu + j);
            pdv_vec = v_load(pdv + j);
            pdu_vec += omega_vec * ((sigmaU_vec + v_load(pb1 + j) - pdv_vec * pa12_vec) / v_load(pa11 + j) - pdu_vec);
            pdv_vec += omega_vec * ((sigmaV_vec + v_load(pb2 + j) - pdu_vec * pa12_vec) / v_load(pa22 + j) - pdv_vec);
            v_store(pdu + j, pdu_vec);
            v_store(pdv + j, pdv_vec);
            pW_prev_vec = pW_next_vec;
            pdu_prev_vec = pdu_next_vec;
            pdv_prev_vec = pdv_next_vec;
        }
#endif
        for (; j < len; j++)
        {
            /* Weighted sum of the four neighbor increments (Gauss-Seidel term): */
            sigmaU = pW_next[j - 1] * pdu_next[j - 1] + pW[j] * pdu_next[j] + pW_prev_row[j] * pdu_prev_row[j] +
                     pW[j] * pdu_next_row[j];
            sigmaV = pW_next[j - 1] * pdv_next[j - 1] + pW[j] * pdv_next[j] + pW_prev_row[j] * pdv_prev_row[j] +
                     pW[j] * pdv_next_row[j];
            /* Over-relaxed update; pdv uses the freshly updated pdu. */
            pdu[j] += var->omega * ((sigmaU + pb1[j] - pdv[j] * pa12[j]) / pa11[j] - pdu[j]);
            pdv[j] += var->omega * ((sigmaV + pb2[j] - pdu[j] * pa12[j]) / pa22[j] - pdv[j]);
        }
    }
}
/* Refines a dense CV_32FC2 flow field in-place: validates the inputs, splits the
 * flow into its horizontal/vertical components, runs calcUV on them, and merges
 * the refined components back into the flow matrix.
 */
void VariationalRefinementImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_Assert(!I0.empty() && I0.channels() == 1);
    CV_Assert(!I1.empty() && I1.channels() == 1);
    CV_Assert(I0.sameSize(I1));
    CV_Assert((I0.depth() == CV_8U && I1.depth() == CV_8U) || (I0.depth() == CV_32F && I1.depth() == CV_32F));
    CV_Assert(!flow.empty() && flow.depth() == CV_32F && flow.channels() == 2);
    CV_Assert(I0.sameSize(flow));
    Mat &flow_mat = flow.getMatRef();
    Mat components[2];
    split(flow_mat, components);
    calcUV(I0, I1, components[0], components[1]);
    merge(components, 2, flow_mat);
}
/* Refines the horizontal (flow_u) and vertical (flow_v) flow components in-place.
 * Runs fixedPointIterations outer iterations; in each one the linear system
 * coefficients are recomputed (data term + smoothness term, red then black halves)
 * and sorIterations red-black SOR steps solve for the flow increment (dW_u, dW_v).
 */
void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v)
{
    CV_Assert(!I0.empty() && I0.channels() == 1);
    CV_Assert(!I1.empty() && I1.channels() == 1);
    CV_Assert(I0.sameSize(I1));
    CV_Assert((I0.depth() == CV_8U && I1.depth() == CV_8U) || (I0.depth() == CV_32F && I1.depth() == CV_32F));
    CV_Assert(!flow_u.empty() && flow_u.depth() == CV_32F && flow_u.channels() == 1);
    CV_Assert(!flow_v.empty() && flow_v.depth() == CV_32F && flow_v.channels() == 1);
    CV_Assert(I0.sameSize(flow_u));
    CV_Assert(flow_u.sameSize(flow_v));
    int num_stripes = getNumThreads();
    Mat I0Mat = I0.getMat();
    Mat I1Mat = I1.getMat();
    Mat &W_u = flow_u.getMatRef();
    Mat &W_v = flow_v.getMatRef();
    /* Allocate buffers and precompute image derivatives: */
    prepareBuffers(I0Mat, I1Mat, W_u, W_v);
    /* Convert the input flow to the red-black layout; tempW holds the flow of the
     * current fixed-point iteration, initialized to the input flow: */
    splitCheckerboard(W_u_rb, W_u);
    splitCheckerboard(W_v_rb, W_v);
    W_u_rb.red.copyTo(tempW_u.red);
    W_u_rb.black.copyTo(tempW_u.black);
    W_v_rb.red.copyTo(tempW_v.red);
    W_v_rb.black.copyTo(tempW_v.black);
    /* Start from a zero flow increment: */
    dW_u.red.setTo(0.0f);
    dW_u.black.setTo(0.0f);
    dW_v.red.setTo(0.0f);
    dW_v.black.setTo(0.0f);
    for (int i = 0; i < fixedPointIterations; i++)
    {
        /* Rebuild the linear system coefficients for the current iteration
         * (each term is computed on the red half, then the black half): */
        parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true));
        parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false));
        parallel_for_(Range(0, num_stripes), ComputeSmoothnessTermHorPass_ParBody(
                                               *this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, tempW_u, tempW_v, true));
        parallel_for_(Range(0, num_stripes), ComputeSmoothnessTermHorPass_ParBody(
                                               *this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, tempW_u, tempW_v, false));
        parallel_for_(Range(0, num_stripes),
                      ComputeSmoothnessTermVertPass_ParBody(*this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, true));
        parallel_for_(Range(0, num_stripes),
                      ComputeSmoothnessTermVertPass_ParBody(*this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, false));
        /* Solve the system with alternating red/black SOR half-steps: */
        for (int j = 0; j < sorIterations; j++)
        {
            parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true));
            parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false));
        }
        /* Apply the increment and refresh the repeated borders: */
        tempW_u.red = W_u_rb.red + dW_u.red;
        tempW_u.black = W_u_rb.black + dW_u.black;
        updateRepeatedBorders(tempW_u);
        tempW_v.red = W_v_rb.red + dW_v.red;
        tempW_v.black = W_v_rb.black + dW_v.black;
        updateRepeatedBorders(tempW_v);
    }
    /* Convert the refined flow back to the plain Mat layout: */
    mergeCheckerboard(W_u, tempW_u);
    mergeCheckerboard(W_v, tempW_v);
}
/* Releases all intermediate buffers allocated by prepareBuffers(). */
void VariationalRefinementImpl::collectGarbage()
{
    Ix.release();
    Iy.release();
    Iz.release();
    Ixx.release();
    Ixy.release();
    Iyy.release();
    Ixz.release();
    Iyz.release();
    Ix_rb.release();
    Iy_rb.release();
    Iz_rb.release();
    Ixx_rb.release();
    Ixy_rb.release();
    Iyy_rb.release();
    Ixz_rb.release();
    Iyz_rb.release();
    A11.release();
    A12.release();
    A22.release();
    b1.release();
    b2.release();
    weights.release();
    mapX.release();
    mapY.release();
    tempW_u.release();
    tempW_v.release();
    dW_u.release();
    dW_v.release();
    W_u_rb.release();
    W_v_rb.release();
}
/* Factory method: creates a VariationalRefinement instance with default parameters. */
Ptr<VariationalRefinement> VariationalRefinement::create()
{ return makePtr<VariationalRefinementImpl>(); }
}
......@@ -7,12 +7,10 @@
// copy or use the software.
//
//
// License Agreement
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
......@@ -25,7 +23,7 @@
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
......@@ -46,72 +44,53 @@
#ifdef HAVE_OPENCL
namespace opencv_test {
namespace ocl {
namespace opencv_test { namespace {
/////////////////////////////////////////////////////////////////////////////////////////////////
// Optical_flow_tvl1
namespace
PARAM_TEST_CASE(OCL_DenseOpticalFlow_DIS, int)
{
IMPLEMENT_PARAM_CLASS(UseInitFlow, bool)
IMPLEMENT_PARAM_CLASS(MedianFiltering, int)
IMPLEMENT_PARAM_CLASS(ScaleStep, double)
}
int preset;
PARAM_TEST_CASE(OpticalFlowTVL1, UseInitFlow, MedianFiltering, ScaleStep)
{
bool useInitFlow;
int medianFiltering;
double scaleStep;
virtual void SetUp()
{
useInitFlow = GET_PARAM(0);
medianFiltering = GET_PARAM(1);
scaleStep = GET_PARAM(2);
preset = GET_PARAM(0);
}
};
OCL_TEST_P(OpticalFlowTVL1, Mat)
OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat)
{
cv::Mat frame0 = readImage("optflow/RubberWhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
Mat frame1, frame2, GT;
cv::Mat frame1 = readImage("optflow/RubberWhale2.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
frame1 = imread(TS::ptr()->get_data_path() + "optflow/RubberWhale1.png");
frame2 = imread(TS::ptr()->get_data_path() + "optflow/RubberWhale2.png");
cv::Mat flow; cv::UMat uflow;
CV_Assert(!frame1.empty() && !frame2.empty());
//create algorithm
cv::Ptr<cv::DualTVL1OpticalFlow> alg = cv::createOptFlow_DualTVL1();
cvtColor(frame1, frame1, COLOR_BGR2GRAY);
cvtColor(frame2, frame2, COLOR_BGR2GRAY);
//set parameters
alg->setScaleStep(scaleStep);
alg->setMedianFiltering(medianFiltering);
Ptr<DenseOpticalFlow> algo;
//create initial flow as result of algorithm calculation
if (useInitFlow)
// iterate over presets:
for (int i = 0; i < cvtest::ocl::test_loop_times; i++)
{
OCL_ON(alg->calc(frame0, frame1, uflow));
uflow.copyTo(flow);
}
Mat flow;
UMat ocl_flow;
//set flag to use initial flow as it is ready to use
alg->setUseInitialFlow(useInitFlow);
algo = DISOpticalFlow::create(preset);
OCL_OFF(algo->calc(frame1, frame2, flow));
OCL_ON(algo->calc(frame1, frame2, ocl_flow));
ASSERT_EQ(flow.rows, ocl_flow.rows);
ASSERT_EQ(flow.cols, ocl_flow.cols);
OCL_OFF(alg->calc(frame0, frame1, flow));
OCL_ON(alg->calc(frame0, frame1, uflow));
EXPECT_MAT_SIMILAR(flow, uflow, 1e-2);
EXPECT_MAT_SIMILAR(flow, ocl_flow, 6e-3);
}
}
OCL_INSTANTIATE_TEST_CASE_P(Video, OpticalFlowTVL1,
Combine(
Values(UseInitFlow(false), UseInitFlow(true)),
Values(MedianFiltering(3), MedianFiltering(-1)),
Values(ScaleStep(0.3),ScaleStep(0.5))
)
);
OCL_INSTANTIATE_TEST_CASE_P(Video, OCL_DenseOpticalFlow_DIS,
Values(DISOpticalFlow::PRESET_ULTRAFAST,
DISOpticalFlow::PRESET_FAST,
DISOpticalFlow::PRESET_MEDIUM));
} } // namespace opencv_test::ocl
}} // namespace
#endif // HAVE_OPENCL
......@@ -43,131 +43,105 @@
namespace opencv_test { namespace {
//#define DUMP
static string getDataDir() { return TS::ptr()->get_data_path(); }
// first four bytes, should be the same in little endian
const float FLO_TAG_FLOAT = 202021.25f; // check for this when READING the file
static string getRubberWhaleFrame1() { return getDataDir() + "optflow/RubberWhale1.png"; }
#ifdef DUMP
// binary file format for flow data specified here:
// http://vision.middlebury.edu/flow/data/
void writeOpticalFlowToFile(const Mat_<Point2f>& flow, const string& fileName)
{
const char FLO_TAG_STRING[] = "PIEH"; // use this when WRITING the file
ofstream file(fileName.c_str(), ios_base::binary);
static string getRubberWhaleFrame2() { return getDataDir() + "optflow/RubberWhale2.png"; }
file << FLO_TAG_STRING;
static string getRubberWhaleGroundTruth() { return getDataDir() + "optflow/RubberWhale.flo"; }
file.write((const char*) &flow.cols, sizeof(int));
file.write((const char*) &flow.rows, sizeof(int));
static bool isFlowCorrect(float u) { return !cvIsNaN(u) && (fabs(u) < 1e9); }
for (int i = 0; i < flow.rows; ++i)
{
for (int j = 0; j < flow.cols; ++j)
{
const Point2f u = flow(i, j);
file.write((const char*) &u.x, sizeof(float));
file.write((const char*) &u.y, sizeof(float));
}
}
}
#endif
static float calcRMSE(Mat flow1, Mat flow2)
{
float sum = 0;
int counter = 0;
const int rows = flow1.rows;
const int cols = flow1.cols;
// binary file format for flow data specified here:
// http://vision.middlebury.edu/flow/data/
void readOpticalFlowFromFile(Mat_<Point2f>& flow, const string& fileName)
for (int y = 0; y < rows; ++y)
{
std::ifstream file(fileName.c_str(), std::ios_base::binary);
float tag;
file.read((char*) &tag, sizeof(float));
CV_Assert( tag == FLO_TAG_FLOAT );
Size size;
file.read((char*) &size.width, sizeof(int));
file.read((char*) &size.height, sizeof(int));
flow.create(size);
for (int i = 0; i < flow.rows; ++i)
for (int x = 0; x < cols; ++x)
{
for (int j = 0; j < flow.cols; ++j)
{
Point2f u;
file.read((char*) &u.x, sizeof(float));
file.read((char*) &u.y, sizeof(float));
flow(i, j) = u;
}
}
file.close();
}
Vec2f flow1_at_point = flow1.at<Vec2f>(y, x);
Vec2f flow2_at_point = flow2.at<Vec2f>(y, x);
bool isFlowCorrect(Point2f u)
{
return !cvIsNaN(u.x) && !cvIsNaN(u.y) && (fabs(u.x) < 1e9) && (fabs(u.y) < 1e9);
}
float u1 = flow1_at_point[0];
float v1 = flow1_at_point[1];
float u2 = flow2_at_point[0];
float v2 = flow2_at_point[1];
void check(const Mat_<Point2f>& gold, const Mat_<Point2f>& flow, double threshold = 0.1, double expectedAccuracy = 0.95)
{
threshold = threshold*threshold;
size_t gold_counter = 0;
size_t valid_counter = 0;
for (int i = 0; i < gold.rows; ++i)
{
for (int j = 0; j < gold.cols; ++j)
if (isFlowCorrect(u1) && isFlowCorrect(u2) && isFlowCorrect(v1) && isFlowCorrect(v2))
{
const Point2f u1 = gold(i, j);
const Point2f u2 = flow(i, j);
if (isFlowCorrect(u1))
{
gold_counter++;
if (isFlowCorrect(u2))
{
const Point2f diff = u1 - u2;
double err = diff.ddot(diff);
if (err <= threshold)
valid_counter++;
}
}
sum += (u1 - u2) * (u1 - u2) + (v1 - v2) * (v1 - v2);
counter++;
}
}
EXPECT_GE(valid_counter, expectedAccuracy * gold_counter);
}
return (float)sqrt(sum / (1e-9 + counter));
}
TEST(Video_calcOpticalFlowDual_TVL1, Regression)
bool readRubberWhale(Mat &dst_frame_1, Mat &dst_frame_2, Mat &dst_GT)
{
const string frame1_path = TS::ptr()->get_data_path() + "optflow/RubberWhale1.png";
const string frame2_path = TS::ptr()->get_data_path() + "optflow/RubberWhale2.png";
const string gold_flow_path = TS::ptr()->get_data_path() + "optflow/tvl1_flow.flo";
Mat frame1 = imread(frame1_path, IMREAD_GRAYSCALE);
Mat frame2 = imread(frame2_path, IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
ASSERT_FALSE(frame2.empty());
Mat_<Point2f> flow;
Ptr<DualTVL1OpticalFlow> tvl1 = cv::DualTVL1OpticalFlow::create();
const string frame1_path = getRubberWhaleFrame1();
const string frame2_path = getRubberWhaleFrame2();
const string gt_flow_path = getRubberWhaleGroundTruth();
dst_frame_1 = imread(frame1_path);
dst_frame_2 = imread(frame2_path);
dst_GT = readOpticalFlow(gt_flow_path);
if (dst_frame_1.empty() || dst_frame_2.empty() || dst_GT.empty())
return false;
else
return true;
}
tvl1->calc(frame1, frame2, flow);
TEST(DenseOpticalFlow_DIS, ReferenceAccuracy)
{
Mat frame1, frame2, GT;
ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));
int presets[] = {DISOpticalFlow::PRESET_ULTRAFAST, DISOpticalFlow::PRESET_FAST, DISOpticalFlow::PRESET_MEDIUM};
float target_RMSE[] = {0.86f, 0.74f, 0.49f};
cvtColor(frame1, frame1, COLOR_BGR2GRAY);
cvtColor(frame2, frame2, COLOR_BGR2GRAY);
#ifdef DUMP
writeOpticalFlowToFile(flow, gold_flow_path);
#else
Mat_<Point2f> gold;
readOpticalFlowFromFile(gold, gold_flow_path);
Ptr<DenseOpticalFlow> algo;
ASSERT_EQ(gold.rows, flow.rows);
ASSERT_EQ(gold.cols, flow.cols);
// iterate over presets:
for (int i = 0; i < 3; i++)
{
Mat flow;
algo = DISOpticalFlow::create(presets[i]);
algo->calc(frame1, frame2, flow);
ASSERT_EQ(GT.rows, flow.rows);
ASSERT_EQ(GT.cols, flow.cols);
EXPECT_LE(calcRMSE(GT, flow), target_RMSE[i]);
}
}
check(gold, flow);
#endif
// Regression test: VariationalRefinement, seeded with a zero flow field and
// fixed parameters, must reach at least the reference accuracy (RMSE vs. the
// RubberWhale ground-truth .flo) established when the test was written.
TEST(DenseOpticalFlow_VariationalRefinement, ReferenceAccuracy)
{
Mat frame1, frame2, GT;
// Load the RubberWhale pair and its ground-truth flow; skip hard on failure.
ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));
// Empirical upper bound on RMSE for this input with the settings below.
float target_RMSE = 0.86f;
// The algorithm expects single-channel input.
cvtColor(frame1, frame1, COLOR_BGR2GRAY);
cvtColor(frame2, frame2, COLOR_BGR2GRAY);
Ptr<VariationalRefinement> var_ref;
var_ref = VariationalRefinement::create();
// Fixed, deterministic parameter set so the RMSE threshold is meaningful.
var_ref->setAlpha(20.0f);
var_ref->setDelta(5.0f);
var_ref->setGamma(10.0f);
var_ref->setSorIterations(25);
var_ref->setFixedPointIterations(25);
// VariationalRefinement refines an existing flow; start from all-zero flow.
Mat flow(frame1.size(), CV_32FC2);
flow.setTo(0.0f);
var_ref->calc(frame1, frame2, flow);
// Output must match the ground-truth dimensions before comparing values.
ASSERT_EQ(GT.rows, flow.rows);
ASSERT_EQ(GT.cols, flow.cols);
EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
}
}} // namespace
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
namespace opencv_test { namespace {
typedef tuple<Size> OFParams;
typedef TestWithParam<OFParams> DenseOpticalFlow_DIS;
typedef TestWithParam<OFParams> DenseOpticalFlow_VariationalRefinement;
// Checks that DISOpticalFlow produces bit-identical (within tight tolerances)
// results regardless of the number of threads used, across several randomized
// parameter combinations. Skipped in single-threaded environments where the
// comparison would be vacuous.
TEST_P(DenseOpticalFlow_DIS, MultithreadReproducibility)
{
// Max allowed per-pixel and mean difference between 1-thread and N-thread runs.
double MAX_DIF = 0.01;
double MAX_MEAN_DIF = 0.001;
int loopsCount = 2;
// Fixed seed: parameter randomization is reproducible between test runs.
RNG rng(0);
OFParams params = GetParam();
Size size = get<0>(params);
int nThreads = cv::getNumThreads();
if (nThreads == 1)
throw SkipTestException("Single thread environment");
for (int iter = 0; iter <= loopsCount; iter++)
{
// Random noise frames are sufficient: we only compare the algorithm to itself.
Mat frame1(size, CV_8U);
randu(frame1, 0, 255);
Mat frame2(size, CV_8U);
randu(frame2, 0, 255);
Ptr<DISOpticalFlow> algo = DISOpticalFlow::create();
// Randomize the main knobs; pstr < psz is required by the draw below.
int psz = rng.uniform(4, 16);
int pstr = rng.uniform(1, psz - 1);
int grad_iter = rng.uniform(1, 64);
int var_iter = rng.uniform(0, 10);
bool use_mean_normalization = !!rng.uniform(0, 2);
bool use_spatial_propagation = !!rng.uniform(0, 2);
algo->setFinestScale(0);
algo->setPatchSize(psz);
algo->setPatchStride(pstr);
algo->setGradientDescentIterations(grad_iter);
algo->setVariationalRefinementIterations(var_iter);
algo->setUseMeanNormalization(use_mean_normalization);
algo->setUseSpatialPropagation(use_spatial_propagation);
// Run once multi-threaded, once single-threaded, on identical inputs.
cv::setNumThreads(nThreads);
Mat resMultiThread;
algo->calc(frame1, frame2, resMultiThread);
cv::setNumThreads(1);
Mat resSingleThread;
algo->calc(frame1, frame2, resSingleThread);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_INF), MAX_DIF);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_L1), MAX_MEAN_DIF * frame1.total());
// resulting flow should be within the frame bounds:
double min_val, max_val;
minMaxLoc(resMultiThread, &min_val, &max_val);
// Any valid displacement is bounded by the frame diagonal.
EXPECT_LE(abs(min_val), sqrt( static_cast<double>(size.height * size.height + size.width * size.width)) );
EXPECT_LE(abs(max_val), sqrt( static_cast<double>(size.height * size.height + size.width * size.width)) );
}
}
INSTANTIATE_TEST_CASE_P(FullSet, DenseOpticalFlow_DIS, Values(szODD, szQVGA));
// Same reproducibility check as the DIS test above, but for
// VariationalRefinement: single-threaded and multi-threaded runs with
// identical (randomized) parameters and an identical random initial flow
// must agree within tight tolerances.
TEST_P(DenseOpticalFlow_VariationalRefinement, MultithreadReproducibility)
{
// Max allowed per-pixel and mean difference between 1-thread and N-thread runs.
double MAX_DIF = 0.01;
double MAX_MEAN_DIF = 0.001;
// Magnitude of the random initial flow the refinement starts from.
float input_flow_rad = 5.0;
int loopsCount = 2;
// Fixed seed keeps the randomized parameters reproducible between runs.
RNG rng(0);
OFParams params = GetParam();
Size size = get<0>(params);
int nThreads = cv::getNumThreads();
if (nThreads == 1)
throw SkipTestException("Single thread environment");
for (int iter = 0; iter <= loopsCount; iter++)
{
// Random noise frames suffice: the algorithm is only compared to itself.
Mat frame1(size, CV_8U);
randu(frame1, 0, 255);
Mat frame2(size, CV_8U);
randu(frame2, 0, 255);
// VariationalRefinement refines an existing flow, so seed it with noise.
Mat flow(size, CV_32FC2);
randu(flow, -input_flow_rad, input_flow_rad);
Ptr<VariationalRefinement> var = VariationalRefinement::create();
var->setAlpha(rng.uniform(1.0f, 100.0f));
var->setGamma(rng.uniform(0.1f, 10.0f));
var->setDelta(rng.uniform(0.1f, 10.0f));
var->setSorIterations(rng.uniform(1, 20));
var->setFixedPointIterations(rng.uniform(1, 20));
var->setOmega(rng.uniform(1.01f, 1.99f));
// Both runs must start from a copy of the same initial flow.
cv::setNumThreads(nThreads);
Mat resMultiThread;
flow.copyTo(resMultiThread);
var->calc(frame1, frame2, resMultiThread);
cv::setNumThreads(1);
Mat resSingleThread;
flow.copyTo(resSingleThread);
var->calc(frame1, frame2, resSingleThread);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_INF), MAX_DIF);
EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_L1), MAX_MEAN_DIF * frame1.total());
// resulting flow should be within the frame bounds:
double min_val, max_val;
minMaxLoc(resMultiThread, &min_val, &max_val);
// Any valid displacement is bounded by the frame diagonal.
EXPECT_LE(abs(min_val), sqrt( static_cast<double>(size.height * size.height + size.width * size.width)) );
EXPECT_LE(abs(max_val), sqrt( static_cast<double>(size.height * size.height + size.width * size.width)) );
}
}
INSTANTIATE_TEST_CASE_P(FullSet, DenseOpticalFlow_VariationalRefinement, Values(szODD, szQVGA));
}} // namespace
......@@ -6,5 +6,10 @@
#include "opencv2/ts.hpp"
#include "opencv2/video.hpp"
#include <opencv2/ts/ts_perf.hpp>
namespace opencv_test {
using namespace perf;
}
#endif
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/video.hpp"
using namespace std;
using namespace cv;
// Print a one-line usage message for the DIS optical flow demo.
static void help()
{
    fprintf(stdout, "Usage: dis_optflow <video_file>\n");
}
// Demo entry point: reads frames from a video file, computes DIS optical flow
// between consecutive grayscale frames, and displays an HSV visualization
// (hue = direction, saturation = magnitude) alongside the original frame.
int main(int argc, char **argv)
{
VideoCapture cap;
// A video file path is required; bail out with usage info otherwise.
if (argc < 2)
{
help();
exit(1);
}
cap.open(argv[1]);
if(!cap.isOpened())
{
printf("ERROR: Cannot open file %s\n", argv[1]);
return -1;
}
Mat prevgray, gray, rgb, frame;
Mat flow, flow_uv[2];
Mat mag, ang;
Mat hsv_split[3], hsv;
char ret;
namedWindow("flow", 1);
namedWindow("orig", 1);
// MEDIUM preset: balanced speed/quality trade-off for live visualization.
Ptr<DenseOpticalFlow> algorithm = DISOpticalFlow::create(DISOpticalFlow::PRESET_MEDIUM);
while(true)
{
cap >> frame;
if (frame.empty())
break;
cvtColor(frame, gray, COLOR_BGR2GRAY);
// Flow needs two frames; skip visualization on the very first iteration.
if (!prevgray.empty())
{
algorithm->calc(prevgray, gray, flow);
// Split into u/v components; flip v so the displayed angle matches
// image coordinates (y grows downward).
split(flow, flow_uv);
multiply(flow_uv[1], -1, flow_uv[1]);
cartToPolar(flow_uv[0], flow_uv[1], mag, ang, true);
normalize(mag, mag, 0, 1, NORM_MINMAX);
// Build HSV image: hue = flow angle, saturation = normalized magnitude.
hsv_split[0] = ang;
hsv_split[1] = mag;
hsv_split[2] = Mat::ones(ang.size(), ang.type());
merge(hsv_split, 3, hsv);
cvtColor(hsv, rgb, COLOR_HSV2BGR);
imshow("flow", rgb);
imshow("orig", frame);
}
// Any key press stops the loop; 20 ms delay paces playback.
if ((ret = (char)waitKey(20)) > 0)
break;
// Reuse the current frame as the previous one for the next iteration.
std::swap(prevgray, gray);
}
return 0;
}
#include <iostream>
#include <fstream>
#include <opencv2/core/utility.hpp>
#include "opencv2/video.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/highgui.hpp"
using namespace cv;
using namespace std;
// A flow vector is usable when both components are finite numbers
// below the 1e9 sentinel used for unknown/invalid flow.
inline bool isFlowCorrect(Point2f u)
{
    const bool x_ok = !cvIsNaN(u.x) && fabs(u.x) < 1e9;
    const bool y_ok = !cvIsNaN(u.y) && fabs(u.y) < 1e9;
    return x_ok && y_ok;
}
// Maps one flow vector (fx, fy), pre-normalized to roughly [-1, 1], to a BGR
// pixel: the vector's direction picks a hue from a piecewise-linear color
// wheel and its magnitude controls saturation (color coding in the style of
// the Middlebury optical flow benchmark). Vectors with magnitude > 1 are
// dimmed to mark them as out of range.
// NOTE(review): the lazy color-wheel initialization below is unguarded —
// presumably this is only called from a single thread; verify before reuse.
static Vec3b computeColor(float fx, float fy)
{
static bool first = true;
// relative lengths of color transitions:
// these are chosen based on perceptual similarity
// (e.g. one can distinguish more shades between red and yellow
// than between yellow and green)
const int RY = 15;
const int YG = 6;
const int GC = 4;
const int CB = 11;
const int BM = 13;
const int MR = 6;
const int NCOLS = RY + YG + GC + CB + BM + MR;
static Vec3i colorWheel[NCOLS];
// Build the wheel once: six linear ramps through R->Y->G->C->B->M->R.
if (first)
{
int k = 0;
for (int i = 0; i < RY; ++i, ++k)
colorWheel[k] = Vec3i(255, 255 * i / RY, 0);
for (int i = 0; i < YG; ++i, ++k)
colorWheel[k] = Vec3i(255 - 255 * i / YG, 255, 0);
for (int i = 0; i < GC; ++i, ++k)
colorWheel[k] = Vec3i(0, 255, 255 * i / GC);
for (int i = 0; i < CB; ++i, ++k)
colorWheel[k] = Vec3i(0, 255 - 255 * i / CB, 255);
for (int i = 0; i < BM; ++i, ++k)
colorWheel[k] = Vec3i(255 * i / BM, 0, 255);
for (int i = 0; i < MR; ++i, ++k)
colorWheel[k] = Vec3i(255, 0, 255 - 255 * i / MR);
first = false;
}
// Magnitude and angle of the vector; a is mapped from [-pi, pi] to [-1, 1].
const float rad = sqrt(fx * fx + fy * fy);
const float a = atan2(-fy, -fx) / (float)CV_PI;
// fk is a fractional index into the wheel; interpolate between k0 and k1.
const float fk = (a + 1.0f) / 2.0f * (NCOLS - 1);
const int k0 = static_cast<int>(fk);
const int k1 = (k0 + 1) % NCOLS;
const float f = fk - k0;
Vec3b pix;
// Blend each channel between the two neighboring wheel entries,
// then scale by magnitude. Output is written in BGR order (pix[2 - b]).
for (int b = 0; b < 3; b++)
{
const float col0 = colorWheel[k0][b] / 255.f;
const float col1 = colorWheel[k1][b] / 255.f;
float col = (1 - f) * col0 + f * col1;
if (rad <= 1)
col = 1 - rad * (1 - col); // increase saturation with radius
else
col *= .75; // out of range
pix[2 - b] = static_cast<uchar>(255.f * col);
}
return pix;
}
static void drawOpticalFlow(const Mat_<Point2f>& flow, Mat& dst, float maxmotion = -1)
{
dst.create(flow.size(), CV_8UC3);
dst.setTo(Scalar::all(0));
// determine motion range:
float maxrad = maxmotion;
if (maxmotion <= 0)
{
maxrad = 1;
for (int y = 0; y < flow.rows; ++y)
{
for (int x = 0; x < flow.cols; ++x)
{
Point2f u = flow(y, x);
if (!isFlowCorrect(u))
continue;
maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
}
}
}
for (int y = 0; y < flow.rows; ++y)
{
for (int x = 0; x < flow.cols; ++x)
{
Point2f u = flow(y, x);
if (isFlowCorrect(u))
dst.at<Vec3b>(y, x) = computeColor(u.x / maxrad, u.y / maxrad);
}
}
}
// binary file format for flow data specified here:
// http://vision.middlebury.edu/flow/data/
static void writeOpticalFlowToFile(const Mat_<Point2f>& flow, const string& fileName)
{
static const char FLO_TAG_STRING[] = "PIEH";
ofstream file(fileName.c_str(), ios_base::binary);
file << FLO_TAG_STRING;
file.write((const char*) &flow.cols, sizeof(int));
file.write((const char*) &flow.rows, sizeof(int));
for (int i = 0; i < flow.rows; ++i)
{
for (int j = 0; j < flow.cols; ++j)
{
const Point2f u = flow(i, j);
file.write((const char*) &u.x, sizeof(float));
file.write((const char*) &u.y, sizeof(float));
}
}
}
int main(int argc, const char* argv[])
{
cv::CommandLineParser parser(argc, argv, "{help h || show help message}"
"{ @frame0 | | frame 0}{ @frame1 | | frame 1}{ @output | | output flow}");
if (parser.has("help"))
{
parser.printMessage();
return 0;
}
string frame0_name = parser.get<string>("@frame0");
string frame1_name = parser.get<string>("@frame1");
string file = parser.get<string>("@output");
if (frame0_name.empty() || frame1_name.empty() || file.empty())
{
cerr << "Usage : " << argv[0] << " [<frame0>] [<frame1>] [<output_flow>]" << endl;
return -1;
}
Mat frame0 = imread(frame0_name, IMREAD_GRAYSCALE);
Mat frame1 = imread(frame1_name, IMREAD_GRAYSCALE);
if (frame0.empty())
{
cerr << "Can't open image [" << parser.get<string>("frame0") << "]" << endl;
return -1;
}
if (frame1.empty())
{
cerr << "Can't open image [" << parser.get<string>("frame1") << "]" << endl;
return -1;
}
if (frame1.size() != frame0.size())
{
cerr << "Images should be of equal sizes" << endl;
return -1;
}
Mat_<Point2f> flow;
Ptr<DualTVL1OpticalFlow> tvl1 = cv::DualTVL1OpticalFlow::create();
const double start = (double)getTickCount();
tvl1->calc(frame0, frame1, flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "calcOpticalFlowDual_TVL1 : " << timeSec << " sec" << endl;
Mat out;
drawOpticalFlow(flow, out);
if (!file.empty())
writeOpticalFlowToFile(flow, file);
imshow("Flow", out);
waitKey();
return 0;
}
#!/usr/bin/env python
'''
example to show optical flow estimation using DISOpticalFlow
USAGE: dis_opt_flow.py [<video_source>]
Keys:
1 - toggle HSV flow visualization
2 - toggle glitch
3 - toggle spatial propagation of flow vectors
4 - toggle temporal propagation of flow vectors
ESC - exit
'''
# Python 2/3 compatibility
from __future__ import print_function
import numpy as np
import cv2 as cv
import video
def draw_flow(img, flow, step=16):
    """Overlay the flow field on a BGR copy of a grayscale image.

    Samples the field every `step` pixels and draws each sampled vector as a
    green line segment with a green dot at its origin.
    """
    h, w = img.shape[:2]
    # Regular sampling grid over the image, offset by half a step.
    y, x = np.mgrid[step/2:h:step, step/2:w:step].reshape(2, -1).astype(int)
    fx, fy = flow[y, x].T
    # One (start, end) point pair per sampled vector, rounded to pixels.
    segments = np.vstack([x, y, x + fx, y + fy]).T.reshape(-1, 2, 2)
    segments = np.int32(segments + 0.5)
    vis = cv.cvtColor(img, cv.COLOR_GRAY2BGR)
    cv.polylines(vis, segments, 0, (0, 255, 0))
    for (x1, y1), _ in segments:
        cv.circle(vis, (x1, y1), 1, (0, 255, 0), -1)
    return vis
def draw_hsv(flow):
    """Visualize a dense flow field as a BGR image.

    Hue encodes flow direction, value encodes magnitude (scaled and clipped
    to 255); saturation is fixed at maximum.
    """
    h, w = flow.shape[:2]
    fx, fy = flow[:, :, 0], flow[:, :, 1]
    # Angle shifted into [0, 2*pi), then packed into OpenCV's 0..180 hue range.
    ang = np.arctan2(fy, fx) + np.pi
    mag = np.sqrt(fx * fx + fy * fy)
    hsv = np.zeros((h, w, 3), np.uint8)
    hsv[..., 0] = ang * (180 / np.pi / 2)
    hsv[..., 1] = 255
    hsv[..., 2] = np.minimum(mag * 4, 255)
    return cv.cvtColor(hsv, cv.COLOR_HSV2BGR)
def warp_flow(img, flow):
    """Backward-warp `img` by `flow` using bilinear remapping.

    Builds an absolute-coordinate sampling map from the negated flow and
    lets cv.remap pull each output pixel from its source location.
    """
    h, w = flow.shape[:2]
    # Negate the displacements, then add pixel coordinates to get the map.
    coords = -flow
    coords[:, :, 0] += np.arange(w)
    coords[:, :, 1] += np.arange(h)[:, np.newaxis]
    return cv.remap(img, coords, None, cv.INTER_LINEAR)
# Interactive demo loop: grab frames from a camera/video source, run
# DISOpticalFlow on consecutive grayscale frames, and show the flow with
# the visualizations above. Keys 1-4 toggle features, ESC quits.
if __name__ == '__main__':
import sys
print(__doc__)
# Optional CLI argument selects the video source; default 0 = first camera.
try:
fn = sys.argv[1]
except IndexError:
fn = 0
cam = video.create_capture(fn)
ret, prev = cam.read()
prevgray = cv.cvtColor(prev, cv.COLOR_BGR2GRAY)
# Feature toggles driven by the key handlers below.
show_hsv = False
show_glitch = False
use_spatial_propagation = False
use_temporal_propagation = True
cur_glitch = prev.copy()
inst = cv.DISOpticalFlow.create(cv.DISOPTICAL_FLOW_PRESET_MEDIUM)
inst.setUseSpatialPropagation(use_spatial_propagation)
flow = None
while True:
ret, img = cam.read()
gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
# Temporal propagation: seed this frame's estimate with the previous flow.
if flow is not None and use_temporal_propagation:
#warp previous flow to get an initial approximation for the current flow:
flow = inst.calc(prevgray, gray, warp_flow(flow,flow))
else:
flow = inst.calc(prevgray, gray, None)
prevgray = gray
cv.imshow('flow', draw_flow(gray, flow))
if show_hsv:
cv.imshow('flow HSV', draw_hsv(flow))
# "Glitch" mode: keep warping the same frozen frame by the current flow.
if show_glitch:
cur_glitch = warp_flow(cur_glitch, flow)
cv.imshow('glitch', cur_glitch)
# Key handling; 27 == ESC exits the loop.
ch = 0xFF & cv.waitKey(5)
if ch == 27:
break
if ch == ord('1'):
show_hsv = not show_hsv
print('HSV flow visualization is', ['off', 'on'][show_hsv])
if ch == ord('2'):
show_glitch = not show_glitch
if show_glitch:
cur_glitch = img.copy()
print('glitch is', ['off', 'on'][show_glitch])
if ch == ord('3'):
use_spatial_propagation = not use_spatial_propagation
inst.setUseSpatialPropagation(use_spatial_propagation)
print('spatial propagation is', ['off', 'on'][use_spatial_propagation])
if ch == ord('4'):
use_temporal_propagation = not use_temporal_propagation
print('temporal propagation is', ['off', 'on'][use_temporal_propagation])
cv.destroyAllWindows()
......@@ -50,7 +50,7 @@ int main(int argc, const char* argv[])
const char* keys =
"{ h help | | print help message }"
"{ c camera | 0 | capture video from camera (device index starting from 0) }"
"{ a algorithm | fb | algorithm (supported: 'fb', 'tvl')}"
"{ a algorithm | fb | algorithm (supported: 'fb', 'dis')}"
"{ m cpu | | run without OpenCL }"
"{ v video | | use video as input }"
"{ o original | | use original frame size (do not resize to 640x480)}"
......@@ -84,11 +84,11 @@ int main(int argc, const char* argv[])
return 2;
}
cv::Ptr<cv::DenseOpticalFlow> alg;
Ptr<DenseOpticalFlow> alg;
if (algorithm == "fb")
alg = cv::FarnebackOpticalFlow::create();
else if (algorithm == "tvl")
alg = cv::DualTVL1OpticalFlow::create();
alg = FarnebackOpticalFlow::create();
else if (algorithm == "dis")
alg = DISOpticalFlow::create(DISOpticalFlow::PRESET_FAST);
else
{
cout << "Invalid algorithm: " << algorithm << endl;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment