Merge pull request #1885 from vpisarev:shuffle_optflow_algos

6389627d · Alexander Alekhin · f39cb5b0 · dbfa4d72 · 6389627d · f39cb5b0
Commit 6389627d authored Nov 09, 2018 by Alexander Alekhin
24 changed files
--- a/modules/optflow/include/opencv2/optflow.hpp
+++ b/modules/optflow/include/opencv2/optflow.hpp
@@ -73,7 +73,7 @@ namespace cv
 {
 namespace optflow
 {
 //! @addtogroup optflow
 //! @{
@@ -137,85 +137,6 @@ CV_EXPORTS_W void calcOpticalFlowSparseToDense ( InputArray from, InputArray to,
                                                 bool use_post_proc = true, float fgs_lambda = 500.0f,
                                                 float fgs_sigma = 1.5f );
-/** @brief Read a .flo file
-@param path Path to the file to be loaded
-The function readOpticalFlow loads a flow field from a file and returns it as a single matrix.
-Resulting Mat has a type CV_32FC2 - floating-point, 2-channel. First channel corresponds to the
-flow in the horizontal direction (u), second - vertical (v).
- */
-CV_EXPORTS_W Mat readOpticalFlow( const String& path );
-/** @brief Write a .flo to disk
-@param path Path to the file to be written
-@param flow Flow field to be stored
-The function stores a flow field in a file, returns true on success, false otherwise.
-The flow field must be a 2-channel, floating-point matrix (CV_32FC2). First channel corresponds
-to the flow in the horizontal direction (u), second - vertical (v).
- */
-CV_EXPORTS_W bool writeOpticalFlow( const String& path, InputArray flow );
-/** @brief Variational optical flow refinement
-This class implements variational refinement of the input flow field, i.e.
-it uses input flow to initialize the minimization of the following functional:
-\f$E(U) = \int_{\Omega} \delta \Psi(E_I) + \gamma \Psi(E_G) + \alpha \Psi(E_S) \f$,
-where \f$E_I,E_G,E_S\f$ are color constancy, gradient constancy and smoothness terms
-respectively. \f$\Psi(s^2)=\sqrt{s^2+\epsilon^2}\f$ is a robust penalizer to limit the
-influence of outliers. A complete formulation and a description of the minimization
-procedure can be found in @cite Brox2004
-*/
-class CV_EXPORTS_W VariationalRefinement : public DenseOpticalFlow
-{
-public:
-    /** @brief @ref calc function overload to handle separate horizontal (u) and vertical (v) flow components
-    (to avoid extra splits/merges) */
-    CV_WRAP virtual void calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v) = 0;
-    /** @brief Number of outer (fixed-point) iterations in the minimization procedure.
-    @see setFixedPointIterations */
-    CV_WRAP virtual int getFixedPointIterations() const = 0;
-    /** @copybrief getFixedPointIterations @see getFixedPointIterations */
-    CV_WRAP virtual void setFixedPointIterations(int val) = 0;
-    /** @brief Number of inner successive over-relaxation (SOR) iterations
-        in the minimization procedure to solve the respective linear system.
-    @see setSorIterations */
-    CV_WRAP virtual int getSorIterations() const = 0;
-    /** @copybrief getSorIterations @see getSorIterations */
-    CV_WRAP virtual void setSorIterations(int val) = 0;
-    /** @brief Relaxation factor in SOR
-    @see setOmega */
-    CV_WRAP virtual float getOmega() const = 0;
-    /** @copybrief getOmega @see getOmega */
-    CV_WRAP virtual void setOmega(float val) = 0;
-    /** @brief Weight of the smoothness term
-    @see setAlpha */
-    CV_WRAP virtual float getAlpha() const = 0;
-    /** @copybrief getAlpha @see getAlpha */
-    CV_WRAP virtual void setAlpha(float val) = 0;
-    /** @brief Weight of the color constancy term
-    @see setDelta */
-    CV_WRAP virtual float getDelta() const = 0;
-    /** @copybrief getDelta @see getDelta */
-    CV_WRAP virtual void setDelta(float val) = 0;
-    /** @brief Weight of the gradient constancy term
-    @see setGamma */
-    CV_WRAP virtual float getGamma() const = 0;
-    /** @copybrief getGamma @see getGamma */
-    CV_WRAP virtual void setGamma(float val) = 0;
-};
-/** @brief Creates an instance of VariationalRefinement
-*/
-CV_EXPORTS_W Ptr<VariationalRefinement> createVariationalFlowRefinement();
 /** @brief DeepFlow optical flow algorithm implementation.
 The class implements the DeepFlow optical flow algorithm described in @cite Weinzaepfel2013 . See
@@ -252,107 +173,131 @@ CV_EXPORTS_W Ptr<DenseOpticalFlow> createOptFlow_Farneback();
 //! Additional interface to the SparseToDenseFlow algorithm - calcOpticalFlowSparseToDense()
 CV_EXPORTS_W Ptr<DenseOpticalFlow> createOptFlow_SparseToDense();
-/** @brief DIS optical flow algorithm.
+/** @brief "Dual TV L1" Optical Flow Algorithm.
+The class implements the "Dual TV L1" optical flow algorithm described in @cite Zach2007 and
+@cite Javier2012 .
+Here are important members of the class that control the algorithm, which you can set after
+constructing the class instance:
+-   member double tau
+    Time step of the numerical scheme.
+-   member double lambda
+    Weight parameter for the data term, attachment parameter. This is the most relevant
+    parameter, which determines the smoothness of the output. The smaller this parameter is,
+    the smoother the solutions we obtain. It depends on the range of motions of the images, so
+    its value should be adapted to each image sequence.
-This class implements the Dense Inverse Search (DIS) optical flow algorithm. More
+-   member double theta
-details about the algorithm can be found at @cite Kroeger2016 . Includes three presets with preselected
+    Weight parameter for (u - v)\^2, tightness parameter. It serves as a link between the
-parameters to provide reasonable trade-off between speed and quality. However, even the slowest preset is
+    attachment and the regularization terms. In theory, it should have a small value in order
-still relatively fast, use DeepFlow if you need better quality and don't care about speed.
+    to maintain both parts in correspondence. The method is stable for a large range of values
+    of this parameter.
-This implementation includes several additional features compared to the algorithm described in the paper,
+-   member int nscales
-including spatial propagation of flow vectors (@ref getUseSpatialPropagation), as well as an option to
+    Number of scales used to create the pyramid of images.
-utilize an initial flow approximation passed to @ref calc (which is, essentially, temporal propagation,
-if the previous frame's flow field is passed).
+-   member int warps
+    Number of warpings per scale. Represents the number of times that I1(x+u0) and grad(
+    I1(x+u0) ) are computed per scale. This is a parameter that assures the stability of the
+    method. It also affects the running time, so it is a compromise between speed and
+    accuracy.
+-   member double epsilon
+    Stopping criterion threshold used in the numerical scheme, which is a trade-off between
+    precision and running time. A small value will yield more accurate solutions at the
+    expense of a slower convergence.
+-   member int iterations
+    Stopping criterion iterations number used in the numerical scheme.
+C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
 */
-class CV_EXPORTS_W DISOpticalFlow : public DenseOpticalFlow
+class CV_EXPORTS_W DualTVL1OpticalFlow : public DenseOpticalFlow
 {
 public:
-    enum
+    //! @brief Time step of the numerical scheme
-    {
+    /** @see setTau */
-        PRESET_ULTRAFAST = 0,
+    CV_WRAP virtual double getTau() const = 0;
-        PRESET_FAST = 1,
+    /** @copybrief getTau @see getTau */
-        PRESET_MEDIUM = 2
+    CV_WRAP virtual void setTau(double val) = 0;
-    };
+    //! @brief Weight parameter for the data term, attachment parameter
+    /** @see setLambda */
-    /** @brief Finest level of the Gaussian pyramid on which the flow is computed (zero level
+    CV_WRAP virtual double getLambda() const = 0;
-        corresponds to the original image resolution). The final flow is obtained by bilinear upscaling.
+    /** @copybrief getLambda @see getLambda */
-        @see setFinestScale */
+    CV_WRAP virtual void setLambda(double val) = 0;
-    CV_WRAP virtual int getFinestScale() const = 0;
+    //! @brief Weight parameter for (u - v)^2, tightness parameter
-    /** @copybrief getFinestScale @see getFinestScale */
+    /** @see setTheta */
-    CV_WRAP virtual void setFinestScale(int val) = 0;
+    CV_WRAP virtual double getTheta() const = 0;
+    /** @copybrief getTheta @see getTheta */
-    /** @brief Size of an image patch for matching (in pixels). Normally, default 8x8 patches work well
+    CV_WRAP virtual void setTheta(double val) = 0;
-        enough in most cases.
+    //! @brief coefficient for additional illumination variation term
-        @see setPatchSize */
+    /** @see setGamma */
-    CV_WRAP virtual int getPatchSize() const = 0;
+    CV_WRAP virtual double getGamma() const = 0;
-    /** @copybrief getPatchSize @see getPatchSize */
+    /** @copybrief getGamma @see getGamma */
-    CV_WRAP virtual void setPatchSize(int val) = 0;
+    CV_WRAP virtual void setGamma(double val) = 0;
+    //! @brief Number of scales used to create the pyramid of images
-    /** @brief Stride between neighbor patches. Must be less than patch size. Lower values correspond
+    /** @see setScalesNumber */
-        to higher flow quality.
+    CV_WRAP virtual int getScalesNumber() const = 0;
-        @see setPatchStride */
+    /** @copybrief getScalesNumber @see getScalesNumber */
-    CV_WRAP virtual int getPatchStride() const = 0;
+    CV_WRAP virtual void setScalesNumber(int val) = 0;
-    /** @copybrief getPatchStride @see getPatchStride */
+    //! @brief Number of warpings per scale
-    CV_WRAP virtual void setPatchStride(int val) = 0;
+    /** @see setWarpingsNumber */
+    CV_WRAP virtual int getWarpingsNumber() const = 0;
-    /** @brief Maximum number of gradient descent iterations in the patch inverse search stage. Higher values
+    /** @copybrief getWarpingsNumber @see getWarpingsNumber */
-        may improve quality in some cases.
+    CV_WRAP virtual void setWarpingsNumber(int val) = 0;
-        @see setGradientDescentIterations */
+    //! @brief Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time
-    CV_WRAP virtual int getGradientDescentIterations() const = 0;
+    /** @see setEpsilon */
-    /** @copybrief getGradientDescentIterations @see getGradientDescentIterations */
+    CV_WRAP virtual double getEpsilon() const = 0;
-    CV_WRAP virtual void setGradientDescentIterations(int val) = 0;
+    /** @copybrief getEpsilon @see getEpsilon */
+    CV_WRAP virtual void setEpsilon(double val) = 0;
-    /** @brief Number of fixed point iterations of variational refinement per scale. Set to zero to
+    //! @brief Inner iterations (between outlier filtering) used in the numerical scheme
-        disable variational refinement completely. Higher values will typically result in more smooth and
+    /** @see setInnerIterations */
-        high-quality flow.
+    CV_WRAP virtual int getInnerIterations() const = 0;
-    @see setGradientDescentIterations */
+    /** @copybrief getInnerIterations @see getInnerIterations */
-    CV_WRAP virtual int getVariationalRefinementIterations() const = 0;
+    CV_WRAP virtual void setInnerIterations(int val) = 0;
-    /** @copybrief getGradientDescentIterations @see getGradientDescentIterations */
+    //! @brief Outer iterations (number of inner loops) used in the numerical scheme
-    CV_WRAP virtual void setVariationalRefinementIterations(int val) = 0;
+    /** @see setOuterIterations */
+    CV_WRAP virtual int getOuterIterations() const = 0;
-    /** @brief Weight of the smoothness term
+    /** @copybrief getOuterIterations @see getOuterIterations */
-    @see setVariationalRefinementAlpha */
+    CV_WRAP virtual void setOuterIterations(int val) = 0;
-    CV_WRAP virtual float getVariationalRefinementAlpha() const = 0;
+    //! @brief Use initial flow
-    /** @copybrief getVariationalRefinementAlpha @see getVariationalRefinementAlpha */
+    /** @see setUseInitialFlow */
-    CV_WRAP virtual void setVariationalRefinementAlpha(float val) = 0;
+    CV_WRAP virtual bool getUseInitialFlow() const = 0;
+    /** @copybrief getUseInitialFlow @see getUseInitialFlow */
-    /** @brief Weight of the color constancy term
+    CV_WRAP virtual void setUseInitialFlow(bool val) = 0;
-    @see setVariationalRefinementDelta */
+    //! @brief Step between scales (<1)
-    CV_WRAP virtual float getVariationalRefinementDelta() const = 0;
+    /** @see setScaleStep */
-    /** @copybrief getVariationalRefinementDelta @see getVariationalRefinementDelta */
+    CV_WRAP virtual double getScaleStep() const = 0;
-    CV_WRAP virtual void setVariationalRefinementDelta(float val) = 0;
+    /** @copybrief getScaleStep @see getScaleStep */
+    CV_WRAP virtual void setScaleStep(double val) = 0;
-    /** @brief Weight of the gradient constancy term
+    //! @brief Median filter kernel size (1 = no filter) (3 or 5)
-    @see setVariationalRefinementGamma */
+    /** @see setMedianFiltering */
-    CV_WRAP virtual float getVariationalRefinementGamma() const = 0;
+    CV_WRAP virtual int getMedianFiltering() const = 0;
-    /** @copybrief getVariationalRefinementGamma @see getVariationalRefinementGamma */
+    /** @copybrief getMedianFiltering @see getMedianFiltering */
-    CV_WRAP virtual void setVariationalRefinementGamma(float val) = 0;
+    CV_WRAP virtual void setMedianFiltering(int val) = 0;
+    /** @brief Creates instance of cv::DualTVL1OpticalFlow*/
-    /** @brief Whether to use mean-normalization of patches when computing patch distance. It is turned on
+    CV_WRAP static Ptr<DualTVL1OpticalFlow> create(
-        by default as it typically provides a noticeable quality boost because of increased robustness to
+                                            double tau = 0.25,
-        illumination variations. Turn it off if you are certain that your sequence doesn't contain any changes
+                                            double lambda = 0.15,
-        in illumination.
+                                            double theta = 0.3,
-    @see setUseMeanNormalization */
+                                            int nscales = 5,
-    CV_WRAP virtual bool getUseMeanNormalization() const = 0;
+                                            int warps = 5,
-    /** @copybrief getUseMeanNormalization @see getUseMeanNormalization */
+                                            double epsilon = 0.01,
-    CV_WRAP virtual void setUseMeanNormalization(bool val) = 0;
+                                            int innnerIterations = 30,
+                                            int outerIterations = 10,
-    /** @brief Whether to use spatial propagation of good optical flow vectors. This option is turned on by
+                                            double scaleStep = 0.8,
-        default, as it tends to work better on average and can sometimes help recover from major errors
+                                            double gamma = 0.0,
-        introduced by the coarse-to-fine scheme employed by the DIS optical flow algorithm. Turning this
+                                            int medianFiltering = 5,
-        option off can make the output flow field a bit smoother, however.
+                                            bool useInitialFlow = false);
-    @see setUseSpatialPropagation */
-    CV_WRAP virtual bool getUseSpatialPropagation() const = 0;
-    /** @copybrief getUseSpatialPropagation @see getUseSpatialPropagation */
-    CV_WRAP virtual void setUseSpatialPropagation(bool val) = 0;
 };
-/** @brief Creates an instance of DISOpticalFlow
+/** @brief Creates instance of cv::DenseOpticalFlow
-@param preset one of PRESET_ULTRAFAST, PRESET_FAST and PRESET_MEDIUM
 */
-CV_EXPORTS_W Ptr<DISOpticalFlow> createOptFlow_DIS(int preset = DISOpticalFlow::PRESET_FAST);
+CV_EXPORTS_W Ptr<DualTVL1OpticalFlow> createOptFlow_DualTVL1();
 //! @}

--- a/modules/optflow/perf/opencl/perf_dis_optflow.cpp
+++ b/modules/optflow/perf/opencl/perf_dis_optflow.cpp
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "../perf_precomp.hpp"
-#include "opencv2/ts/ocl_perf.hpp"
-namespace opencv_test { namespace {
-#ifdef HAVE_OPENCL
-void MakeArtificialExample(UMat &dst_frame1, UMat &dst_frame2);
-typedef tuple<String, Size> DISParams;
-typedef TestBaseWithParam<DISParams> DenseOpticalFlow_DIS;
-OCL_PERF_TEST_P(DenseOpticalFlow_DIS, perf,
-                Combine(Values("PRESET_ULTRAFAST", "PRESET_FAST", "PRESET_MEDIUM"), Values(szVGA, sz720p, sz1080p)))
-{
-    DISParams params = GetParam();
-    // use strings to print preset names in the perf test results:
-    String preset_string = get<0>(params);
-    int preset = DISOpticalFlow::PRESET_FAST;
-    if (preset_string == "PRESET_ULTRAFAST")
-        preset = DISOpticalFlow::PRESET_ULTRAFAST;
-    else if (preset_string == "PRESET_FAST")
-        preset = DISOpticalFlow::PRESET_FAST;
-    else if (preset_string == "PRESET_MEDIUM")
-        preset = DISOpticalFlow::PRESET_MEDIUM;
-    Size sz = get<1>(params);
-    UMat frame1(sz, CV_8U);
-    UMat frame2(sz, CV_8U);
-    UMat flow;
-    MakeArtificialExample(frame1, frame2);
-    Ptr<DenseOpticalFlow> algo = createOptFlow_DIS(preset);
-    OCL_TEST_CYCLE_N(10)
-    {
-        algo->calc(frame1, frame2, flow);
-    }
-    SANITY_CHECK_NOTHING();
-}
-void MakeArtificialExample(UMat &dst_frame1, UMat &dst_frame2)
-{
-    int src_scale = 2;
-    int OF_scale = 6;
-    double sigma = dst_frame1.cols / 300;
-    UMat tmp(Size(dst_frame1.cols / (1 << src_scale), dst_frame1.rows / (1 << src_scale)), CV_8U);
-    randu(tmp, 0, 255);
-    resize(tmp, dst_frame1, dst_frame1.size(), 0.0, 0.0, INTER_LINEAR_EXACT);
-    resize(tmp, dst_frame2, dst_frame2.size(), 0.0, 0.0, INTER_LINEAR_EXACT);
-    Mat displacement_field(Size(dst_frame1.cols / (1 << OF_scale), dst_frame1.rows / (1 << OF_scale)),
-                           CV_32FC2);
-    randn(displacement_field, 0.0, sigma);
-    resize(displacement_field, displacement_field, dst_frame2.size(), 0.0, 0.0, INTER_CUBIC);
-    for (int i = 0; i < displacement_field.rows; i++)
-        for (int j = 0; j < displacement_field.cols; j++)
-            displacement_field.at<Vec2f>(i, j) += Vec2f((float)j, (float)i);
-    remap(dst_frame2, dst_frame2, displacement_field, Mat(), INTER_LINEAR, BORDER_REPLICATE);
-}
-#endif // HAVE_OPENCL
-}} // namespace
--- a/modules/optflow/perf/opencl/perf_optflow_dualTVL1.cpp
+++ b/modules/optflow/perf/opencl/perf_optflow_dualTVL1.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "../perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+#ifdef HAVE_OPENCL
+namespace opencv_test {
+namespace ocl {
+///////////// OpticalFlow Dual TVL1 ////////////////////////
+typedef tuple< tuple<int, double>, bool> OpticalFlowDualTVL1Params;
+typedef TestBaseWithParam<OpticalFlowDualTVL1Params> OpticalFlowDualTVL1Fixture;
+OCL_PERF_TEST_P(OpticalFlowDualTVL1Fixture, OpticalFlowDualTVL1,
+            ::testing::Combine(
+                        ::testing::Values(make_tuple<int, double>(-1, 0.3),
+                                          make_tuple<int, double>(3, 0.5)),
+                        ::testing::Bool()
+                                )
+            )
+    {
+        Mat frame0 = imread(getDataPath("cv/optflow/RubberWhale1.png"), IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame0.empty()) << "can't load RubberWhale1.png";
+        Mat frame1 = imread(getDataPath("cv/optflow/RubberWhale2.png"), IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame1.empty()) << "can't load RubberWhale2.png";
+        const Size srcSize = frame0.size();
+        const OpticalFlowDualTVL1Params params = GetParam();
+        const tuple<int, double> filteringScale = get<0>(params);
+            const int medianFiltering = get<0>(filteringScale);
+            const double scaleStep = get<1>(filteringScale);
+        const bool useInitFlow = get<1>(params);
+        double eps = 0.9;
+        UMat uFrame0; frame0.copyTo(uFrame0);
+        UMat uFrame1; frame1.copyTo(uFrame1);
+        UMat uFlow(srcSize, CV_32FC2);
+        declare.in(uFrame0, uFrame1, WARMUP_READ).out(uFlow, WARMUP_READ);
+        //create algorithm
+        Ptr<DualTVL1OpticalFlow> alg = createOptFlow_DualTVL1();
+        //set parameters
+        alg->setScaleStep(scaleStep);
+        alg->setMedianFiltering(medianFiltering);
+        if (useInitFlow)
+        {
+            //calculate initial flow as result of optical flow
+            alg->calc(uFrame0, uFrame1, uFlow);
+        }
+        //set flag to use initial flow
+        alg->setUseInitialFlow(useInitFlow);
+        OCL_TEST_CYCLE()
+            alg->calc(uFrame0, uFrame1, uFlow);
+        SANITY_CHECK(uFlow, eps, ERROR_RELATIVE);
+    }
+}
+} // namespace opencv_test::ocl
+#endif // HAVE_OPENCL
--- a/modules/optflow/perf/perf_disflow.cpp
+++ b/modules/optflow/perf/perf_disflow.cpp
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "perf_precomp.hpp"
-namespace opencv_test { namespace {
-void MakeArtificialExample(Mat &dst_frame1, Mat &dst_frame2);
-typedef tuple<String, Size> DISParams;
-typedef TestBaseWithParam<DISParams> DenseOpticalFlow_DIS;
-PERF_TEST_P(DenseOpticalFlow_DIS, perf,
-            Combine(Values("PRESET_ULTRAFAST", "PRESET_FAST", "PRESET_MEDIUM"), Values(szVGA, sz720p, sz1080p)))
-{
-    DISParams params = GetParam();
-    // use strings to print preset names in the perf test results:
-    String preset_string = get<0>(params);
-    int preset = DISOpticalFlow::PRESET_FAST;
-    if (preset_string == "PRESET_ULTRAFAST")
-        preset = DISOpticalFlow::PRESET_ULTRAFAST;
-    else if (preset_string == "PRESET_FAST")
-        preset = DISOpticalFlow::PRESET_FAST;
-    else if (preset_string == "PRESET_MEDIUM")
-        preset = DISOpticalFlow::PRESET_MEDIUM;
-    Size sz = get<1>(params);
-    Mat frame1(sz, CV_8U);
-    Mat frame2(sz, CV_8U);
-    Mat flow;
-    MakeArtificialExample(frame1, frame2);
-    TEST_CYCLE_N(10)
-    {
-        Ptr<DenseOpticalFlow> algo = createOptFlow_DIS(preset);
-        algo->calc(frame1, frame2, flow);
-    }
-    SANITY_CHECK_NOTHING();
-}
-void MakeArtificialExample(Mat &dst_frame1, Mat &dst_frame2)
-{
-    int src_scale = 2;
-    int OF_scale = 6;
-    double sigma = dst_frame1.cols / 300;
-    Mat tmp(Size(dst_frame1.cols / (1 << src_scale), dst_frame1.rows / (1 << src_scale)), CV_8U);
-    randu(tmp, 0, 255);
-    resize(tmp, dst_frame1, dst_frame1.size(), 0.0, 0.0, INTER_LINEAR_EXACT);
-    resize(tmp, dst_frame2, dst_frame2.size(), 0.0, 0.0, INTER_LINEAR_EXACT);
-    Mat displacement_field(Size(dst_frame1.cols / (1 << OF_scale), dst_frame1.rows / (1 << OF_scale)),
-                           CV_32FC2);
-    randn(displacement_field, 0.0, sigma);
-    resize(displacement_field, displacement_field, dst_frame2.size(), 0.0, 0.0, INTER_CUBIC);
-    for (int i = 0; i < displacement_field.rows; i++)
-        for (int j = 0; j < displacement_field.cols; j++)
-            displacement_field.at<Vec2f>(i, j) += Vec2f((float)j, (float)i);
-    remap(dst_frame2, dst_frame2, displacement_field, Mat(), INTER_LINEAR, BORDER_REPLICATE);
-}
-}} // namespace
--- a/modules/optflow/perf/perf_tvl1optflow.cpp
+++ b/modules/optflow/perf/perf_tvl1optflow.cpp
+#include "perf_precomp.hpp"
+namespace opencv_test { namespace {
+using namespace perf;
+typedef TestBaseWithParam< std::pair<string, string> > ImagePair;
+std::pair<string, string> impair(const char* im1, const char* im2)
+{
+    return std::make_pair(string(im1), string(im2));
+}
+PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1, testing::Values(impair("cv/optflow/RubberWhale1.png", "cv/optflow/RubberWhale2.png")))
+{
+    declare.time(260);
+    Mat frame1 = imread(getDataPath(GetParam().first), IMREAD_GRAYSCALE);
+    Mat frame2 = imread(getDataPath(GetParam().second), IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+    ASSERT_FALSE(frame2.empty());
+    Mat flow;
+    Ptr<DenseOpticalFlow> tvl1 = createOptFlow_DualTVL1();
+    TEST_CYCLE() tvl1->calc(frame1, frame2, flow);
+    SANITY_CHECK_NOTHING();
+}
+}} // namespace
--- a/modules/optflow/perf/perf_variational_refinement.cpp
+++ b/modules/optflow/perf/perf_variational_refinement.cpp
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "perf_precomp.hpp"
-namespace opencv_test { namespace {
-typedef tuple<Size, int, int> VarRefParams;
-typedef TestBaseWithParam<VarRefParams> DenseOpticalFlow_VariationalRefinement;
-PERF_TEST_P(DenseOpticalFlow_VariationalRefinement, perf, Combine(Values(szQVGA, szVGA), Values(5, 10), Values(5, 10)))
-{
-    VarRefParams params = GetParam();
-    Size sz = get<0>(params);
-    int sorIter = get<1>(params);
-    int fixedPointIter = get<2>(params);
-    Mat frame1(sz, CV_8U);
-    Mat frame2(sz, CV_8U);
-    Mat flow(sz, CV_32FC2);
-    randu(frame1, 0, 255);
-    randu(frame2, 0, 255);
-    flow.setTo(0.0f);
-    TEST_CYCLE_N(10)
-    {
-        Ptr<VariationalRefinement> var = createVariationalFlowRefinement();
-        var->setAlpha(20.0f);
-        var->setGamma(10.0f);
-        var->setDelta(5.0f);
-        var->setSorIterations(sorIter);
-        var->setFixedPointIterations(fixedPointIter);
-        var->calc(frame1, frame2, flow);
-    }
-    SANITY_CHECK_NOTHING();
-}
-}} // namespace
--- a/modules/optflow/samples/dis_opticalflow.cpp
+++ b/modules/optflow/samples/dis_opticalflow.cpp
-#include "opencv2/core/utility.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/optflow.hpp"
-using namespace std;
-using namespace cv;
-using namespace optflow;
-static void help()
-{
-    printf("Usage: dis_optflow <video_file>\n");
-}
-int main(int argc, char **argv)
-{
-    VideoCapture cap;
-    if (argc < 2)
-    {
-        help();
-        exit(1);
-    }
-    cap.open(argv[1]);
-    if(!cap.isOpened())
-    {
-        printf("ERROR: Cannot open file %s\n", argv[1]);
-        return -1;
-    }
-    Mat prevgray, gray, rgb, frame;
-    Mat flow, flow_uv[2];
-    Mat mag, ang;
-    Mat hsv_split[3], hsv;
-    char ret;
-    namedWindow("flow", 1);
-    namedWindow("orig", 1);
-    Ptr<DenseOpticalFlow> algorithm = createOptFlow_DIS(DISOpticalFlow::PRESET_MEDIUM);
-    while(true)
-    {
-        cap >> frame;
-        if (frame.empty())
-            break;
-        cvtColor(frame, gray, COLOR_BGR2GRAY);
-        if (!prevgray.empty())
-        {
-            algorithm->calc(prevgray, gray, flow);
-            split(flow, flow_uv);
-            multiply(flow_uv[1], -1, flow_uv[1]);
-            cartToPolar(flow_uv[0], flow_uv[1], mag, ang, true);
-            normalize(mag, mag, 0, 1, NORM_MINMAX);
-            hsv_split[0] = ang;
-            hsv_split[1] = mag;
-            hsv_split[2] = Mat::ones(ang.size(), ang.type());
-            merge(hsv_split, 3, hsv);
-            cvtColor(hsv, rgb, COLOR_HSV2BGR);
-            imshow("flow", rgb);
-            imshow("orig", frame);
-        }
-        if ((ret = (char)waitKey(20)) > 0)
-            break;
-        std::swap(prevgray, gray);
-    }
-    return 0;
-}
--- a/modules/optflow/samples/gpc_evaluate.cpp
+++ b/modules/optflow/samples/gpc_evaluate.cpp
@@ -101,7 +101,7 @@ int main( int argc, const char **argv )
  Mat from = imread( fromPath );
  Mat to = imread( toPath );
-  Mat gt = optflow::readOpticalFlow( gtPath );
+  Mat gt = readOpticalFlow( gtPath );
  std::vector< std::pair< Point2i, Point2i > > corr;
  TickMeter meter;

--- a/modules/optflow/samples/optical_flow_evaluation.cpp
+++ b/modules/optflow/samples/optical_flow_evaluation.cpp
@@ -269,11 +269,11 @@ int main( int argc, char** argv )
            algorithm = createOptFlow_PCAFlow();
    }
    else if ( method == "DISflow_ultrafast" )
-        algorithm = createOptFlow_DIS(DISOpticalFlow::PRESET_ULTRAFAST);
+        algorithm = DISOpticalFlow::create(DISOpticalFlow::PRESET_ULTRAFAST);
    else if (method == "DISflow_fast")
-        algorithm = createOptFlow_DIS(DISOpticalFlow::PRESET_FAST);
+        algorithm = DISOpticalFlow::create(DISOpticalFlow::PRESET_FAST);
    else if (method == "DISflow_medium")
-        algorithm = createOptFlow_DIS(DISOpticalFlow::PRESET_MEDIUM);
+        algorithm = DISOpticalFlow::create(DISOpticalFlow::PRESET_MEDIUM);
    else
    {
        printf("Wrong method!\n");
@@ -300,7 +300,7 @@ int main( int argc, char** argv )
    if ( !groundtruth_path.empty() )
    { // compare to ground truth
-        ground_truth = optflow::readOpticalFlow(groundtruth_path);
+        ground_truth = readOpticalFlow(groundtruth_path);
        if ( flow.size() != ground_truth.size() || flow.channels() != 2
                || ground_truth.channels() != 2 )
        {

--- a/modules/optflow/samples/pcaflow_demo.cpp
+++ b/modules/optflow/samples/pcaflow_demo.cpp
@@ -135,7 +135,7 @@ int main( int argc, const char **argv )
  Mat i1 = imread( img1 );
  Mat i2 = imread( img2 );
-  Mat gt = optflow::readOpticalFlow( groundtruth );
+  Mat gt = readOpticalFlow( groundtruth );
  Mat i1g, i2g;
  cvtColor( i1, i1g, COLOR_BGR2GRAY );

--- a/modules/optflow/samples/tvl1_optical_flow.cpp
+++ b/modules/optflow/samples/tvl1_optical_flow.cpp
+#include <iostream>
+#include <fstream>
+#include <opencv2/core/utility.hpp>
+#include "opencv2/video.hpp"
+#include "opencv2/optflow.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/highgui.hpp"
+using namespace cv;
+using namespace std;
+using namespace optflow;
+inline bool isFlowCorrect(Point2f u)
+{
+    return !cvIsNaN(u.x) && !cvIsNaN(u.y) && fabs(u.x) < 1e9 && fabs(u.y) < 1e9;
+}
+static Vec3b computeColor(float fx, float fy)
+{
+    static bool first = true;
+    // relative lengths of color transitions:
+    // these are chosen based on perceptual similarity
+    // (e.g. one can distinguish more shades between red and yellow
+    //  than between yellow and green)
+    const int RY = 15;
+    const int YG = 6;
+    const int GC = 4;
+    const int CB = 11;
+    const int BM = 13;
+    const int MR = 6;
+    const int NCOLS = RY + YG + GC + CB + BM + MR;
+    static Vec3i colorWheel[NCOLS];
+    if (first)
+    {
+        int k = 0;
+        for (int i = 0; i < RY; ++i, ++k)
+            colorWheel[k] = Vec3i(255, 255 * i / RY, 0);
+        for (int i = 0; i < YG; ++i, ++k)
+            colorWheel[k] = Vec3i(255 - 255 * i / YG, 255, 0);
+        for (int i = 0; i < GC; ++i, ++k)
+            colorWheel[k] = Vec3i(0, 255, 255 * i / GC);
+        for (int i = 0; i < CB; ++i, ++k)
+            colorWheel[k] = Vec3i(0, 255 - 255 * i / CB, 255);
+        for (int i = 0; i < BM; ++i, ++k)
+            colorWheel[k] = Vec3i(255 * i / BM, 0, 255);
+        for (int i = 0; i < MR; ++i, ++k)
+            colorWheel[k] = Vec3i(255, 0, 255 - 255 * i / MR);
+        first = false;
+    }
+    const float rad = sqrt(fx * fx + fy * fy);
+    const float a = atan2(-fy, -fx) / (float)CV_PI;
+    const float fk = (a + 1.0f) / 2.0f * (NCOLS - 1);
+    const int k0 = static_cast<int>(fk);
+    const int k1 = (k0 + 1) % NCOLS;
+    const float f = fk - k0;
+    Vec3b pix;
+    for (int b = 0; b < 3; b++)
+    {
+        const float col0 = colorWheel[k0][b] / 255.f;
+        const float col1 = colorWheel[k1][b] / 255.f;
+        float col = (1 - f) * col0 + f * col1;
+        if (rad <= 1)
+            col = 1 - rad * (1 - col); // increase saturation with radius
+        else
+            col *= .75; // out of range
+        pix[2 - b] = static_cast<uchar>(255.f * col);
+    }
+    return pix;
+}
+static void drawOpticalFlow(const Mat_<Point2f>& flow, Mat& dst, float maxmotion = -1)
+{
+    dst.create(flow.size(), CV_8UC3);
+    dst.setTo(Scalar::all(0));
+    // determine motion range:
+    float maxrad = maxmotion;
+    if (maxmotion <= 0)
+    {
+        maxrad = 1;
+        for (int y = 0; y < flow.rows; ++y)
+        {
+            for (int x = 0; x < flow.cols; ++x)
+            {
+                Point2f u = flow(y, x);
+                if (!isFlowCorrect(u))
+                    continue;
+                maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
+            }
+        }
+    }
+    for (int y = 0; y < flow.rows; ++y)
+    {
+        for (int x = 0; x < flow.cols; ++x)
+        {
+            Point2f u = flow(y, x);
+            if (isFlowCorrect(u))
+                dst.at<Vec3b>(y, x) = computeColor(u.x / maxrad, u.y / maxrad);
+        }
+    }
+}
+// binary file format for flow data specified here:
+// http://vision.middlebury.edu/flow/data/
+static void writeOpticalFlowToFile(const Mat_<Point2f>& flow, const string& fileName)
+{
+    static const char FLO_TAG_STRING[] = "PIEH";
+    ofstream file(fileName.c_str(), ios_base::binary);
+    file << FLO_TAG_STRING;
+    file.write((const char*) &flow.cols, sizeof(int));
+    file.write((const char*) &flow.rows, sizeof(int));
+    for (int i = 0; i < flow.rows; ++i)
+    {
+        for (int j = 0; j < flow.cols; ++j)
+        {
+            const Point2f u = flow(i, j);
+            file.write((const char*) &u.x, sizeof(float));
+            file.write((const char*) &u.y, sizeof(float));
+        }
+    }
+}
+int main(int argc, const char* argv[])
+{
+    cv::CommandLineParser parser(argc, argv, "{help h || show help message}"
+            "{ @frame0 | | frame 0}{ @frame1 | | frame 1}{ @output | | output flow}");
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+    string frame0_name = parser.get<string>("@frame0");
+    string frame1_name = parser.get<string>("@frame1");
+    string file = parser.get<string>("@output");
+    if (frame0_name.empty() || frame1_name.empty() || file.empty())
+    {
+        cerr << "Usage : " << argv[0] << " [<frame0>] [<frame1>] [<output_flow>]" << endl;
+        return -1;
+    }
+    Mat frame0 = imread(frame0_name, IMREAD_GRAYSCALE);
+    Mat frame1 = imread(frame1_name, IMREAD_GRAYSCALE);
+    if (frame0.empty())
+    {
+        cerr << "Can't open image ["  << parser.get<string>("frame0") << "]" << endl;
+        return -1;
+    }
+    if (frame1.empty())
+    {
+        cerr << "Can't open image ["  << parser.get<string>("frame1") << "]" << endl;
+        return -1;
+    }
+    if (frame1.size() != frame0.size())
+    {
+        cerr << "Images should be of equal sizes" << endl;
+        return -1;
+    }
+    Mat_<Point2f> flow;
+    Ptr<DualTVL1OpticalFlow> tvl1 = DualTVL1OpticalFlow::create();
+    const double start = (double)getTickCount();
+    tvl1->calc(frame0, frame1, flow);
+    const double timeSec = (getTickCount() - start) / getTickFrequency();
+    cout << "calcOpticalFlowDual_TVL1 : " << timeSec << " sec" << endl;
+    Mat out;
+    drawOpticalFlow(flow, out);
+    if (!file.empty())
+        writeOpticalFlowToFile(flow, file);
+    imshow("Flow", out);
+    waitKey();
+    return 0;
+}
--- a/modules/optflow/src/deepflow.cpp
+++ b/modules/optflow/src/deepflow.cpp
@@ -147,7 +147,7 @@ void OpticalFlowDeepFlow::calc( InputArray _I0, InputArray _I1, InputOutputArray
    for ( int level = levelCount - 1; level >= 0; --level )
    { //iterate through  all levels, beginning with the most coarse
-        Ptr<VariationalRefinement> var = createVariationalFlowRefinement();
+        Ptr<VariationalRefinement> var = VariationalRefinement::create();
        var->setAlpha(4 * alpha);
        var->setDelta(delta / 3);

--- a/modules/optflow/src/dis_flow.cpp
+++ b/modules/optflow/src/dis_flow.cpp
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-#include "opencl_kernels_optflow.hpp"
-using namespace std;
-#define EPS 0.001F
-#define INF 1E+10F
-namespace cv
-{
-namespace optflow
-{
-class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
-{
-  public:
-    DISOpticalFlowImpl();
-    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
-    void collectGarbage() CV_OVERRIDE;
-  protected: //!< algorithm parameters
-    int finest_scale, coarsest_scale;
-    int patch_size;
-    int patch_stride;
-    int grad_descent_iter;
-    int variational_refinement_iter;
-    float variational_refinement_alpha;
-    float variational_refinement_gamma;
-    float variational_refinement_delta;
-    bool use_mean_normalization;
-    bool use_spatial_propagation;
-  protected: //!< some auxiliary variables
-    int border_size;
-    int w, h;   //!< flow buffer width and height on the current scale
-    int ws, hs; //!< sparse flow buffer width and height on the current scale
-  public:
-    int getFinestScale() const CV_OVERRIDE { return finest_scale; }
-    void setFinestScale(int val) CV_OVERRIDE { finest_scale = val; }
-    int getPatchSize() const CV_OVERRIDE { return patch_size; }
-    void setPatchSize(int val) CV_OVERRIDE { patch_size = val; }
-    int getPatchStride() const CV_OVERRIDE { return patch_stride; }
-    void setPatchStride(int val) CV_OVERRIDE { patch_stride = val; }
-    int getGradientDescentIterations() const CV_OVERRIDE { return grad_descent_iter; }
-    void setGradientDescentIterations(int val) CV_OVERRIDE { grad_descent_iter = val; }
-    int getVariationalRefinementIterations() const CV_OVERRIDE { return variational_refinement_iter; }
-    void setVariationalRefinementIterations(int val) CV_OVERRIDE { variational_refinement_iter = val; }
-    float getVariationalRefinementAlpha() const CV_OVERRIDE { return variational_refinement_alpha; }
-    void setVariationalRefinementAlpha(float val) CV_OVERRIDE { variational_refinement_alpha = val; }
-    float getVariationalRefinementDelta() const CV_OVERRIDE { return variational_refinement_delta; }
-    void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; }
-    float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; }
-    void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; }
-    bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; }
-    void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; }
-    bool getUseSpatialPropagation() const CV_OVERRIDE { return use_spatial_propagation; }
-    void setUseSpatialPropagation(bool val) CV_OVERRIDE { use_spatial_propagation = val; }
-  protected:                      //!< internal buffers
-    vector<Mat_<uchar> > I0s;     //!< Gaussian pyramid for the current frame
-    vector<Mat_<uchar> > I1s;     //!< Gaussian pyramid for the next frame
-    vector<Mat_<uchar> > I1s_ext; //!< I1s with borders
-    vector<Mat_<short> > I0xs; //!< Gaussian pyramid for the x gradient of the current frame
-    vector<Mat_<short> > I0ys; //!< Gaussian pyramid for the y gradient of the current frame
-    vector<Mat_<float> > Ux; //!< x component of the flow vectors
-    vector<Mat_<float> > Uy; //!< y component of the flow vectors
-    vector<Mat_<float> > initial_Ux; //!< x component of the initial flow field, if one was passed as an input
-    vector<Mat_<float> > initial_Uy; //!< y component of the initial flow field, if one was passed as an input
-    Mat_<Vec2f> U; //!< a buffer for the merged flow
-    Mat_<float> Sx; //!< intermediate sparse flow representation (x component)
-    Mat_<float> Sy; //!< intermediate sparse flow representation (y component)
-    /* Structure tensor components: */
-    Mat_<float> I0xx_buf; //!< sum of squares of x gradient values
-    Mat_<float> I0yy_buf; //!< sum of squares of y gradient values
-    Mat_<float> I0xy_buf; //!< sum of x and y gradient products
-    /* Extra buffers that are useful if patch mean-normalization is used: */
-    Mat_<float> I0x_buf; //!< sum of x gradient values
-    Mat_<float> I0y_buf; //!< sum of y gradient values
-    /* Auxiliary buffers used in structure tensor computation: */
-    Mat_<float> I0xx_buf_aux;
-    Mat_<float> I0yy_buf_aux;
-    Mat_<float> I0xy_buf_aux;
-    Mat_<float> I0x_buf_aux;
-    Mat_<float> I0y_buf_aux;
-    vector<Ptr<VariationalRefinement> > variational_refinement_processors;
-  private: //!< private methods and parallel sections
-    void prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow);
-    void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, Mat &dst_I0y, Mat &I0x,
-                                   Mat &I0y);
-    struct PatchInverseSearch_ParBody : public ParallelLoopBody
-    {
-        DISOpticalFlowImpl *dis;
-        int nstripes, stripe_sz;
-        int hs;
-        Mat *Sx, *Sy, *Ux, *Uy, *I0, *I1, *I0x, *I0y;
-        int num_iter, pyr_level;
-        PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _hs, Mat &dst_Sx, Mat &dst_Sy,
-                                   Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1, Mat &_I0x, Mat &_I0y, int _num_iter,
-                                   int _pyr_level);
-        void operator()(const Range &range) const CV_OVERRIDE;
-    };
-    struct Densification_ParBody : public ParallelLoopBody
-    {
-        DISOpticalFlowImpl *dis;
-        int nstripes, stripe_sz;
-        int h;
-        Mat *Ux, *Uy, *Sx, *Sy, *I0, *I1;
-        Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h, Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx,
-                              Mat &src_Sy, Mat &_I0, Mat &_I1);
-        void operator()(const Range &range) const CV_OVERRIDE;
-    };
-#ifdef HAVE_OPENCL
-    vector<UMat> u_I0s;     //!< Gaussian pyramid for the current frame
-    vector<UMat> u_I1s;     //!< Gaussian pyramid for the next frame
-    vector<UMat> u_I1s_ext; //!< I1s with borders
-    vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
-    vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame
-    vector<UMat> u_Ux; //!< x component of the flow vectors
-    vector<UMat> u_Uy; //!< y component of the flow vectors
-    vector<UMat> u_initial_Ux; //!< x component of the initial flow field, if one was passed as an input
-    vector<UMat> u_initial_Uy; //!< y component of the initial flow field, if one was passed as an input
-    UMat u_U; //!< a buffer for the merged flow
-    UMat u_Sx; //!< intermediate sparse flow representation (x component)
-    UMat u_Sy; //!< intermediate sparse flow representation (y component)
-    /* Structure tensor components: */
-    UMat u_I0xx_buf; //!< sum of squares of x gradient values
-    UMat u_I0yy_buf; //!< sum of squares of y gradient values
-    UMat u_I0xy_buf; //!< sum of x and y gradient products
-    /* Extra buffers that are useful if patch mean-normalization is used: */
-    UMat u_I0x_buf; //!< sum of x gradient values
-    UMat u_I0y_buf; //!< sum of y gradient values
-    /* Auxiliary buffers used in structure tensor computation: */
-    UMat u_I0xx_buf_aux;
-    UMat u_I0yy_buf_aux;
-    UMat u_I0xy_buf_aux;
-    UMat u_I0x_buf_aux;
-    UMat u_I0y_buf_aux;
-    bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
-                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
-    void ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool use_flow);
-    bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
-    bool ocl_Densification(UMat &dst_Ux, UMat &dst_Uy, UMat &src_Sx, UMat &src_Sy, UMat &_I0, UMat &_I1);
-    bool ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
-                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
-#endif
-};
-DISOpticalFlowImpl::DISOpticalFlowImpl()
-{
-    finest_scale = 2;
-    patch_size = 8;
-    patch_stride = 4;
-    grad_descent_iter = 16;
-    variational_refinement_iter = 5;
-    variational_refinement_alpha = 20.f;
-    variational_refinement_gamma = 10.f;
-    variational_refinement_delta = 5.f;
-    border_size = 16;
-    use_mean_normalization = true;
-    use_spatial_propagation = true;
-    /* Use separate variational refinement instances for different scales to avoid repeated memory allocation: */
-    int max_possible_scales = 10;
-    for (int i = 0; i < max_possible_scales; i++)
-        variational_refinement_processors.push_back(createVariationalFlowRefinement());
-}
-void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow)
-{
-    I0s.resize(coarsest_scale + 1);
-    I1s.resize(coarsest_scale + 1);
-    I1s_ext.resize(coarsest_scale + 1);
-    I0xs.resize(coarsest_scale + 1);
-    I0ys.resize(coarsest_scale + 1);
-    Ux.resize(coarsest_scale + 1);
-    Uy.resize(coarsest_scale + 1);
-    Mat flow_uv[2];
-    if (use_flow)
-    {
-        split(flow, flow_uv);
-        initial_Ux.resize(coarsest_scale + 1);
-        initial_Uy.resize(coarsest_scale + 1);
-    }
-    int fraction = 1;
-    int cur_rows = 0, cur_cols = 0;
-    for (int i = 0; i <= coarsest_scale; i++)
-    {
-        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
-        if (i == finest_scale)
-        {
-            cur_rows = I0.rows / fraction;
-            cur_cols = I0.cols / fraction;
-            I0s[i].create(cur_rows, cur_cols);
-            resize(I0, I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
-            I1s[i].create(cur_rows, cur_cols);
-            resize(I1, I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
-            /* These buffers are reused in each scale so we initialize them once on the finest scale: */
-            Sx.create(cur_rows / patch_stride, cur_cols / patch_stride);
-            Sy.create(cur_rows / patch_stride, cur_cols / patch_stride);
-            I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
-            I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
-            I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
-            I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
-            I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
-            I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride);
-            I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride);
-            I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride);
-            I0x_buf_aux.create(cur_rows, cur_cols / patch_stride);
-            I0y_buf_aux.create(cur_rows, cur_cols / patch_stride);
-            U.create(cur_rows, cur_cols);
-        }
-        else if (i > finest_scale)
-        {
-            cur_rows = I0s[i - 1].rows / 2;
-            cur_cols = I0s[i - 1].cols / 2;
-            I0s[i].create(cur_rows, cur_cols);
-            resize(I0s[i - 1], I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
-            I1s[i].create(cur_rows, cur_cols);
-            resize(I1s[i - 1], I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
-        }
-        if (i >= finest_scale)
-        {
-            I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size);
-            copyMakeBorder(I1s[i], I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
-            I0xs[i].create(cur_rows, cur_cols);
-            I0ys[i].create(cur_rows, cur_cols);
-            spatialGradient(I0s[i], I0xs[i], I0ys[i]);
-            Ux[i].create(cur_rows, cur_cols);
-            Uy[i].create(cur_rows, cur_cols);
-            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
-            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
-            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
-            variational_refinement_processors[i]->setSorIterations(5);
-            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
-            if (use_flow)
-            {
-                resize(flow_uv[0], initial_Ux[i], Size(cur_cols, cur_rows));
-                initial_Ux[i] /= fraction;
-                resize(flow_uv[1], initial_Uy[i], Size(cur_cols, cur_rows));
-                initial_Uy[i] /= fraction;
-            }
-        }
-        fraction *= 2;
-    }
-}
-/* This function computes the structure tensor elements (local sums of I0x^2, I0x*I0y and I0y^2).
- * A simple box filter is not used instead because we need to compute these sums on a sparse grid
- * and store them densely in the output buffers.
- */
-void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
-                                                   Mat &dst_I0y, Mat &I0x, Mat &I0y)
-{
-    float *I0xx_ptr = dst_I0xx.ptr<float>();
-    float *I0yy_ptr = dst_I0yy.ptr<float>();
-    float *I0xy_ptr = dst_I0xy.ptr<float>();
-    float *I0x_ptr = dst_I0x.ptr<float>();
-    float *I0y_ptr = dst_I0y.ptr<float>();
-    float *I0xx_aux_ptr = I0xx_buf_aux.ptr<float>();
-    float *I0yy_aux_ptr = I0yy_buf_aux.ptr<float>();
-    float *I0xy_aux_ptr = I0xy_buf_aux.ptr<float>();
-    float *I0x_aux_ptr = I0x_buf_aux.ptr<float>();
-    float *I0y_aux_ptr = I0y_buf_aux.ptr<float>();
-    /* Separable box filter: horizontal pass */
-    for (int i = 0; i < h; i++)
-    {
-        float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
-        short *x_row = I0x.ptr<short>(i);
-        short *y_row = I0y.ptr<short>(i);
-        for (int j = 0; j < patch_size; j++)
-        {
-            sum_xx += x_row[j] * x_row[j];
-            sum_yy += y_row[j] * y_row[j];
-            sum_xy += x_row[j] * y_row[j];
-            sum_x += x_row[j];
-            sum_y += y_row[j];
-        }
-        I0xx_aux_ptr[i * ws] = sum_xx;
-        I0yy_aux_ptr[i * ws] = sum_yy;
-        I0xy_aux_ptr[i * ws] = sum_xy;
-        I0x_aux_ptr[i * ws] = sum_x;
-        I0y_aux_ptr[i * ws] = sum_y;
-        int js = 1;
-        for (int j = patch_size; j < w; j++)
-        {
-            sum_xx += (x_row[j] * x_row[j] - x_row[j - patch_size] * x_row[j - patch_size]);
-            sum_yy += (y_row[j] * y_row[j] - y_row[j - patch_size] * y_row[j - patch_size]);
-            sum_xy += (x_row[j] * y_row[j] - x_row[j - patch_size] * y_row[j - patch_size]);
-            sum_x += (x_row[j] - x_row[j - patch_size]);
-            sum_y += (y_row[j] - y_row[j - patch_size]);
-            if ((j - patch_size + 1) % patch_stride == 0)
-            {
-                I0xx_aux_ptr[i * ws + js] = sum_xx;
-                I0yy_aux_ptr[i * ws + js] = sum_yy;
-                I0xy_aux_ptr[i * ws + js] = sum_xy;
-                I0x_aux_ptr[i * ws + js] = sum_x;
-                I0y_aux_ptr[i * ws + js] = sum_y;
-                js++;
-            }
-        }
-    }
-    AutoBuffer<float> sum_xx(ws), sum_yy(ws), sum_xy(ws), sum_x(ws), sum_y(ws);
-    for (int j = 0; j < ws; j++)
-    {
-        sum_xx[j] = 0.0f;
-        sum_yy[j] = 0.0f;
-        sum_xy[j] = 0.0f;
-        sum_x[j] = 0.0f;
-        sum_y[j] = 0.0f;
-    }
-    /* Separable box filter: vertical pass */
-    for (int i = 0; i < patch_size; i++)
-        for (int j = 0; j < ws; j++)
-        {
-            sum_xx[j] += I0xx_aux_ptr[i * ws + j];
-            sum_yy[j] += I0yy_aux_ptr[i * ws + j];
-            sum_xy[j] += I0xy_aux_ptr[i * ws + j];
-            sum_x[j] += I0x_aux_ptr[i * ws + j];
-            sum_y[j] += I0y_aux_ptr[i * ws + j];
-        }
-    for (int j = 0; j < ws; j++)
-    {
-        I0xx_ptr[j] = sum_xx[j];
-        I0yy_ptr[j] = sum_yy[j];
-        I0xy_ptr[j] = sum_xy[j];
-        I0x_ptr[j] = sum_x[j];
-        I0y_ptr[j] = sum_y[j];
-    }
-    int is = 1;
-    for (int i = patch_size; i < h; i++)
-    {
-        for (int j = 0; j < ws; j++)
-        {
-            sum_xx[j] += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
-            sum_yy[j] += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
-            sum_xy[j] += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
-            sum_x[j] += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
-            sum_y[j] += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
-        }
-        if ((i - patch_size + 1) % patch_stride == 0)
-        {
-            for (int j = 0; j < ws; j++)
-            {
-                I0xx_ptr[is * ws + j] = sum_xx[j];
-                I0yy_ptr[is * ws + j] = sum_yy[j];
-                I0xy_ptr[is * ws + j] = sum_xy[j];
-                I0x_ptr[is * ws + j] = sum_x[j];
-                I0y_ptr[is * ws + j] = sum_y[j];
-            }
-            is++;
-        }
-    }
-}
-DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes,
-                                                                           int _hs, Mat &dst_Sx, Mat &dst_Sy,
-                                                                           Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1,
-                                                                           Mat &_I0x, Mat &_I0y, int _num_iter,
-                                                                           int _pyr_level)
-    : dis(&_dis), nstripes(_nstripes), hs(_hs), Sx(&dst_Sx), Sy(&dst_Sy), Ux(&src_Ux), Uy(&src_Uy), I0(&_I0), I1(&_I1),
-      I0x(&_I0x), I0y(&_I0y), num_iter(_num_iter), pyr_level(_pyr_level)
-{
-    stripe_sz = (int)ceil(hs / (double)nstripes);
-}
-/////////////////////////////////////////////* Patch processing functions */////////////////////////////////////////////
-/* Some auxiliary macros */
-#define HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION                                                                         \
-    v_float32x4 w00v = v_setall_f32(w00);                                                                              \
-    v_float32x4 w01v = v_setall_f32(w01);                                                                              \
-    v_float32x4 w10v = v_setall_f32(w10);                                                                              \
-    v_float32x4 w11v = v_setall_f32(w11);                                                                              \
-                                                                                                                       \
-    v_uint8x16 I0_row_16, I1_row_16, I1_row_shifted_16, I1_row_next_16, I1_row_next_shifted_16;                        \
-    v_uint16x8 I0_row_8, I1_row_8, I1_row_shifted_8, I1_row_next_8, I1_row_next_shifted_8, tmp;                        \
-    v_uint32x4 I0_row_4_left, I1_row_4_left, I1_row_shifted_4_left, I1_row_next_4_left, I1_row_next_shifted_4_left;    \
-    v_uint32x4 I0_row_4_right, I1_row_4_right, I1_row_shifted_4_right, I1_row_next_4_right,                            \
-      I1_row_next_shifted_4_right;                                                                                     \
-    v_float32x4 I_diff_left, I_diff_right;                                                                             \
-                                                                                                                       \
-    /* Preload and expand the first row of I1: */                                                                      \
-    I1_row_16 = v_load(I1_ptr);                                                                                        \
-    I1_row_shifted_16 = v_extract<1>(I1_row_16, I1_row_16);                                                            \
-    v_expand(I1_row_16, I1_row_8, tmp);                                                                                \
-    v_expand(I1_row_shifted_16, I1_row_shifted_8, tmp);                                                                \
-    v_expand(I1_row_8, I1_row_4_left, I1_row_4_right);                                                                 \
-    v_expand(I1_row_shifted_8, I1_row_shifted_4_left, I1_row_shifted_4_right);                                         \
-    I1_ptr += I1_stride;
-#define HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION                                                                      \
-    /* Load the next row of I1: */                                                                                     \
-    I1_row_next_16 = v_load(I1_ptr);                                                                                   \
-    /* Circular shift left by 1 element: */                                                                            \
-    I1_row_next_shifted_16 = v_extract<1>(I1_row_next_16, I1_row_next_16);                                             \
-    /* Expand to 8 ushorts (we only need the first 8 values): */                                                       \
-    v_expand(I1_row_next_16, I1_row_next_8, tmp);                                                                      \
-    v_expand(I1_row_next_shifted_16, I1_row_next_shifted_8, tmp);                                                      \
-    /* Separate the left and right halves: */                                                                          \
-    v_expand(I1_row_next_8, I1_row_next_4_left, I1_row_next_4_right);                                                  \
-    v_expand(I1_row_next_shifted_8, I1_row_next_shifted_4_left, I1_row_next_shifted_4_right);                          \
-                                                                                                                       \
-    /* Load current row of I0: */                                                                                      \
-    I0_row_16 = v_load(I0_ptr);                                                                                        \
-    v_expand(I0_row_16, I0_row_8, tmp);                                                                                \
-    v_expand(I0_row_8, I0_row_4_left, I0_row_4_right);                                                                 \
-                                                                                                                       \
-    /* Compute diffs between I0 and bilinearly interpolated I1: */                                                     \
-    I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) +                                              \
-                  w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) +                                      \
-                  w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) +                                         \
-                  w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) -                                 \
-                  v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left));                                                      \
-    I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) +                                            \
-                   w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) +                                    \
-                   w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) +                                       \
-                   w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) -                               \
-                   v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right));
-#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW                                                                     \
-    I0_ptr += I0_stride;                                                                                               \
-    I1_ptr += I1_stride;                                                                                               \
-                                                                                                                       \
-    I1_row_4_left = I1_row_next_4_left;                                                                                \
-    I1_row_4_right = I1_row_next_4_right;                                                                              \
-    I1_row_shifted_4_left = I1_row_next_shifted_4_left;                                                                \
-    I1_row_shifted_4_right = I1_row_next_shifted_4_right;
-/* This function essentially performs one iteration of gradient descent when finding the most similar patch in I1 for a
- * given one in I0. It assumes that I0_ptr and I1_ptr already point to the corresponding patches and w00, w01, w10, w11
- * are precomputed bilinear interpolation weights. It returns the SSD (sum of squared differences) between these patches
- * and computes the values (dst_dUx, dst_dUy) that are used in the flow vector update. HAL acceleration is implemented
- * only for the default patch size (8x8). Everything is processed in floats as using fixed-point approximations harms
- * the quality significantly.
- */
-inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr, short *I0y_ptr,
-                          int I0_stride, int I1_stride, float w00, float w01, float w10, float w11, int patch_sz)
-{
-    float SSD = 0.0f;
-#ifdef CV_SIMD128
-    if (patch_sz == 8)
-    {
-        /* Variables to accumulate the sums */
-        v_float32x4 Ux_vec = v_setall_f32(0);
-        v_float32x4 Uy_vec = v_setall_f32(0);
-        v_float32x4 SSD_vec = v_setall_f32(0);
-        v_int16x8 I0x_row, I0y_row;
-        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
-        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
-        for (int row = 0; row < 8; row++)
-        {
-            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
-            I0x_row = v_load(I0x_ptr);
-            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
-            I0y_row = v_load(I0y_ptr);
-            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
-            /* Update the sums: */
-            Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
-            Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
-            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
-            I0x_ptr += I0_stride;
-            I0y_ptr += I0_stride;
-            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
-        }
-        /* Final reduce operations: */
-        dst_dUx = v_reduce_sum(Ux_vec);
-        dst_dUy = v_reduce_sum(Uy_vec);
-        SSD = v_reduce_sum(SSD_vec);
-    }
-    else
-    {
-#endif
-        dst_dUx = 0.0f;
-        dst_dUy = 0.0f;
-        float diff;
-        for (int i = 0; i < patch_sz; i++)
-            for (int j = 0; j < patch_sz; j++)
-            {
-                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
-                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
-                       I0_ptr[i * I0_stride + j];
-                SSD += diff * diff;
-                dst_dUx += diff * I0x_ptr[i * I0_stride + j];
-                dst_dUy += diff * I0y_ptr[i * I0_stride + j];
-            }
-#ifdef CV_SIMD128
-    }
-#endif
-    return SSD;
-}
-/* Same as processPatch, but with patch mean normalization, which improves robustness under changing
- * lighting conditions
- */
-inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr,
-                                  short *I0y_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
-                                  float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
-{
-    float sum_diff = 0.0, sum_diff_sq = 0.0;
-    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
-    float n = (float)patch_sz * patch_sz;
-#ifdef CV_SIMD128
-    if (patch_sz == 8)
-    {
-        /* Variables to accumulate the sums */
-        v_float32x4 sum_I0x_mul_vec = v_setall_f32(0);
-        v_float32x4 sum_I0y_mul_vec = v_setall_f32(0);
-        v_float32x4 sum_diff_vec = v_setall_f32(0);
-        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
-        v_int16x8 I0x_row, I0y_row;
-        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
-        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
-        for (int row = 0; row < 8; row++)
-        {
-            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
-            I0x_row = v_load(I0x_ptr);
-            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
-            I0y_row = v_load(I0y_ptr);
-            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
-            /* Update the sums: */
-            sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
-            sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
-            sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
-            sum_diff_vec += I_diff_left + I_diff_right;
-            I0x_ptr += I0_stride;
-            I0y_ptr += I0_stride;
-            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
-        }
-        /* Final reduce operations: */
-        sum_I0x_mul = v_reduce_sum(sum_I0x_mul_vec);
-        sum_I0y_mul = v_reduce_sum(sum_I0y_mul_vec);
-        sum_diff = v_reduce_sum(sum_diff_vec);
-        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
-    }
-    else
-    {
-#endif
-        float diff;
-        for (int i = 0; i < patch_sz; i++)
-            for (int j = 0; j < patch_sz; j++)
-            {
-                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
-                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
-                       I0_ptr[i * I0_stride + j];
-                sum_diff += diff;
-                sum_diff_sq += diff * diff;
-                sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
-                sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
-            }
-#ifdef CV_SIMD128
-    }
-#endif
-    dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
-    dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
-    return sum_diff_sq - sum_diff * sum_diff / n;
-}
-/* Similar to processPatch, but compute only the sum of squared differences (SSD) between the patches */
-inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
-                        float w11, int patch_sz)
-{
-    float SSD = 0.0f;
-#ifdef CV_SIMD128
-    if (patch_sz == 8)
-    {
-        v_float32x4 SSD_vec = v_setall_f32(0);
-        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
-        for (int row = 0; row < 8; row++)
-        {
-            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
-            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
-            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
-        }
-        SSD = v_reduce_sum(SSD_vec);
-    }
-    else
-    {
-#endif
-        float diff;
-        for (int i = 0; i < patch_sz; i++)
-            for (int j = 0; j < patch_sz; j++)
-            {
-                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
-                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
-                       I0_ptr[i * I0_stride + j];
-                SSD += diff * diff;
-            }
-#ifdef CV_SIMD128
-    }
-#endif
-    return SSD;
-}
-/* Same as computeSSD, but with patch mean normalization */
-inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01,
-                                float w10, float w11, int patch_sz)
-{
-    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
-    float n = (float)patch_sz * patch_sz;
-#ifdef CV_SIMD128
-    if (patch_sz == 8)
-    {
-        v_float32x4 sum_diff_vec = v_setall_f32(0);
-        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
-        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
-        for (int row = 0; row < 8; row++)
-        {
-            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
-            sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
-            sum_diff_vec += I_diff_left + I_diff_right;
-            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
-        }
-        sum_diff = v_reduce_sum(sum_diff_vec);
-        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
-    }
-    else
-    {
-#endif
-        float diff;
-        for (int i = 0; i < patch_sz; i++)
-            for (int j = 0; j < patch_sz; j++)
-            {
-                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
-                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
-                       I0_ptr[i * I0_stride + j];
-                sum_diff += diff;
-                sum_diff_sq += diff * diff;
-            }
-#ifdef CV_SIMD128
-    }
-#endif
-    return sum_diff_sq - sum_diff * sum_diff / n;
-}
-#undef HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION
-#undef HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION
-#undef HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
-{
-    // force separate processing of stripes if we are using spatial propagation:
-    if (dis->use_spatial_propagation && range.end > range.start + 1)
-    {
-        for (int n = range.start; n < range.end; n++)
-            (*this)(Range(n, n + 1));
-        return;
-    }
-    int psz = dis->patch_size;
-    int psz2 = psz / 2;
-    int w_ext = dis->w + 2 * dis->border_size; //!< width of I1_ext
-    int bsz = dis->border_size;
-    /* Input dense flow */
-    float *Ux_ptr = Ux->ptr<float>();
-    float *Uy_ptr = Uy->ptr<float>();
-    /* Output sparse flow */
-    float *Sx_ptr = Sx->ptr<float>();
-    float *Sy_ptr = Sy->ptr<float>();
-    uchar *I0_ptr = I0->ptr<uchar>();
-    uchar *I1_ptr = I1->ptr<uchar>();
-    short *I0x_ptr = I0x->ptr<short>();
-    short *I0y_ptr = I0y->ptr<short>();
-    /* Precomputed structure tensor */
-    float *xx_ptr = dis->I0xx_buf.ptr<float>();
-    float *yy_ptr = dis->I0yy_buf.ptr<float>();
-    float *xy_ptr = dis->I0xy_buf.ptr<float>();
-    /* And extra buffers for mean-normalization: */
-    float *x_ptr = dis->I0x_buf.ptr<float>();
-    float *y_ptr = dis->I0y_buf.ptr<float>();
-    bool use_temporal_candidates = false;
-    float *initial_Ux_ptr = NULL, *initial_Uy_ptr = NULL;
-    if (!dis->initial_Ux.empty())
-    {
-        initial_Ux_ptr = dis->initial_Ux[pyr_level].ptr<float>();
-        initial_Uy_ptr = dis->initial_Uy[pyr_level].ptr<float>();
-        use_temporal_candidates = true;
-    }
-    int i, j, dir;
-    int start_is, end_is, start_js, end_js;
-    int start_i, start_j;
-    float i_lower_limit = bsz - psz + 1.0f;
-    float i_upper_limit = bsz + dis->h - 1.0f;
-    float j_lower_limit = bsz - psz + 1.0f;
-    float j_upper_limit = bsz + dis->w - 1.0f;
-    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
-#define INIT_BILINEAR_WEIGHTS(Ux, Uy)                                                                                  \
-    i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit);                                                       \
-    j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit);                                                       \
-                                                                                                                       \
-    w11 = (i_I1 - floor(i_I1)) * (j_I1 - floor(j_I1));                                                                 \
-    w10 = (i_I1 - floor(i_I1)) * (floor(j_I1) + 1 - j_I1);                                                             \
-    w01 = (floor(i_I1) + 1 - i_I1) * (j_I1 - floor(j_I1));                                                             \
-    w00 = (floor(i_I1) + 1 - i_I1) * (floor(j_I1) + 1 - j_I1);
-#define COMPUTE_SSD(dst, Ux, Uy)                                                                                       \
-    INIT_BILINEAR_WEIGHTS(Ux, Uy);                                                                                     \
-    if (dis->use_mean_normalization)                                                                                   \
-        dst = computeSSDMeanNorm(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00,  \
-                                 w01, w10, w11, psz);                                                                  \
-    else                                                                                                               \
-        dst = computeSSD(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, w01,     \
-                         w10, w11, psz);
-    int num_inner_iter = (int)floor(dis->grad_descent_iter / (float)num_iter);
-    for (int iter = 0; iter < num_iter; iter++)
-    {
-        if (iter % 2 == 0)
-        {
-            dir = 1;
-            start_is = min(range.start * stripe_sz, hs);
-            end_is = min(range.end * stripe_sz, hs);
-            start_js = 0;
-            end_js = dis->ws;
-            start_i = start_is * dis->patch_stride;
-            start_j = 0;
-        }
-        else
-        {
-            dir = -1;
-            start_is = min(range.end * stripe_sz, hs) - 1;
-            end_is = min(range.start * stripe_sz, hs) - 1;
-            start_js = dis->ws - 1;
-            end_js = -1;
-            start_i = start_is * dis->patch_stride;
-            start_j = (dis->ws - 1) * dis->patch_stride;
-        }
-        i = start_i;
-        for (int is = start_is; dir * is < dir * end_is; is += dir)
-        {
-            j = start_j;
-            for (int js = start_js; dir * js < dir * end_js; js += dir)
-            {
-                if (iter == 0)
-                {
-                    /* Using result form the previous pyramid level as the very first approximation: */
-                    Sx_ptr[is * dis->ws + js] = Ux_ptr[(i + psz2) * dis->w + j + psz2];
-                    Sy_ptr[is * dis->ws + js] = Uy_ptr[(i + psz2) * dis->w + j + psz2];
-                }
-                float min_SSD = INF, cur_SSD;
-                if (use_temporal_candidates || dis->use_spatial_propagation)
-                {
-                    COMPUTE_SSD(min_SSD, Sx_ptr[is * dis->ws + js], Sy_ptr[is * dis->ws + js]);
-                }
-                if (use_temporal_candidates)
-                {
-                    /* Try temporal candidates (vectors from the initial flow field that was passed to the function) */
-                    COMPUTE_SSD(cur_SSD, initial_Ux_ptr[(i + psz2) * dis->w + j + psz2],
-                                initial_Uy_ptr[(i + psz2) * dis->w + j + psz2]);
-                    if (cur_SSD < min_SSD)
-                    {
-                        min_SSD = cur_SSD;
-                        Sx_ptr[is * dis->ws + js] = initial_Ux_ptr[(i + psz2) * dis->w + j + psz2];
-                        Sy_ptr[is * dis->ws + js] = initial_Uy_ptr[(i + psz2) * dis->w + j + psz2];
-                    }
-                }
-                if (dis->use_spatial_propagation)
-                {
-                    /* Try spatial candidates: */
-                    if (dir * js > dir * start_js)
-                    {
-                        COMPUTE_SSD(cur_SSD, Sx_ptr[is * dis->ws + js - dir], Sy_ptr[is * dis->ws + js - dir]);
-                        if (cur_SSD < min_SSD)
-                        {
-                            min_SSD = cur_SSD;
-                            Sx_ptr[is * dis->ws + js] = Sx_ptr[is * dis->ws + js - dir];
-                            Sy_ptr[is * dis->ws + js] = Sy_ptr[is * dis->ws + js - dir];
-                        }
-                    }
-                    /* Flow vectors won't actually propagate across different stripes, which is the reason for keeping
-                     * the number of stripes constant. It works well enough in practice and doesn't introduce any
-                     * visible seams.
-                     */
-                    if (dir * is > dir * start_is)
-                    {
-                        COMPUTE_SSD(cur_SSD, Sx_ptr[(is - dir) * dis->ws + js], Sy_ptr[(is - dir) * dis->ws + js]);
-                        if (cur_SSD < min_SSD)
-                        {
-                            min_SSD = cur_SSD;
-                            Sx_ptr[is * dis->ws + js] = Sx_ptr[(is - dir) * dis->ws + js];
-                            Sy_ptr[is * dis->ws + js] = Sy_ptr[(is - dir) * dis->ws + js];
-                        }
-                    }
-                }
-                /* Use the best candidate as a starting point for the gradient descent: */
-                float cur_Ux = Sx_ptr[is * dis->ws + js];
-                float cur_Uy = Sy_ptr[is * dis->ws + js];
-                /* Computing the inverse of the structure tensor: */
-                float detH = xx_ptr[is * dis->ws + js] * yy_ptr[is * dis->ws + js] -
-                             xy_ptr[is * dis->ws + js] * xy_ptr[is * dis->ws + js];
-                if (abs(detH) < EPS)
-                    detH = EPS;
-                float invH11 = yy_ptr[is * dis->ws + js] / detH;
-                float invH12 = -xy_ptr[is * dis->ws + js] / detH;
-                float invH22 = xx_ptr[is * dis->ws + js] / detH;
-                float prev_SSD = INF, SSD;
-                float x_grad_sum = x_ptr[is * dis->ws + js];
-                float y_grad_sum = y_ptr[is * dis->ws + js];
-                for (int t = 0; t < num_inner_iter; t++)
-                {
-                    INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
-                    if (dis->use_mean_normalization)
-                        SSD = processPatchMeanNorm(dUx, dUy, I0_ptr + i * dis->w + j,
-                                                   I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * dis->w + j,
-                                                   I0y_ptr + i * dis->w + j, dis->w, w_ext, w00, w01, w10, w11, psz,
-                                                   x_grad_sum, y_grad_sum);
-                    else
-                        SSD = processPatch(dUx, dUy, I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
-                                           I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j, dis->w, w_ext, w00, w01,
-                                           w10, w11, psz);
-                    dx = invH11 * dUx + invH12 * dUy;
-                    dy = invH12 * dUx + invH22 * dUy;
-                    cur_Ux -= dx;
-                    cur_Uy -= dy;
-                    /* Break when patch distance stops decreasing */
-                    if (SSD >= prev_SSD)
-                        break;
-                    prev_SSD = SSD;
-                }
-                /* If gradient descent converged to a flow vector that is very far from the initial approximation
-                 * (more than patch size) then we don't use it. Noticeably improves the robustness.
-                 */
-                if (norm(Vec2f(cur_Ux - Sx_ptr[is * dis->ws + js], cur_Uy - Sy_ptr[is * dis->ws + js])) <= psz)
-                {
-                    Sx_ptr[is * dis->ws + js] = cur_Ux;
-                    Sy_ptr[is * dis->ws + js] = cur_Uy;
-                }
-                j += dir * dis->patch_stride;
-            }
-            i += dir * dis->patch_stride;
-        }
-    }
-#undef INIT_BILINEAR_WEIGHTS
-#undef COMPUTE_SSD
-}
-DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h,
-                                                                 Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy,
-                                                                 Mat &_I0, Mat &_I1)
-    : dis(&_dis), nstripes(_nstripes), h(_h), Ux(&dst_Ux), Uy(&dst_Uy), Sx(&src_Sx), Sy(&src_Sy), I0(&_I0), I1(&_I1)
-{
-    stripe_sz = (int)ceil(h / (double)nstripes);
-}
-/* This function transforms a sparse optical flow field obtained by PatchInverseSearch (which computes flow values
- * on a sparse grid defined by patch_stride) into a dense optical flow field by weighted averaging of values from the
- * overlapping patches.
- */
-void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
-{
-    int start_i = min(range.start * stripe_sz, h);
-    int end_i = min(range.end * stripe_sz, h);
-    /* Input sparse flow */
-    float *Sx_ptr = Sx->ptr<float>();
-    float *Sy_ptr = Sy->ptr<float>();
-    /* Output dense flow */
-    float *Ux_ptr = Ux->ptr<float>();
-    float *Uy_ptr = Uy->ptr<float>();
-    uchar *I0_ptr = I0->ptr<uchar>();
-    uchar *I1_ptr = I1->ptr<uchar>();
-    int psz = dis->patch_size;
-    int pstr = dis->patch_stride;
-    int i_l, i_u;
-    int j_l, j_u;
-    float i_m, j_m, diff;
-    /* These values define the set of sparse grid locations that contain patches overlapping with the current dense flow
-     * location */
-    int start_is, end_is;
-    int start_js, end_js;
-/* Some helper macros for updating this set of sparse grid locations */
-#define UPDATE_SPARSE_I_COORDINATES                                                                                    \
-    if (i % pstr == 0 && i + psz <= h)                                                                                 \
-        end_is++;                                                                                                      \
-    if (i - psz >= 0 && (i - psz) % pstr == 0 && start_is < end_is)                                                    \
-        start_is++;
-#define UPDATE_SPARSE_J_COORDINATES                                                                                    \
-    if (j % pstr == 0 && j + psz <= dis->w)                                                                            \
-        end_js++;                                                                                                      \
-    if (j - psz >= 0 && (j - psz) % pstr == 0 && start_js < end_js)                                                    \
-        start_js++;
-    start_is = 0;
-    end_is = -1;
-    for (int i = 0; i < start_i; i++)
-    {
-        UPDATE_SPARSE_I_COORDINATES;
-    }
-    for (int i = start_i; i < end_i; i++)
-    {
-        UPDATE_SPARSE_I_COORDINATES;
-        start_js = 0;
-        end_js = -1;
-        for (int j = 0; j < dis->w; j++)
-        {
-            UPDATE_SPARSE_J_COORDINATES;
-            float coef, sum_coef = 0.0f;
-            float sum_Ux = 0.0f;
-            float sum_Uy = 0.0f;
-            /* Iterate through all the patches that overlap the current location (i,j) */
-            for (int is = start_is; is <= end_is; is++)
-                for (int js = start_js; js <= end_js; js++)
-                {
-                    j_m = min(max(j + Sx_ptr[is * dis->ws + js], 0.0f), dis->w - 1.0f - EPS);
-                    i_m = min(max(i + Sy_ptr[is * dis->ws + js], 0.0f), dis->h - 1.0f - EPS);
-                    j_l = (int)j_m;
-                    j_u = j_l + 1;
-                    i_l = (int)i_m;
-                    i_u = i_l + 1;
-                    diff = (j_m - j_l) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_u] +
-                           (j_u - j_m) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_l] +
-                           (j_m - j_l) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_u] +
-                           (j_u - j_m) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_l] - I0_ptr[i * dis->w + j];
-                    coef = 1 / max(1.0f, abs(diff));
-                    sum_Ux += coef * Sx_ptr[is * dis->ws + js];
-                    sum_Uy += coef * Sy_ptr[is * dis->ws + js];
-                    sum_coef += coef;
-                }
-            Ux_ptr[i * dis->w + j] = sum_Ux / sum_coef;
-            Uy_ptr[i * dis->w + j] = sum_Uy / sum_coef;
-        }
-    }
-#undef UPDATE_SPARSE_I_COORDINATES
-#undef UPDATE_SPARSE_J_COORDINATES
-}
-#ifdef HAVE_OPENCL
-bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
-                                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level)
-{
-    size_t globalSize[] = {(size_t)ws, (size_t)hs};
-    size_t localSize[]  = {16, 16};
-    int idx;
-    int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);
-    for (int iter = 0; iter < num_iter; iter++)
-    {
-        if (iter == 0)
-        {
-            ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::optflow::dis_flow_oclsrc);
-            size_t global_sz[] = {(size_t)hs * 8};
-            size_t local_sz[]  = {8};
-            idx = 0;
-            idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(src_Ux));
-            idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(src_Uy));
-            idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(I0));
-            idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(I1));
-            idx = k1.set(idx, (int)border_size);
-            idx = k1.set(idx, (int)patch_size);
-            idx = k1.set(idx, (int)patch_stride);
-            idx = k1.set(idx, (int)w);
-            idx = k1.set(idx, (int)h);
-            idx = k1.set(idx, (int)ws);
-            idx = k1.set(idx, (int)hs);
-            idx = k1.set(idx, (int)pyr_level);
-            idx = k1.set(idx, ocl::KernelArg::PtrWriteOnly(u_Sx));
-            idx = k1.set(idx, ocl::KernelArg::PtrWriteOnly(u_Sy));
-            if (!k1.run(1, global_sz, local_sz, false))
-                return false;
-            ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::optflow::dis_flow_oclsrc);
-            idx = 0;
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(src_Ux));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(src_Uy));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I1));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0x));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0y));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xx_buf));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0yy_buf));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xy_buf));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0x_buf));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0y_buf));
-            idx = k2.set(idx, (int)border_size);
-            idx = k2.set(idx, (int)patch_size);
-            idx = k2.set(idx, (int)patch_stride);
-            idx = k2.set(idx, (int)w);
-            idx = k2.set(idx, (int)h);
-            idx = k2.set(idx, (int)ws);
-            idx = k2.set(idx, (int)hs);
-            idx = k2.set(idx, (int)num_inner_iter);
-            idx = k2.set(idx, (int)pyr_level);
-            idx = k2.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
-            idx = k2.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
-            if (!k2.run(2, globalSize, localSize, false))
-                return false;
-        }
-        else
-        {
-            ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::optflow::dis_flow_oclsrc);
-            size_t global_sz[] = {(size_t)hs * 8};
-            size_t local_sz[]  = {8};
-            idx = 0;
-            idx = k3.set(idx, ocl::KernelArg::PtrReadOnly(I0));
-            idx = k3.set(idx, ocl::KernelArg::PtrReadOnly(I1));
-            idx = k3.set(idx, (int)border_size);
-            idx = k3.set(idx, (int)patch_size);
-            idx = k3.set(idx, (int)patch_stride);
-            idx = k3.set(idx, (int)w);
-            idx = k3.set(idx, (int)h);
-            idx = k3.set(idx, (int)ws);
-            idx = k3.set(idx, (int)hs);
-            idx = k3.set(idx, (int)pyr_level);
-            idx = k3.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
-            idx = k3.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
-            if (!k3.run(1, global_sz, local_sz, false))
-                return false;
-            ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::optflow::dis_flow_oclsrc);
-            idx = 0;
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I1));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0x));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0y));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xx_buf));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0yy_buf));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xy_buf));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0x_buf));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0y_buf));
-            idx = k4.set(idx, (int)border_size);
-            idx = k4.set(idx, (int)patch_size);
-            idx = k4.set(idx, (int)patch_stride);
-            idx = k4.set(idx, (int)w);
-            idx = k4.set(idx, (int)h);
-            idx = k4.set(idx, (int)ws);
-            idx = k4.set(idx, (int)hs);
-            idx = k4.set(idx, (int)num_inner_iter);
-            idx = k4.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
-            idx = k4.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
-            if (!k4.run(2, globalSize, localSize, false))
-                return false;
-        }
-    }
-    return true;
-}
-bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_Ux, UMat &dst_Uy, UMat &src_Sx, UMat &src_Sy, UMat &_I0, UMat &_I1)
-{
-    size_t globalSize[] = {(size_t)w, (size_t)h};
-    size_t localSize[]  = {16, 16};
-    ocl::Kernel kernel("dis_densification", ocl::optflow::dis_flow_oclsrc);
-    kernel.args(ocl::KernelArg::PtrReadOnly(src_Sx),
-                ocl::KernelArg::PtrReadOnly(src_Sy),
-                ocl::KernelArg::PtrReadOnly(_I0),
-                ocl::KernelArg::PtrReadOnly(_I1),
-                (int)patch_size, (int)patch_stride,
-                (int)w, (int)h, (int)ws,
-                ocl::KernelArg::PtrWriteOnly(dst_Ux),
-                ocl::KernelArg::PtrWriteOnly(dst_Uy));
-    return kernel.run(2, globalSize, localSize, false);
-}
-void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool use_flow)
-{
-    u_I0s.resize(coarsest_scale + 1);
-    u_I1s.resize(coarsest_scale + 1);
-    u_I1s_ext.resize(coarsest_scale + 1);
-    u_I0xs.resize(coarsest_scale + 1);
-    u_I0ys.resize(coarsest_scale + 1);
-    u_Ux.resize(coarsest_scale + 1);
-    u_Uy.resize(coarsest_scale + 1);
-    vector<UMat> flow_uv(2);
-    if (use_flow)
-    {
-        split(flow, flow_uv);
-        u_initial_Ux.resize(coarsest_scale + 1);
-        u_initial_Uy.resize(coarsest_scale + 1);
-    }
-    int fraction = 1;
-    int cur_rows = 0, cur_cols = 0;
-    for (int i = 0; i <= coarsest_scale; i++)
-    {
-        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
-        if (i == finest_scale)
-        {
-            cur_rows = I0.rows / fraction;
-            cur_cols = I0.cols / fraction;
-            u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
-            resize(I0, u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
-            u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
-            resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
-            /* These buffers are reused in each scale so we initialize them once on the finest scale: */
-            u_Sx.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
-            u_Sy.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
-            u_I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
-            u_I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
-            u_I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
-            u_I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
-            u_I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
-            u_I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
-            u_I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
-            u_I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
-            u_I0x_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
-            u_I0y_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
-            u_U.create(cur_rows, cur_cols, CV_32FC2);
-        }
-        else if (i > finest_scale)
-        {
-            cur_rows = u_I0s[i - 1].rows / 2;
-            cur_cols = u_I0s[i - 1].cols / 2;
-            u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
-            resize(u_I0s[i - 1], u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
-            u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
-            resize(u_I1s[i - 1], u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
-        }
-        if (i >= finest_scale)
-        {
-            u_I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size, CV_8UC1);
-            copyMakeBorder(u_I1s[i], u_I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
-            u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1);
-            u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1);
-            spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]);
-            u_Ux[i].create(cur_rows, cur_cols, CV_32FC1);
-            u_Uy[i].create(cur_rows, cur_cols, CV_32FC1);
-            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
-            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
-            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
-            variational_refinement_processors[i]->setSorIterations(5);
-            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
-            if (use_flow)
-            {
-                resize(flow_uv[0], u_initial_Ux[i], Size(cur_cols, cur_rows));
-                divide(u_initial_Ux[i], static_cast<float>(fraction), u_initial_Ux[i]);
-                resize(flow_uv[1], u_initial_Uy[i], Size(cur_cols, cur_rows));
-                divide(u_initial_Uy[i], static_cast<float>(fraction), u_initial_Uy[i]);
-            }
-        }
-        fraction *= 2;
-    }
-}
-bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
-                                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
-{
-    size_t globalSizeX[] = {(size_t)h};
-    size_t localSizeX[]  = {16};
-    ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::optflow::dis_flow_oclsrc);
-    kernelX.args(ocl::KernelArg::PtrReadOnly(I0x),
-                 ocl::KernelArg::PtrReadOnly(I0y),
-                 (int)patch_size, (int)patch_stride,
-                 (int)w, (int)h, (int)ws,
-                 ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
-                 ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
-                 ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
-                 ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
-                 ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux));
-    if (!kernelX.run(1, globalSizeX, localSizeX, false))
-        return false;
-    size_t globalSizeY[] = {(size_t)ws};
-    size_t localSizeY[]  = {16};
-    ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::optflow::dis_flow_oclsrc);
-    kernelY.args(ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
-                 ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
-                 ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
-                 ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
-                 ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
-                 (int)patch_size, (int)patch_stride,
-                 (int)w, (int)h, (int)ws,
-                 ocl::KernelArg::PtrWriteOnly(dst_I0xx),
-                 ocl::KernelArg::PtrWriteOnly(dst_I0yy),
-                 ocl::KernelArg::PtrWriteOnly(dst_I0xy),
-                 ocl::KernelArg::PtrWriteOnly(dst_I0x),
-                 ocl::KernelArg::PtrWriteOnly(dst_I0y));
-    return kernelY.run(1, globalSizeY, localSizeY, false);
-}
-bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
-{
-    UMat I0Mat = I0.getUMat();
-    UMat I1Mat = I1.getUMat();
-    bool use_input_flow = false;
-    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
-        use_input_flow = true;
-    else
-        flow.create(I1Mat.size(), CV_32FC2);
-    UMat &u_flowMat = flow.getUMatRef();
-    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code serach for maximal movement of width/4 */
-                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));              /* Deepest pyramid level greater or equal than patch*/
-    ocl_prepareBuffers(I0Mat, I1Mat, u_flowMat, use_input_flow);
-    u_Ux[coarsest_scale].setTo(0.0f);
-    u_Uy[coarsest_scale].setTo(0.0f);
-    for (int i = coarsest_scale; i >= finest_scale; i--)
-    {
-        w = u_I0s[i].cols;
-        h = u_I0s[i].rows;
-        ws = 1 + (w - patch_size) / patch_stride;
-        hs = 1 + (h - patch_size) / patch_stride;
-        if (!ocl_precomputeStructureTensor(u_I0xx_buf, u_I0yy_buf, u_I0xy_buf,
-                                           u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
-            return false;
-        if (!ocl_PatchInverseSearch(u_Ux[i], u_Uy[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
-            return false;
-        if (!ocl_Densification(u_Ux[i], u_Uy[i], u_Sx, u_Sy, u_I0s[i], u_I1s[i]))
-            return false;
-        if (variational_refinement_iter > 0)
-            variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
-                                                         u_Ux[i].getMat(ACCESS_WRITE), u_Uy[i].getMat(ACCESS_WRITE));
-        if (i > finest_scale)
-        {
-            resize(u_Ux[i], u_Ux[i - 1], u_Ux[i - 1].size());
-            resize(u_Uy[i], u_Uy[i - 1], u_Uy[i - 1].size());
-            multiply(u_Ux[i - 1], 2, u_Ux[i - 1]);
-            multiply(u_Uy[i - 1], 2, u_Uy[i - 1]);
-        }
-    }
-    vector<UMat> uxy(2);
-    uxy[0] = u_Ux[finest_scale];
-    uxy[1] = u_Uy[finest_scale];
-    merge(uxy, u_U);
-    resize(u_U, u_flowMat, u_flowMat.size());
-    multiply(u_flowMat, 1 << finest_scale, u_flowMat);
-    return true;
-}
-#endif
-void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
-{
-    CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
-    CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
-    CV_Assert(I0.sameSize(I1));
-    CV_Assert(I0.isContinuous());
-    CV_Assert(I1.isContinuous());
-    CV_OCL_RUN(ocl::Device::getDefault().isIntel() && flow.isUMat() &&
-               (patch_size == 8) && (use_spatial_propagation == true),
-               ocl_calc(I0, I1, flow));
-    Mat I0Mat = I0.getMat();
-    Mat I1Mat = I1.getMat();
-    bool use_input_flow = false;
-    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
-        use_input_flow = true;
-    else
-        flow.create(I1Mat.size(), CV_32FC2);
-    Mat flowMat = flow.getMat();
-    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code serach for maximal movement of width/4 */
-                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));              /* Deepest pyramid level greater or equal than patch*/
-    int num_stripes = getNumThreads();
-    prepareBuffers(I0Mat, I1Mat, flowMat, use_input_flow);
-    Ux[coarsest_scale].setTo(0.0f);
-    Uy[coarsest_scale].setTo(0.0f);
-    for (int i = coarsest_scale; i >= finest_scale; i--)
-    {
-        w = I0s[i].cols;
-        h = I0s[i].rows;
-        ws = 1 + (w - patch_size) / patch_stride;
-        hs = 1 + (h - patch_size) / patch_stride;
-        precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x_buf, I0y_buf, I0xs[i], I0ys[i]);
-        if (use_spatial_propagation)
-        {
-            /* Use a fixed number of stripes regardless the number of threads to make inverse search
-             * with spatial propagation reproducible
-             */
-            parallel_for_(Range(0, 8), PatchInverseSearch_ParBody(*this, 8, hs, Sx, Sy, Ux[i], Uy[i], I0s[i],
-                                                                  I1s_ext[i], I0xs[i], I0ys[i], 2, i));
-        }
-        else
-        {
-            parallel_for_(Range(0, num_stripes),
-                          PatchInverseSearch_ParBody(*this, num_stripes, hs, Sx, Sy, Ux[i], Uy[i], I0s[i], I1s_ext[i],
-                                                     I0xs[i], I0ys[i], 1, i));
-        }
-        parallel_for_(Range(0, num_stripes),
-                      Densification_ParBody(*this, num_stripes, I0s[i].rows, Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]));
-        if (variational_refinement_iter > 0)
-            variational_refinement_processors[i]->calcUV(I0s[i], I1s[i], Ux[i], Uy[i]);
-        if (i > finest_scale)
-        {
-            resize(Ux[i], Ux[i - 1], Ux[i - 1].size());
-            resize(Uy[i], Uy[i - 1], Uy[i - 1].size());
-            Ux[i - 1] *= 2;
-            Uy[i - 1] *= 2;
-        }
-    }
-    Mat uxy[] = {Ux[finest_scale], Uy[finest_scale]};
-    merge(uxy, 2, U);
-    resize(U, flowMat, flowMat.size());
-    flowMat *= 1 << finest_scale;
-}
-void DISOpticalFlowImpl::collectGarbage()
-{
-    I0s.clear();
-    I1s.clear();
-    I1s_ext.clear();
-    I0xs.clear();
-    I0ys.clear();
-    Ux.clear();
-    Uy.clear();
-    U.release();
-    Sx.release();
-    Sy.release();
-    I0xx_buf.release();
-    I0yy_buf.release();
-    I0xy_buf.release();
-    I0xx_buf_aux.release();
-    I0yy_buf_aux.release();
-    I0xy_buf_aux.release();
-#ifdef HAVE_OPENCL
-    u_I0s.clear();
-    u_I1s.clear();
-    u_I1s_ext.clear();
-    u_I0xs.clear();
-    u_I0ys.clear();
-    u_Ux.clear();
-    u_Uy.clear();
-    u_U.release();
-    u_Sx.release();
-    u_Sy.release();
-    u_I0xx_buf.release();
-    u_I0yy_buf.release();
-    u_I0xy_buf.release();
-    u_I0xx_buf_aux.release();
-    u_I0yy_buf_aux.release();
-    u_I0xy_buf_aux.release();
-#endif
-    for (int i = finest_scale; i <= coarsest_scale; i++)
-        variational_refinement_processors[i]->collectGarbage();
-    variational_refinement_processors.clear();
-}
-Ptr<DISOpticalFlow> createOptFlow_DIS(int preset)
-{
-    Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
-    dis->setPatchSize(8);
-    if (preset == DISOpticalFlow::PRESET_ULTRAFAST)
-    {
-        dis->setFinestScale(2);
-        dis->setPatchStride(4);
-        dis->setGradientDescentIterations(12);
-        dis->setVariationalRefinementIterations(0);
-    }
-    else if (preset == DISOpticalFlow::PRESET_FAST)
-    {
-        dis->setFinestScale(2);
-        dis->setPatchStride(4);
-        dis->setGradientDescentIterations(16);
-        dis->setVariationalRefinementIterations(5);
-    }
-    else if (preset == DISOpticalFlow::PRESET_MEDIUM)
-    {
-        dis->setFinestScale(1);
-        dis->setPatchStride(3);
-        dis->setGradientDescentIterations(25);
-        dis->setVariationalRefinementIterations(5);
-    }
-    return dis;
-}
-}
-}
--- a/modules/optflow/src/opencl/dis_flow.cl
+++ b/modules/optflow/src/opencl/dis_flow.cl
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#define EPS 0.001f
-#define INF 1E+10F
-__kernel void dis_precomputeStructureTensor_hor(__global const short *I0x,
-                                                __global const short *I0y,
-                                                int patch_size, int patch_stride,
-                                                int w, int h, int ws,
-                                                __global float *I0xx_aux_ptr,
-                                                __global float *I0yy_aux_ptr,
-                                                __global float *I0xy_aux_ptr,
-                                                __global float *I0x_aux_ptr,
-                                                __global float *I0y_aux_ptr)
-{
-    int i = get_global_id(0);
-    if (i >= h) return;
-    const __global short *x_row = I0x + i * w;
-    const __global short *y_row = I0y + i * w;
-    float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
-    float8 x_vec = convert_float8(vload8(0, x_row));
-    float8 y_vec = convert_float8(vload8(0, y_row));
-    sum_xx = dot(x_vec.lo, x_vec.lo) + dot(x_vec.hi, x_vec.hi);
-    sum_yy = dot(y_vec.lo, y_vec.lo) + dot(y_vec.hi, y_vec.hi);
-    sum_xy = dot(x_vec.lo, y_vec.lo) + dot(x_vec.hi, y_vec.hi);
-    sum_x = dot(x_vec.lo, 1.0f) + dot(x_vec.hi, 1.0f);
-    sum_y = dot(y_vec.lo, 1.0f) + dot(y_vec.hi, 1.0f);
-    I0xx_aux_ptr[i * ws] = sum_xx;
-    I0yy_aux_ptr[i * ws] = sum_yy;
-    I0xy_aux_ptr[i * ws] = sum_xy;
-    I0x_aux_ptr[i * ws] = sum_x;
-    I0y_aux_ptr[i * ws] = sum_y;
-    int js = 1;
-    for (int j = patch_size; j < w; j++)
-    {
-        short x_val1 = x_row[j];
-        short x_val2 = x_row[j - patch_size];
-        short y_val1 = y_row[j];
-        short y_val2 = y_row[j - patch_size];
-        sum_xx += (x_val1 * x_val1 - x_val2 * x_val2);
-        sum_yy += (y_val1 * y_val1 - y_val2 * y_val2);
-        sum_xy += (x_val1 * y_val1 - x_val2 * y_val2);
-        sum_x += (x_val1 - x_val2);
-        sum_y += (y_val1 - y_val2);
-        if ((j - patch_size + 1) % patch_stride == 0)
-        {
-            int index = i * ws + js;
-            I0xx_aux_ptr[index] = sum_xx;
-            I0yy_aux_ptr[index] = sum_yy;
-            I0xy_aux_ptr[index] = sum_xy;
-            I0x_aux_ptr[index] = sum_x;
-            I0y_aux_ptr[index] = sum_y;
-            js++;
-        }
-    }
-}
-__kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_ptr,
-                                                __global const float *I0yy_aux_ptr,
-                                                __global const float *I0xy_aux_ptr,
-                                                __global const float *I0x_aux_ptr,
-                                                __global const float *I0y_aux_ptr,
-                                                int patch_size, int patch_stride,
-                                                int w, int h, int ws,
-                                                __global float *I0xx_ptr,
-                                                __global float *I0yy_ptr,
-                                                __global float *I0xy_ptr,
-                                                __global float *I0x_ptr,
-                                                __global float *I0y_ptr)
-{
-    int j = get_global_id(0);
-    if (j >= ws) return;
-    float sum_xx, sum_yy, sum_xy, sum_x, sum_y;
-    sum_xx = sum_yy = sum_xy = sum_x = sum_y = 0.0f;
-    for (int i = 0; i < patch_size; i++)
-    {
-        sum_xx += I0xx_aux_ptr[i * ws + j];
-        sum_yy += I0yy_aux_ptr[i * ws + j];
-        sum_xy += I0xy_aux_ptr[i * ws + j];
-        sum_x  += I0x_aux_ptr[i * ws + j];
-        sum_y  += I0y_aux_ptr[i * ws + j];
-    }
-    I0xx_ptr[j] = sum_xx;
-    I0yy_ptr[j] = sum_yy;
-    I0xy_ptr[j] = sum_xy;
-    I0x_ptr[j] = sum_x;
-    I0y_ptr[j] = sum_y;
-    int is = 1;
-    for (int i = patch_size; i < h; i++)
-    {
-        sum_xx += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
-        sum_yy += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
-        sum_xy += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
-        sum_x  += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
-        sum_y  += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
-        if ((i - patch_size + 1) % patch_stride == 0)
-        {
-            I0xx_ptr[is * ws + j] = sum_xx;
-            I0yy_ptr[is * ws + j] = sum_yy;
-            I0xy_ptr[is * ws + j] = sum_xy;
-            I0x_ptr[is * ws + j] = sum_x;
-            I0y_ptr[is * ws + j] = sum_y;
-            is++;
-        }
-    }
-}
-__kernel void dis_densification(__global const float *sx, __global const float *sy,
-                                __global const uchar *i0, __global const uchar *i1,
-                                int psz, int pstr,
-                                int w, int h, int ws,
-                                __global float *ux, __global float *uy)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int i, j;
-    if (x >= w || y >= h) return;
-    int start_is, end_is;
-    int start_js, end_js;
-    end_is = min(y / pstr, (h - psz) / pstr);
-    start_is = max(0, y - psz + pstr) / pstr;
-    start_is = min(start_is, end_is);
-    end_js = min(x / pstr, (w - psz) / pstr);
-    start_js = max(0, x - psz + pstr) / pstr;
-    start_js = min(start_js, end_js);
-    float coef, sum_coef = 0.0f;
-    float sum_Ux = 0.0f;
-    float sum_Uy = 0.0f;
-    int i_l, i_u;
-    int j_l, j_u;
-    float i_m, j_m, diff;
-    i = y;
-    j = x;
-    /* Iterate through all the patches that overlap the current location (i,j) */
-    for (int is = start_is; is <= end_is; is++)
-        for (int js = start_js; js <= end_js; js++)
-        {
-            float sx_val = sx[is * ws + js];
-            float sy_val = sy[is * ws + js];
-            uchar2 i1_vec1, i1_vec2;
-            j_m = min(max(j + sx_val, 0.0f), w - 1.0f - EPS);
-            i_m = min(max(i + sy_val, 0.0f), h - 1.0f - EPS);
-            j_l = (int)j_m;
-            j_u = j_l + 1;
-            i_l = (int)i_m;
-            i_u = i_l + 1;
-            i1_vec1 = vload2(0, i1 + i_u * w + j_l);
-            i1_vec2 = vload2(0, i1 + i_l * w + j_l);
-            diff = (j_m - j_l) * (i_m - i_l) * i1_vec1.y +
-                   (j_u - j_m) * (i_m - i_l) * i1_vec1.x +
-                   (j_m - j_l) * (i_u - i_m) * i1_vec2.y +
-                   (j_u - j_m) * (i_u - i_m) * i1_vec2.x - i0[i * w + j];
-            coef = 1 / max(1.0f, fabs(diff));
-            sum_Ux += coef * sx_val;
-            sum_Uy += coef * sy_val;
-            sum_coef += coef;
-        }
-    ux[i * w + j] = sum_Ux / sum_coef;
-    uy[i * w + j] = sum_Uy / sum_coef;
-}
-#define INIT_BILINEAR_WEIGHTS(Ux, Uy)                                                                                  \
-    i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit);                                                       \
-    j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit);                                                       \
-                                                                                                                       \
-    w11 = (i_I1 - floor(i_I1)) * (j_I1 - floor(j_I1));                                                                 \
-    w10 = (i_I1 - floor(i_I1)) * (floor(j_I1) + 1 - j_I1);                                                             \
-    w01 = (floor(i_I1) + 1 - i_I1) * (j_I1 - floor(j_I1));                                                             \
-    w00 = (floor(i_I1) + 1 - i_I1) * (floor(j_I1) + 1 - j_I1);
-float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
-                         int I0_stride, int I1_stride,
-                         float w00, float w01, float w10, float w11, int patch_sz, int i)
-{
-    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
-    int n = patch_sz * patch_sz;
-    uchar8 I1_vec1, I1_vec2, I0_vec;
-    uchar I1_val1, I1_val2;
-    I0_vec  = vload8(0, I0_ptr + i * I0_stride);
-    I1_vec1 = vload8(0, I1_ptr + i * I1_stride);
-    I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
-    I1_val1 = I1_ptr[i * I1_stride + 8];
-    I1_val2 = I1_ptr[(i + 1) * I1_stride + 8];
-    float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
-                 w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
-                 convert_float8(I0_vec);
-    sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
-    sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
-    sum_diff = sub_group_reduce_add(sum_diff);
-    sum_diff_sq = sub_group_reduce_add(sum_diff_sq);
-    return sum_diff_sq - sum_diff * sum_diff / n;
-}
-__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr,
-                                             __global const uchar *I0_ptr, __global const uchar *I1_ptr,
-                                             int border_size, int patch_size, int patch_stride,
-                                             int w, int h, int ws, int hs, int pyr_level,
-                                             __global float *Sx_ptr, __global float *Sy_ptr)
-{
-    int id = get_global_id(0);
-    int is = id / 8;
-    if (id >= (hs * 8)) return;
-    int i = is * patch_stride;
-    int j = 0;
-    int psz = patch_size;
-    int psz2 = psz / 2;
-    int w_ext = w + 2 * border_size;
-    int bsz = border_size;
-    float i_lower_limit = bsz - psz + 1.0f;
-    float i_upper_limit = bsz + h - 1.0f;
-    float j_lower_limit = bsz - psz + 1.0f;
-    float j_upper_limit = bsz + w - 1.0f;
-    float i_I1, j_I1, w00, w01, w10, w11;
-    float prev_Ux = Ux_ptr[(i + psz2) * w + j + psz2];
-    float prev_Uy = Uy_ptr[(i + psz2) * w + j + psz2];
-    Sx_ptr[is * ws] = prev_Ux;
-    Sy_ptr[is * ws] = prev_Uy;
-    j += patch_stride;
-    int sid = get_sub_group_local_id();
-    for (int js = 1; js < ws; js++, j += patch_stride)
-    {
-        float min_SSD, cur_SSD;
-        float Ux = Ux_ptr[(i + psz2) * w + j + psz2];
-        float Uy = Uy_ptr[(i + psz2) * w + j + psz2];
-        INIT_BILINEAR_WEIGHTS(Ux, Uy);
-        min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
-                                     w, w_ext, w00, w01, w10, w11, psz, sid);
-        INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
-        cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
-                                     w, w_ext, w00, w01, w10, w11, psz, sid);
-        if (cur_SSD < min_SSD)
-        {
-            Ux = prev_Ux;
-            Uy = prev_Uy;
-        }
-        prev_Ux = Ux;
-        prev_Uy = Uy;
-        Sx_ptr[is * ws + js] = Ux;
-        Sy_ptr[is * ws + js] = Uy;
-    }
-}
-float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
-                            const __global short *I0x_ptr, const __global short *I0y_ptr,
-                            int I0_stride, int I1_stride, float w00, float w01, float w10,
-                            float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
-{
-    float sum_diff = 0.0, sum_diff_sq = 0.0;
-    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
-    int n = patch_sz * patch_sz;
-    uchar8 I1_vec1, I1_vec2;
-    uchar I1_val1, I1_val2;
-    for (int i = 0; i < 8; i++)
-    {
-        uchar8 I0_vec = vload8(0, I0_ptr + i * I0_stride);
-        I1_vec1 = (i == 0) ? vload8(0, I1_ptr + i * I1_stride) : I1_vec2;
-        I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
-        I1_val1 = (i == 0) ? I1_ptr[i * I1_stride + patch_sz] : I1_val2;
-        I1_val2 = I1_ptr[(i + 1) * I1_stride + patch_sz];
-        float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
-                     w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
-                     convert_float8(I0_vec);
-        sum_diff += (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
-        sum_diff_sq += (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
-        short8 I0x_vec = vload8(0, I0x_ptr + i * I0_stride);
-        short8 I0y_vec = vload8(0, I0y_ptr + i * I0_stride);
-        sum_I0x_mul += dot(vec.lo, convert_float4(I0x_vec.lo));
-        sum_I0x_mul += dot(vec.hi, convert_float4(I0x_vec.hi));
-        sum_I0y_mul += dot(vec.lo, convert_float4(I0y_vec.lo));
-        sum_I0y_mul += dot(vec.hi, convert_float4(I0y_vec.hi));
-    }
-    float dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
-    float dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
-    float SSD = sum_diff_sq - sum_diff * sum_diff / n;
-    return (float3)(SSD, dst_dUx, dst_dUy);
-}
-__kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __global const float *Uy_ptr,
-                                             __global const uchar *I0_ptr, __global const uchar *I1_ptr,
-                                             __global const short *I0x_ptr, __global const short *I0y_ptr,
-                                             __global const float *xx_ptr, __global const float *yy_ptr,
-                                             __global const float *xy_ptr,
-                                             __global const float *x_ptr, __global const float *y_ptr,
-                                             int border_size, int patch_size, int patch_stride,
-                                             int w, int h, int ws, int hs, int num_inner_iter, int pyr_level,
-                                             __global float *Sx_ptr, __global float *Sy_ptr)
-{
-    int js = get_global_id(0);
-    int is = get_global_id(1);
-    int i = is * patch_stride;
-    int j = js * patch_stride;
-    int psz = patch_size;
-    int psz2 = psz / 2;
-    int w_ext = w + 2 * border_size;
-    int bsz = border_size;
-    int index = is * ws + js;
-    if (js >= ws || is >= hs) return;
-    float Ux = Sx_ptr[index];
-    float Uy = Sy_ptr[index];
-    float cur_Ux = Ux;
-    float cur_Uy = Uy;
-    float cur_xx = xx_ptr[index];
-    float cur_yy = yy_ptr[index];
-    float cur_xy = xy_ptr[index];
-    float detH = cur_xx * cur_yy - cur_xy * cur_xy;
-    if (fabs(detH) < EPS) detH = EPS;
-    float invH11 = cur_yy / detH;
-    float invH12 = -cur_xy / detH;
-    float invH22 = cur_xx / detH;
-    float prev_SSD = INF, SSD;
-    float x_grad_sum = x_ptr[index];
-    float y_grad_sum = y_ptr[index];
-    float i_lower_limit = bsz - psz + 1.0f;
-    float i_upper_limit = bsz + h - 1.0f;
-    float j_lower_limit = bsz - psz + 1.0f;
-    float j_upper_limit = bsz + w - 1.0f;
-    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
-    float3 res;
-    for (int t = 0; t < num_inner_iter; t++)
-    {
-        INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
-        res = processPatchMeanNorm(I0_ptr + i * w + j,
-                                   I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * w + j,
-                                   I0y_ptr + i * w + j, w, w_ext, w00, w01, w10, w11, psz,
-                                   x_grad_sum, y_grad_sum);
-        SSD = res.x;
-        dUx = res.y;
-        dUy = res.z;
-        dx = invH11 * dUx + invH12 * dUy;
-        dy = invH12 * dUx + invH22 * dUy;
-        cur_Ux -= dx;
-        cur_Uy -= dy;
-        if (SSD >= prev_SSD)
-            break;
-        prev_SSD = SSD;
-    }
-    float2 vec = (float2)(cur_Ux - Ux, cur_Uy - Uy);
-    if (dot(vec, vec) <= (float)(psz * psz))
-    {
-        Sx_ptr[index] = cur_Ux;
-        Sy_ptr[index] = cur_Uy;
-    }
-}
-__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
-                                             int border_size, int patch_size, int patch_stride,
-                                             int w, int h, int ws, int hs, int pyr_level,
-                                             __global float *Sx_ptr, __global float *Sy_ptr)
-{
-    int id = get_global_id(0);
-    int is = id / 8;
-    if (id >= (hs * 8)) return;
-    is = (hs - 1 - is);
-    int i = is * patch_stride;
-    int j = (ws - 2) * patch_stride;
-    int psz = patch_size;
-    int psz2 = psz / 2;
-    int w_ext = w + 2 * border_size;
-    int bsz = border_size;
-    float i_lower_limit = bsz - psz + 1.0f;
-    float i_upper_limit = bsz + h - 1.0f;
-    float j_lower_limit = bsz - psz + 1.0f;
-    float j_upper_limit = bsz + w - 1.0f;
-    float i_I1, j_I1, w00, w01, w10, w11;
-    int sid = get_sub_group_local_id();
-    for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
-    {
-        float min_SSD, cur_SSD;
-        float2 Ux = vload2(0, Sx_ptr + is * ws + js);
-        float2 Uy = vload2(0, Sy_ptr + is * ws + js);
-        INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
-        min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
-                                     w, w_ext, w00, w01, w10, w11, psz, sid);
-        INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
-        cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
-                                     w, w_ext, w00, w01, w10, w11, psz, sid);
-        if (cur_SSD < min_SSD)
-        {
-            Sx_ptr[is * ws + js] = Ux.y;
-            Sy_ptr[is * ws + js] = Uy.y;
-        }
-    }
-}
-__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
-                                             __global const short *I0x_ptr, __global const short *I0y_ptr,
-                                             __global const float *xx_ptr, __global const float *yy_ptr,
-                                             __global const float *xy_ptr,
-                                             __global const float *x_ptr, __global const float *y_ptr,
-                                             int border_size, int patch_size, int patch_stride,
-                                             int w, int h, int ws, int hs, int num_inner_iter,
-                                             __global float *Sx_ptr, __global float *Sy_ptr)
-{
-    int js = get_global_id(0);
-    int is = get_global_id(1);
-    if (js >= ws || is >= hs) return;
-    js = (ws - 1 - js);
-    is = (hs - 1 - is);
-    int j = js * patch_stride;
-    int i = is * patch_stride;
-    int psz = patch_size;
-    int psz2 = psz / 2;
-    int w_ext = w + 2 * border_size;
-    int bsz = border_size;
-    int index = is * ws + js;
-    float Ux = Sx_ptr[index];
-    float Uy = Sy_ptr[index];
-    float cur_Ux = Ux;
-    float cur_Uy = Uy;
-    float cur_xx = xx_ptr[index];
-    float cur_yy = yy_ptr[index];
-    float cur_xy = xy_ptr[index];
-    float detH = cur_xx * cur_yy - cur_xy * cur_xy;
-    if (fabs(detH) < EPS) detH = EPS;
-    float invH11 = cur_yy / detH;
-    float invH12 = -cur_xy / detH;
-    float invH22 = cur_xx / detH;
-    float prev_SSD = INF, SSD;
-    float x_grad_sum = x_ptr[index];
-    float y_grad_sum = y_ptr[index];
-    float i_lower_limit = bsz - psz + 1.0f;
-    float i_upper_limit = bsz + h - 1.0f;
-    float j_lower_limit = bsz - psz + 1.0f;
-    float j_upper_limit = bsz + w - 1.0f;
-    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
-    float3 res;
-    for (int t = 0; t < num_inner_iter; t++)
-    {
-        INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
-        res = processPatchMeanNorm(I0_ptr + i * w + j,
-                                   I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * w + j,
-                                   I0y_ptr + i * w + j, w, w_ext, w00, w01, w10, w11, psz,
-                                   x_grad_sum, y_grad_sum);
-        SSD = res.x;
-        dUx = res.y;
-        dUy = res.z;
-        dx = invH11 * dUx + invH12 * dUy;
-        dy = invH12 * dUx + invH22 * dUy;
-        cur_Ux -= dx;
-        cur_Uy -= dy;
-        if (SSD >= prev_SSD)
-            break;
-        prev_SSD = SSD;
-    }
-    float2 vec = (float2)(cur_Ux - Ux, cur_Uy - Uy);
-    if ((dot(vec, vec)) <= (float)(psz * psz))
-    {
-        Sx_ptr[index] = cur_Ux;
-        Sy_ptr[index] = cur_Uy;
-    }
-}
--- a/modules/optflow/src/opencl/optical_flow_tvl1.cl
+++ b/modules/optflow/src/opencl/optical_flow_tvl1.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+__kernel void centeredGradientKernel(__global const float* src_ptr, int src_col, int src_row, int src_step,
+                                     __global float* dx, __global float* dy, int d_step)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if((x < src_col)&&(y < src_row))
+    {
+        int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);
+        int src_x2 = (x - 1) > 0 ? (x -1) : 0;
+        dx[y * d_step+ x] = 0.5f * (src_ptr[y * src_step + src_x1] - src_ptr[y * src_step+ src_x2]);
+        int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);
+        int src_y2 = (y - 1) > 0 ? (y - 1) : 0;
+        dy[y * d_step+ x] = 0.5f * (src_ptr[src_y1 * src_step + x] - src_ptr[src_y2 * src_step+ x]);
+    }
+}
+inline float bicubicCoeff(float x_)
+{
+    float x = fabs(x_);
+    if (x <= 1.0f)
+        return x * x * (1.5f * x - 2.5f) + 1.0f;
+    else if (x < 2.0f)
+        return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
+    else
+        return 0.0f;
+}
+__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,
+    image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,
+    __global const float* u1, int u1_step,
+    __global const float* u2,
+    __global float* I1w,
+    __global float* I1wx, /*int I1wx_step,*/
+    __global float* I1wy, /*int I1wy_step,*/
+    __global float* grad, /*int grad_step,*/
+    __global float* rho,
+    int I1w_step,
+    int u2_step,
+    int u1_offset_x,
+    int u1_offset_y,
+    int u2_offset_x,
+    int u2_offset_y)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if(x < I0_col&&y < I0_row)
+    {
+        //float u1Val = u1(y, x);
+        float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        //float u2Val = u2(y, x);
+        float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+        float wx = x + u1Val;
+        float wy = y + u2Val;
+        int xmin = ceil(wx - 2.0f);
+        int xmax = floor(wx + 2.0f);
+        int ymin = ceil(wy - 2.0f);
+        int ymax = floor(wy + 2.0f);
+        float sum  = 0.0f;
+        float sumx = 0.0f;
+        float sumy = 0.0f;
+        float wsum = 0.0f;
+        sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
+                //sum  += w * tex2D(tex_I1 , cx, cy);
+                int2 cood = (int2)(cx, cy);
+                sum += w * read_imagef(tex_I1, sampleri, cood).x;
+                //sumx += w * tex2D(tex_I1x, cx, cy);
+                sumx += w * read_imagef(tex_I1x, sampleri, cood).x;
+                //sumy += w * tex2D(tex_I1y, cx, cy);
+                sumy += w * read_imagef(tex_I1y, sampleri, cood).x;
+                wsum += w;
+            }
+        }
+        float coeff = 1.0f / wsum;
+        float I1wVal  = sum  * coeff;
+        float I1wxVal = sumx * coeff;
+        float I1wyVal = sumy * coeff;
+        I1w[y * I1w_step + x]  = I1wVal;
+        I1wx[y * I1w_step + x] = I1wxVal;
+        I1wy[y * I1w_step + x] = I1wyVal;
+        float Ix2 = I1wxVal * I1wxVal;
+        float Iy2 = I1wyVal * I1wyVal;
+        // store the |Grad(I1)|^2
+        grad[y * I1w_step + x] = Ix2 + Iy2;
+        // compute the constant part of the rho function
+        float I0Val = I0[y * I0_step + x];
+        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
+    }
+}
+inline float readImage(__global const float *image,  int x,  int y,  int rows,  int cols, int elemCntPerRow)
+{
+    int i0 = clamp(x, 0, cols - 1);
+    int j0 = clamp(y, 0, rows - 1);
+    return image[j0 * elemCntPerRow + i0];
+}
+__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,
+    __global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,
+    __global const float* u1, int u1_step,
+    __global const float* u2,
+    __global float* I1w,
+    __global float* I1wx, /*int I1wx_step,*/
+    __global float* I1wy, /*int I1wy_step,*/
+    __global float* grad, /*int grad_step,*/
+    __global float* rho,
+    int I1w_step,
+    int u2_step,
+    int I1_step,
+    int I1x_step)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if(x < I0_col&&y < I0_row)
+    {
+        //float u1Val = u1(y, x);
+        float u1Val = u1[y * u1_step + x];
+        //float u2Val = u2(y, x);
+        float u2Val = u2[y * u2_step + x];
+        float wx = x + u1Val;
+        float wy = y + u2Val;
+        int xmin = ceil(wx - 2.0f);
+        int xmax = floor(wx + 2.0f);
+        int ymin = ceil(wy - 2.0f);
+        int ymax = floor(wy + 2.0f);
+        float sum  = 0.0f;
+        float sumx = 0.0f;
+        float sumy = 0.0f;
+        float wsum = 0.0f;
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
+                int2 cood = (int2)(cx, cy);
+                sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);
+                sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);
+                sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);
+                wsum += w;
+            }
+        }
+        float coeff = 1.0f / wsum;
+        float I1wVal  = sum  * coeff;
+        float I1wxVal = sumx * coeff;
+        float I1wyVal = sumy * coeff;
+        I1w[y * I1w_step + x]  = I1wVal;
+        I1wx[y * I1w_step + x] = I1wxVal;
+        I1wy[y * I1w_step + x] = I1wyVal;
+        float Ix2 = I1wxVal * I1wxVal;
+        float Iy2 = I1wyVal * I1wyVal;
+        // store the |Grad(I1)|^2
+        grad[y * I1w_step + x] = Ix2 + Iy2;
+        // compute the constant part of the rho function
+        float I0Val = I0[y * I0_step + x];
+        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
+    }
+}
+__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,
+    __global const float* u2,
+    __global float* p11, int p11_step,
+    __global float* p12,
+    __global float* p21,
+    __global float* p22,
+    float taut,
+    int u2_step,
+    int u1_offset_x,
+    int u1_offset_y,
+    int u2_offset_x,
+    int u2_offset_y)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if(x < u1_col && y < u1_row)
+    {
+        int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
+        float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);
+        float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
+        float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+        int src_y2 = (y + 1) <  (u1_row - 1) ? (y + 1) : (u1_row - 1);
+        float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+        float g1 = hypot(u1x, u1y);
+        float g2 = hypot(u2x, u2y);
+        float ng1 = 1.0f + taut * g1;
+        float ng2 = 1.0f + taut * g2;
+        p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;
+        p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;
+        p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;
+        p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;
+    }
+}
+inline float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)
+{
+    if (x > 0 && y > 0)
+    {
+        float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];
+        float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];
+        return v1x + v2y;
+    }
+    else
+    {
+        if (y > 0)
+            return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];
+        else
+        {
+            if (x > 0)
+                return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];
+            else
+                return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];
+        }
+    }
+}
+__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,
+    __global const float* I1wy, /*int I1wy_step,*/
+    __global const float* grad, /*int grad_step,*/
+    __global const float* rho_c, /*int rho_c_step,*/
+    __global const float* p11, /*int p11_step,*/
+    __global const float* p12, /*int p12_step,*/
+    __global const float* p21, /*int p21_step,*/
+    __global const float* p22, /*int p22_step,*/
+    __global float* u1, int u1_step,
+    __global float* u2,
+    __global float* error, float l_t, float theta, int u2_step,
+    int u1_offset_x,
+    int u1_offset_y,
+    int u2_offset_x,
+    int u2_offset_y,
+    char calc_error)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if(x < I1wx_col && y < I1wx_row)
+    {
+        float I1wxVal = I1wx[y * I1wx_step + x];
+        float I1wyVal = I1wy[y * I1wx_step + x];
+        float gradVal = grad[y * I1wx_step + x];
+        float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+        float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
+        // estimate the values of the variable (v1, v2) (thresholding operator TH)
+        float d1 = 0.0f;
+        float d2 = 0.0f;
+        if (rho < -l_t * gradVal)
+        {
+            d1 = l_t * I1wxVal;
+            d2 = l_t * I1wyVal;
+        }
+        else if (rho > l_t * gradVal)
+        {
+            d1 = -l_t * I1wxVal;
+            d2 = -l_t * I1wyVal;
+        }
+        else if (gradVal > 1.192092896e-07f)
+        {
+            float fi = -rho / gradVal;
+            d1 = fi * I1wxVal;
+            d2 = fi * I1wyVal;
+        }
+        float v1 = u1OldVal + d1;
+        float v2 = u2OldVal + d2;
+        // compute the divergence of the dual variable (p1, p2)
+        float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);
+        float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);
+        // estimate the values of the optical flow (u1, u2)
+        float u1NewVal = v1 + theta * div_p1;
+        float u2NewVal = v2 + theta * div_p2;
+        u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;
+        u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;
+        if(calc_error)
+        {
+            float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
+            float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
+            error[y * I1wx_step + x] = n1 + n2;
+        }
+    }
+}
--- a/modules/optflow/src/optical_flow_io.cpp
+++ b/modules/optflow/src/optical_flow_io.cpp
-/*M///////////////////////////////////////////////////////////////////////////////////////
- //
- //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
- //
- //  By downloading, copying, installing or using the software you agree to this license.
- //  If you do not agree to this license, do not download, install,
- //  copy or use the software.
- //
- //
- //                           License Agreement
- //                For Open Source Computer Vision Library
- //
- // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
- // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
- // Third party copyrights are property of their respective owners.
- //
- // Redistribution and use in source and binary forms, with or without modification,
- // are permitted provided that the following conditions are met:
- //
- //   * Redistribution's of source code must retain the above copyright notice,
- //     this list of conditions and the following disclaimer.
- //
- //   * Redistribution's in binary form must reproduce the above copyright notice,
- //     this list of conditions and the following disclaimer in the documentation
- //     and/or other materials provided with the distribution.
- //
- //   * The name of the copyright holders may not be used to endorse or promote products
- //     derived from this software without specific prior written permission.
- //
- // This software is provided by the copyright holders and contributors "as is" and
- // any express or implied warranties, including, but not limited to, the implied
- // warranties of merchantability and fitness for a particular purpose are disclaimed.
- // In no event shall the Intel Corporation or contributors be liable for any direct,
- // indirect, incidental, special, exemplary, or consequential damages
- // (including, but not limited to, procurement of substitute goods or services;
- // loss of use, data, or profits; or business interruption) however caused
- // and on any theory of liability, whether in contract, strict liability,
- // or tort (including negligence or otherwise) arising in any way out of
- // the use of this software, even if advised of the possibility of such damage.
- //
- //M*/
-#include "precomp.hpp"
-#include<iostream>
-#include<fstream>
-namespace cv {
-namespace optflow {
-const float FLOW_TAG_FLOAT = 202021.25f;
-const char *FLOW_TAG_STRING = "PIEH";
-CV_EXPORTS_W Mat readOpticalFlow( const String& path )
-{
-//    CV_Assert(sizeof(float) == 4);
-    //FIXME: ensure right sizes of int and float - here and in writeOpticalFlow()
-    Mat_<Point2f> flow;
-    std::ifstream file(path.c_str(), std::ios_base::binary);
-    if ( !file.good() )
-        return flow; // no file - return empty matrix
-    float tag;
-    file.read((char*) &tag, sizeof(float));
-    if ( tag != FLOW_TAG_FLOAT )
-        return flow;
-    int width, height;
-    file.read((char*) &width, 4);
-    file.read((char*) &height, 4);
-    flow.create(height, width);
-    for ( int i = 0; i < flow.rows; ++i )
-    {
-        for ( int j = 0; j < flow.cols; ++j )
-        {
-            Point2f u;
-            file.read((char*) &u.x, sizeof(float));
-            file.read((char*) &u.y, sizeof(float));
-            if ( !file.good() )
-            {
-                flow.release();
-                return flow;
-            }
-            flow(i, j) = u;
-        }
-    }
-    file.close();
-    return flow;
-}
-CV_EXPORTS_W bool writeOpticalFlow( const String& path, InputArray flow )
-{
-//    CV_Assert(sizeof(float) == 4);
-    const int nChannels = 2;
-    Mat input = flow.getMat();
-    if ( input.channels() != nChannels || input.depth() != CV_32F || path.length() == 0 )
-        return false;
-    std::ofstream file(path.c_str(), std::ofstream::binary);
-    if ( !file.good() )
-        return false;
-    int nRows, nCols;
-    nRows = (int) input.size().height;
-    nCols = (int) input.size().width;
-    const int headerSize = 12;
-    char header[headerSize];
-    memcpy(header, FLOW_TAG_STRING, 4);
-    // size of ints is known - has been asserted in the current function
-    memcpy(header + 4, reinterpret_cast<const char*>(&nCols), sizeof(nCols));
-    memcpy(header + 8, reinterpret_cast<const char*>(&nRows), sizeof(nRows));
-    file.write(header, headerSize);
-    if ( !file.good() )
-        return false;
-//    if ( input.isContinuous() ) //matrix is continous - treat it as a single row
-//    {
-//        nCols *= nRows;
-//        nRows = 1;
-//    }
-    int row;
-    char* p;
-    for ( row = 0; row < nRows; row++ )
-    {
-        p = input.ptr<char>(row);
-        file.write(p, nCols * nChannels * sizeof(float));
-        if ( !file.good() )
-            return false;
-    }
-    file.close();
-    return true;
-}
-}
-}
--- a/modules/optflow/src/tvl1flow.cpp
+++ b/modules/optflow/src/tvl1flow.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+/*
+//
+// This implementation is based on Javier Sánchez Pérez <jsanchez@dis.ulpgc.es> implementation.
+// Original BSD license:
+//
+// Copyright (c) 2011, Javier Sánchez Pérez, Enric Meinhardt Llopis
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+//   list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+*/
+#include "precomp.hpp"
+#include "opencl_kernels_optflow.hpp"
+#include <limits>
+#include <iomanip>
+#include <iostream>
+#include "opencv2/core/opencl/ocl_defs.hpp"
+namespace cv {
+namespace optflow {
+class OpticalFlowDual_TVL1 : public DualTVL1OpticalFlow
+{
+public:
+    OpticalFlowDual_TVL1(double tau_, double lambda_, double theta_, int nscales_, int warps_,
+                         double epsilon_, int innerIterations_, int outerIterations_,
+                         double scaleStep_, double gamma_, int medianFiltering_,
+                         bool useInitialFlow_) :
+        tau(tau_), lambda(lambda_), theta(theta_), gamma(gamma_), nscales(nscales_),
+        warps(warps_), epsilon(epsilon_), innerIterations(innerIterations_),
+        outerIterations(outerIterations_), useInitialFlow(useInitialFlow_),
+        scaleStep(scaleStep_), medianFiltering(medianFiltering_)
+    {
+    }
+    OpticalFlowDual_TVL1();
+    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
+    void collectGarbage() CV_OVERRIDE;
+    inline double getTau() const CV_OVERRIDE { return tau; }
+    inline void setTau(double val) CV_OVERRIDE { tau = val; }
+    inline double getLambda() const CV_OVERRIDE { return lambda; }
+    inline void setLambda(double val) CV_OVERRIDE { lambda = val; }
+    inline double getTheta() const CV_OVERRIDE { return theta; }
+    inline void setTheta(double val) CV_OVERRIDE { theta = val; }
+    inline double getGamma() const CV_OVERRIDE { return gamma; }
+    inline void setGamma(double val) CV_OVERRIDE { gamma = val; }
+    inline int getScalesNumber() const CV_OVERRIDE { return nscales; }
+    inline void setScalesNumber(int val) CV_OVERRIDE { nscales = val; }
+    inline int getWarpingsNumber() const CV_OVERRIDE { return warps; }
+    inline void setWarpingsNumber(int val) CV_OVERRIDE { warps = val; }
+    inline double getEpsilon() const CV_OVERRIDE { return epsilon; }
+    inline void setEpsilon(double val) CV_OVERRIDE { epsilon = val; }
+    inline int getInnerIterations() const CV_OVERRIDE { return innerIterations; }
+    inline void setInnerIterations(int val) CV_OVERRIDE { innerIterations = val; }
+    inline int getOuterIterations() const CV_OVERRIDE { return outerIterations; }
+    inline void setOuterIterations(int val) CV_OVERRIDE { outerIterations = val; }
+    inline bool getUseInitialFlow() const CV_OVERRIDE { return useInitialFlow; }
+    inline void setUseInitialFlow(bool val) CV_OVERRIDE { useInitialFlow = val; }
+    inline double getScaleStep() const CV_OVERRIDE { return scaleStep; }
+    inline void setScaleStep(double val) CV_OVERRIDE { scaleStep = val; }
+    inline int getMedianFiltering() const CV_OVERRIDE { return medianFiltering; }
+    inline void setMedianFiltering(int val) CV_OVERRIDE { medianFiltering = val; }
+protected:
+    double tau;
+    double lambda;
+    double theta;
+    double gamma;
+    int nscales;
+    int warps;
+    double epsilon;
+    int innerIterations;
+    int outerIterations;
+    bool useInitialFlow;
+    double scaleStep;
+    int medianFiltering;
+private:
+    void procOneScale(const Mat_<float>& I0, const Mat_<float>& I1, Mat_<float>& u1, Mat_<float>& u2, Mat_<float>& u3);
+#ifdef HAVE_OPENCL
+    bool procOneScale_ocl(const UMat& I0, const UMat& I1, UMat& u1, UMat& u2);
+    bool calc_ocl(InputArray I0, InputArray I1, InputOutputArray flow);
+#endif
+    struct dataMat
+    {
+        std::vector<Mat_<float> > I0s;
+        std::vector<Mat_<float> > I1s;
+        std::vector<Mat_<float> > u1s;
+        std::vector<Mat_<float> > u2s;
+        std::vector<Mat_<float> > u3s;
+        Mat_<float> I1x_buf;
+        Mat_<float> I1y_buf;
+        Mat_<float> flowMap1_buf;
+        Mat_<float> flowMap2_buf;
+        Mat_<float> I1w_buf;
+        Mat_<float> I1wx_buf;
+        Mat_<float> I1wy_buf;
+        Mat_<float> grad_buf;
+        Mat_<float> rho_c_buf;
+        Mat_<float> v1_buf;
+        Mat_<float> v2_buf;
+        Mat_<float> v3_buf;
+        Mat_<float> p11_buf;
+        Mat_<float> p12_buf;
+        Mat_<float> p21_buf;
+        Mat_<float> p22_buf;
+        Mat_<float> p31_buf;
+        Mat_<float> p32_buf;
+        Mat_<float> div_p1_buf;
+        Mat_<float> div_p2_buf;
+        Mat_<float> div_p3_buf;
+        Mat_<float> u1x_buf;
+        Mat_<float> u1y_buf;
+        Mat_<float> u2x_buf;
+        Mat_<float> u2y_buf;
+        Mat_<float> u3x_buf;
+        Mat_<float> u3y_buf;
+    } dm;
+#ifdef HAVE_OPENCL
+    struct dataUMat
+    {
+        std::vector<UMat> I0s;
+        std::vector<UMat> I1s;
+        std::vector<UMat> u1s;
+        std::vector<UMat> u2s;
+        UMat I1x_buf;
+        UMat I1y_buf;
+        UMat I1w_buf;
+        UMat I1wx_buf;
+        UMat I1wy_buf;
+        UMat grad_buf;
+        UMat rho_c_buf;
+        UMat p11_buf;
+        UMat p12_buf;
+        UMat p21_buf;
+        UMat p22_buf;
+        UMat diff_buf;
+        UMat norm_buf;
+    } dum;
+#endif
+};
+#ifdef HAVE_OPENCL
+namespace cv_ocl_tvl1flow
+{
+    bool centeredGradient(const UMat &src, UMat &dx, UMat &dy);
+    bool warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UMat &I1y,
+        UMat &u1, UMat &u2, UMat &I1w, UMat &I1wx, UMat &I1wy,
+        UMat &grad, UMat &rho);
+    bool estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
+        UMat &rho_c, UMat &p11, UMat &p12,
+        UMat &p21, UMat &p22, UMat &u1,
+        UMat &u2, UMat &error, float l_t, float theta, char calc_error);
+    bool estimateDualVariables(UMat &u1, UMat &u2,
+        UMat &p11, UMat &p12, UMat &p21, UMat &p22, float taut);
+}
+bool cv_ocl_tvl1flow::centeredGradient(const UMat &src, UMat &dx, UMat &dy)
+{
+    size_t globalsize[2] = { (size_t)src.cols, (size_t)src.rows };
+    ocl::Kernel kernel;
+    if (!kernel.create("centeredGradientKernel", ocl::optflow::optical_flow_tvl1_oclsrc, ""))
+        return false;
+    int idxArg = 0;
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));//src mat
+    idxArg = kernel.set(idxArg, (int)(src.cols));//src mat col
+    idxArg = kernel.set(idxArg, (int)(src.rows));//src mat rows
+    idxArg = kernel.set(idxArg, (int)(src.step / src.elemSize()));//src mat step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dx));//res mat dx
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dy));//res mat dy
+    idxArg = kernel.set(idxArg, (int)(dx.step/dx.elemSize()));//res mat step
+    return kernel.run(2, globalsize, NULL, false);
+}
+bool cv_ocl_tvl1flow::warpBackward(const UMat &I0, const UMat &I1, UMat &I1x, UMat &I1y,
+    UMat &u1, UMat &u2, UMat &I1w, UMat &I1wx, UMat &I1wy,
+    UMat &grad, UMat &rho)
+{
+    size_t globalsize[2] = { (size_t)I0.cols, (size_t)I0.rows };
+    ocl::Kernel kernel;
+    if (!kernel.create("warpBackwardKernel", ocl::optflow::optical_flow_tvl1_oclsrc, ""))
+        return false;
+    int idxArg = 0;
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I0));//I0 mat
+    int I0_step = (int)(I0.step / I0.elemSize());
+    idxArg = kernel.set(idxArg, I0_step);//I0_step
+    idxArg = kernel.set(idxArg, (int)(I0.cols));//I0_col
+    idxArg = kernel.set(idxArg, (int)(I0.rows));//I0_row
+    ocl::Image2D imageI1(I1);
+    ocl::Image2D imageI1x(I1x);
+    ocl::Image2D imageI1y(I1y);
+    idxArg = kernel.set(idxArg, imageI1);//image2d_t tex_I1
+    idxArg = kernel.set(idxArg, imageI1x);//image2d_t tex_I1x
+    idxArg = kernel.set(idxArg, imageI1y);//image2d_t tex_I1y
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u1));//const float* u1
+    idxArg = kernel.set(idxArg, (int)(u1.step / u1.elemSize()));//int u1_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u2));//const float* u2
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1w));///float* I1w
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1wx));//float* I1wx
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(I1wy));//float* I1wy
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(grad));//float* grad
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(rho));//float* rho
+    idxArg = kernel.set(idxArg, (int)(I1w.step / I1w.elemSize()));//I1w_step
+    idxArg = kernel.set(idxArg, (int)(u2.step / u2.elemSize()));//u2_step
+    int u1_offset_x = (int)((u1.offset) % (u1.step));
+    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
+    idxArg = kernel.set(idxArg, (int)u1_offset_x );//u1_offset_x
+    idxArg = kernel.set(idxArg, (int)(u1.offset/u1.step));//u1_offset_y
+    int u2_offset_x = (int)((u2.offset) % (u2.step));
+    u2_offset_x = (int) (u2_offset_x / u2.elemSize());
+    idxArg = kernel.set(idxArg, (int)u2_offset_x);//u2_offset_x
+    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step));//u2_offset_y
+    return kernel.run(2, globalsize, NULL, false);
+}
+bool cv_ocl_tvl1flow::estimateU(UMat &I1wx, UMat &I1wy, UMat &grad,
+    UMat &rho_c, UMat &p11, UMat &p12,
+    UMat &p21, UMat &p22, UMat &u1,
+    UMat &u2, UMat &error, float l_t, float theta, char calc_error)
+{
+    size_t globalsize[2] = { (size_t)I1wx.cols, (size_t)I1wx.rows };
+    ocl::Kernel kernel;
+    if (!kernel.create("estimateUKernel", ocl::optflow::optical_flow_tvl1_oclsrc, ""))
+        return false;
+    int idxArg = 0;
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I1wx)); //const float* I1wx
+    idxArg = kernel.set(idxArg, (int)(I1wx.cols)); //int I1wx_col
+    idxArg = kernel.set(idxArg, (int)(I1wx.rows)); //int I1wx_row
+    idxArg = kernel.set(idxArg, (int)(I1wx.step/I1wx.elemSize())); //int I1wx_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(I1wy)); //const float* I1wy
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(grad)); //const float* grad
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(rho_c)); //const float* rho_c
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p11)); //const float* p11
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p12)); //const float* p12
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p21)); //const float* p21
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(p22)); //const float* p22
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(u1)); //float* u1
+    idxArg = kernel.set(idxArg, (int)(u1.step / u1.elemSize())); //int u1_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(u2)); //float* u2
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(error)); //float* error
+    idxArg = kernel.set(idxArg, (float)l_t); //float l_t
+    idxArg = kernel.set(idxArg, (float)theta); //float theta
+    idxArg = kernel.set(idxArg, (int)(u2.step / u2.elemSize()));//int u2_step
+    int u1_offset_x = (int)(u1.offset % u1.step);
+    u1_offset_x = (int) (u1_offset_x  / u1.elemSize());
+    idxArg = kernel.set(idxArg, (int)u1_offset_x); //int u1_offset_x
+    idxArg = kernel.set(idxArg, (int)(u1.offset/u1.step)); //int u1_offset_y
+    int u2_offset_x = (int)(u2.offset % u2.step);
+    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
+    idxArg = kernel.set(idxArg, (int)u2_offset_x ); //int u2_offset_x
+    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
+    idxArg = kernel.set(idxArg, (char)calc_error);    //char calc_error
+    return kernel.run(2, globalsize, NULL, false);
+}
+bool cv_ocl_tvl1flow::estimateDualVariables(UMat &u1, UMat &u2,
+    UMat &p11, UMat &p12, UMat &p21, UMat &p22, float taut)
+{
+    size_t globalsize[2] = { (size_t)u1.cols, (size_t)u1.rows };
+    ocl::Kernel kernel;
+    if (!kernel.create("estimateDualVariablesKernel", ocl::optflow::optical_flow_tvl1_oclsrc, ""))
+        return false;
+    int idxArg = 0;
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u1));// const float* u1
+    idxArg = kernel.set(idxArg, (int)(u1.cols)); //int u1_col
+    idxArg = kernel.set(idxArg, (int)(u1.rows)); //int u1_row
+    idxArg = kernel.set(idxArg, (int)(u1.step/u1.elemSize())); //int u1_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(u2)); // const float* u2
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(p11)); // float* p11
+    idxArg = kernel.set(idxArg, (int)(p11.step/p11.elemSize())); //int p11_step
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(p12)); // float* p12
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(p21)); // float* p21
+    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(p22)); // float* p22
+    idxArg = kernel.set(idxArg, (float)(taut));    //float taut
+    idxArg = kernel.set(idxArg, (int)(u2.step/u2.elemSize())); //int u2_step
+    int u1_offset_x = (int)(u1.offset % u1.step);
+    u1_offset_x = (int)(u1_offset_x / u1.elemSize());
+    idxArg = kernel.set(idxArg, u1_offset_x); //int u1_offset_x
+    idxArg = kernel.set(idxArg, (int)(u1.offset / u1.step)); //int u1_offset_y
+    int u2_offset_x = (int)(u2.offset % u2.step);
+    u2_offset_x = (int)(u2_offset_x / u2.elemSize());
+    idxArg = kernel.set(idxArg, u2_offset_x); //int u2_offset_x
+    idxArg = kernel.set(idxArg, (int)(u2.offset / u2.step)); //int u2_offset_y
+    return kernel.run(2, globalsize, NULL, false);
+}
+#endif
+OpticalFlowDual_TVL1::OpticalFlowDual_TVL1()
+{
+    tau            = 0.25;
+    lambda         = 0.15;
+    theta          = 0.3;
+    nscales        = 5;
+    warps          = 5;
+    epsilon        = 0.01;
+    gamma          = 0.;
+    innerIterations = 30;
+    outerIterations = 10;
+    useInitialFlow = false;
+    medianFiltering = 5;
+    scaleStep      = 0.8;
+}
+void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow)
+{
+    CV_INSTRUMENT_REGION();
+#ifndef __APPLE__
+    CV_OCL_RUN(_flow.isUMat() &&
+               ocl::Image2D::isFormatSupported(CV_32F, 1, false),
+               calc_ocl(_I0, _I1, _flow))
+#endif
+    Mat I0 = _I0.getMat();
+    Mat I1 = _I1.getMat();
+    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
+    CV_Assert( I0.size() == I1.size() );
+    CV_Assert( I0.type() == I1.type() );
+    CV_Assert( !useInitialFlow || (_flow.size() == I0.size() && _flow.type() == CV_32FC2) );
+    CV_Assert( nscales > 0 );
+    bool use_gamma = gamma != 0;
+    // allocate memory for the pyramid structure
+    dm.I0s.resize(nscales);
+    dm.I1s.resize(nscales);
+    dm.u1s.resize(nscales);
+    dm.u2s.resize(nscales);
+    dm.u3s.resize(nscales);
+    I0.convertTo(dm.I0s[0], dm.I0s[0].depth(), I0.depth() == CV_8U ? 1.0 : 255.0);
+    I1.convertTo(dm.I1s[0], dm.I1s[0].depth(), I1.depth() == CV_8U ? 1.0 : 255.0);
+    dm.u1s[0].create(I0.size());
+    dm.u2s[0].create(I0.size());
+    if (use_gamma) dm.u3s[0].create(I0.size());
+    if (useInitialFlow)
+    {
+        Mat_<float> mv[] = { dm.u1s[0], dm.u2s[0] };
+        split(_flow.getMat(), mv);
+    }
+    dm.I1x_buf.create(I0.size());
+    dm.I1y_buf.create(I0.size());
+    dm.flowMap1_buf.create(I0.size());
+    dm.flowMap2_buf.create(I0.size());
+    dm.I1w_buf.create(I0.size());
+    dm.I1wx_buf.create(I0.size());
+    dm.I1wy_buf.create(I0.size());
+    dm.grad_buf.create(I0.size());
+    dm.rho_c_buf.create(I0.size());
+    dm.v1_buf.create(I0.size());
+    dm.v2_buf.create(I0.size());
+    dm.v3_buf.create(I0.size());
+    dm.p11_buf.create(I0.size());
+    dm.p12_buf.create(I0.size());
+    dm.p21_buf.create(I0.size());
+    dm.p22_buf.create(I0.size());
+    dm.p31_buf.create(I0.size());
+    dm.p32_buf.create(I0.size());
+    dm.div_p1_buf.create(I0.size());
+    dm.div_p2_buf.create(I0.size());
+    dm.div_p3_buf.create(I0.size());
+    dm.u1x_buf.create(I0.size());
+    dm.u1y_buf.create(I0.size());
+    dm.u2x_buf.create(I0.size());
+    dm.u2y_buf.create(I0.size());
+    dm.u3x_buf.create(I0.size());
+    dm.u3y_buf.create(I0.size());
+    // create the scales
+    for (int s = 1; s < nscales; ++s)
+    {
+        resize(dm.I0s[s - 1], dm.I0s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
+        resize(dm.I1s[s - 1], dm.I1s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
+        if (dm.I0s[s].cols < 16 || dm.I0s[s].rows < 16)
+        {
+            nscales = s;
+            break;
+        }
+        if (useInitialFlow)
+        {
+            resize(dm.u1s[s - 1], dm.u1s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
+            resize(dm.u2s[s - 1], dm.u2s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
+            multiply(dm.u1s[s], Scalar::all(scaleStep), dm.u1s[s]);
+            multiply(dm.u2s[s], Scalar::all(scaleStep), dm.u2s[s]);
+        }
+        else
+        {
+            dm.u1s[s].create(dm.I0s[s].size());
+            dm.u2s[s].create(dm.I0s[s].size());
+        }
+        if (use_gamma) dm.u3s[s].create(dm.I0s[s].size());
+    }
+    if (!useInitialFlow)
+    {
+        dm.u1s[nscales - 1].setTo(Scalar::all(0));
+        dm.u2s[nscales - 1].setTo(Scalar::all(0));
+    }
+    if (use_gamma) dm.u3s[nscales - 1].setTo(Scalar::all(0));
+    // pyramidal structure for computing the optical flow
+    for (int s = nscales - 1; s >= 0; --s)
+    {
+        // compute the optical flow at the current scale
+        procOneScale(dm.I0s[s], dm.I1s[s], dm.u1s[s], dm.u2s[s], dm.u3s[s]);
+        // if this was the last scale, finish now
+        if (s == 0)
+            break;
+        // otherwise, upsample the optical flow
+        // zoom the optical flow for the next finer scale
+        resize(dm.u1s[s], dm.u1s[s - 1], dm.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
+        resize(dm.u2s[s], dm.u2s[s - 1], dm.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
+        if (use_gamma) resize(dm.u3s[s], dm.u3s[s - 1], dm.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
+        // scale the optical flow with the appropriate zoom factor (don't scale u3!)
+        multiply(dm.u1s[s - 1], Scalar::all(1 / scaleStep), dm.u1s[s - 1]);
+        multiply(dm.u2s[s - 1], Scalar::all(1 / scaleStep), dm.u2s[s - 1]);
+    }
+    Mat uxy[] = { dm.u1s[0], dm.u2s[0] };
+    merge(uxy, 2, _flow);
+}
+#ifdef HAVE_OPENCL
+bool OpticalFlowDual_TVL1::calc_ocl(InputArray _I0, InputArray _I1, InputOutputArray _flow)
+{
+    UMat I0 = _I0.getUMat();
+    UMat I1 = _I1.getUMat();
+    CV_Assert(I0.type() == CV_8UC1 || I0.type() == CV_32FC1);
+    CV_Assert(I0.size() == I1.size());
+    CV_Assert(I0.type() == I1.type());
+    CV_Assert(!useInitialFlow || (_flow.size() == I0.size() && _flow.type() == CV_32FC2));
+    CV_Assert(nscales > 0);
+    // allocate memory for the pyramid structure
+    dum.I0s.resize(nscales);
+    dum.I1s.resize(nscales);
+    dum.u1s.resize(nscales);
+    dum.u2s.resize(nscales);
+    //I0s_step == I1s_step
+    double alpha = I0.depth() == CV_8U ? 1.0 : 255.0;
+    I0.convertTo(dum.I0s[0], CV_32F, alpha);
+    I1.convertTo(dum.I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
+    dum.u1s[0].create(I0.size(), CV_32FC1);
+    dum.u2s[0].create(I0.size(), CV_32FC1);
+    if (useInitialFlow)
+    {
+        std::vector<UMat> umv;
+        umv.push_back(dum.u1s[0]);
+        umv.push_back(dum.u2s[0]);
+        cv::split(_flow,umv);
+    }
+    dum.I1x_buf.create(I0.size(), CV_32FC1);
+    dum.I1y_buf.create(I0.size(), CV_32FC1);
+    dum.I1w_buf.create(I0.size(), CV_32FC1);
+    dum.I1wx_buf.create(I0.size(), CV_32FC1);
+    dum.I1wy_buf.create(I0.size(), CV_32FC1);
+    dum.grad_buf.create(I0.size(), CV_32FC1);
+    dum.rho_c_buf.create(I0.size(), CV_32FC1);
+    dum.p11_buf.create(I0.size(), CV_32FC1);
+    dum.p12_buf.create(I0.size(), CV_32FC1);
+    dum.p21_buf.create(I0.size(), CV_32FC1);
+    dum.p22_buf.create(I0.size(), CV_32FC1);
+    dum.diff_buf.create(I0.size(), CV_32FC1);
+    // create the scales
+    for (int s = 1; s < nscales; ++s)
+    {
+        resize(dum.I0s[s - 1], dum.I0s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
+        resize(dum.I1s[s - 1], dum.I1s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
+        if (dum.I0s[s].cols < 16 || dum.I0s[s].rows < 16)
+        {
+            nscales = s;
+            break;
+        }
+        if (useInitialFlow)
+        {
+            resize(dum.u1s[s - 1], dum.u1s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
+            resize(dum.u2s[s - 1], dum.u2s[s], Size(), scaleStep, scaleStep, INTER_LINEAR);
+            //scale by scale factor
+            multiply(dum.u1s[s], Scalar::all(scaleStep), dum.u1s[s]);
+            multiply(dum.u2s[s], Scalar::all(scaleStep), dum.u2s[s]);
+        }
+    }
+    // pyramidal structure for computing the optical flow
+    for (int s = nscales - 1; s >= 0; --s)
+    {
+        // compute the optical flow at the current scale
+        if (!OpticalFlowDual_TVL1::procOneScale_ocl(dum.I0s[s], dum.I1s[s], dum.u1s[s], dum.u2s[s]))
+            return false;
+        // if this was the last scale, finish now
+        if (s == 0)
+            break;
+        // zoom the optical flow for the next finer scale
+        resize(dum.u1s[s], dum.u1s[s - 1], dum.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
+        resize(dum.u2s[s], dum.u2s[s - 1], dum.I0s[s - 1].size(), 0, 0, INTER_LINEAR);
+        // scale the optical flow with the appropriate zoom factor
+        multiply(dum.u1s[s - 1], Scalar::all(1 / scaleStep), dum.u1s[s - 1]);
+        multiply(dum.u2s[s - 1], Scalar::all(1 / scaleStep), dum.u2s[s - 1]);
+    }
+    std::vector<UMat> uxy;
+    uxy.push_back(dum.u1s[0]);
+    uxy.push_back(dum.u2s[0]);
+    merge(uxy, _flow);
+    return true;
+}
+#endif
+////////////////////////////////////////////////////////////
+// buildFlowMap
+struct BuildFlowMapBody : ParallelLoopBody
+{
+    void operator() (const Range& range) const CV_OVERRIDE;
+    Mat_<float> u1;
+    Mat_<float> u2;
+    mutable Mat_<float> map1;
+    mutable Mat_<float> map2;
+};
+void BuildFlowMapBody::operator() (const Range& range) const
+{
+    for (int y = range.start; y < range.end; ++y)
+    {
+        const float* u1Row = u1[y];
+        const float* u2Row = u2[y];
+        float* map1Row = map1[y];
+        float* map2Row = map2[y];
+        for (int x = 0; x < u1.cols; ++x)
+        {
+            map1Row[x] = x + u1Row[x];
+            map2Row[x] = y + u2Row[x];
+        }
+    }
+}
+static void buildFlowMap(const Mat_<float>& u1, const Mat_<float>& u2,
+                         Mat_<float>& map1, Mat_<float>& map2)
+{
+    CV_DbgAssert( u2.size() == u1.size() );
+    CV_DbgAssert( map1.size() == u1.size() );
+    CV_DbgAssert( map2.size() == u1.size() );
+    BuildFlowMapBody body;
+    body.u1 = u1;
+    body.u2 = u2;
+    body.map1 = map1;
+    body.map2 = map2;
+    parallel_for_(Range(0, u1.rows), body);
+}
+////////////////////////////////////////////////////////////
+// centeredGradient
+struct CenteredGradientBody : ParallelLoopBody
+{
+    void operator() (const Range& range) const CV_OVERRIDE;
+    Mat_<float> src;
+    mutable Mat_<float> dx;
+    mutable Mat_<float> dy;
+};
+void CenteredGradientBody::operator() (const Range& range) const
+{
+    const int last_col = src.cols - 1;
+    for (int y = range.start; y < range.end; ++y)
+    {
+        const float* srcPrevRow = src[y - 1];
+        const float* srcCurRow = src[y];
+        const float* srcNextRow = src[y + 1];
+        float* dxRow = dx[y];
+        float* dyRow = dy[y];
+        for (int x = 1; x < last_col; ++x)
+        {
+            dxRow[x] = 0.5f * (srcCurRow[x + 1] - srcCurRow[x - 1]);
+            dyRow[x] = 0.5f * (srcNextRow[x] - srcPrevRow[x]);
+        }
+    }
+}
+static void centeredGradient(const Mat_<float>& src, Mat_<float>& dx, Mat_<float>& dy)
+{
+    CV_DbgAssert( src.rows > 2 && src.cols > 2 );
+    CV_DbgAssert( dx.size() == src.size() );
+    CV_DbgAssert( dy.size() == src.size() );
+    const int last_row = src.rows - 1;
+    const int last_col = src.cols - 1;
+    // compute the gradient on the center body of the image
+    {
+        CenteredGradientBody body;
+        body.src = src;
+        body.dx = dx;
+        body.dy = dy;
+        parallel_for_(Range(1, last_row), body);
+    }
+    // compute the gradient on the first and last rows
+    for (int x = 1; x < last_col; ++x)
+    {
+        dx(0, x) = 0.5f * (src(0, x + 1) - src(0, x - 1));
+        dy(0, x) = 0.5f * (src(1, x) - src(0, x));
+        dx(last_row, x) = 0.5f * (src(last_row, x + 1) - src(last_row, x - 1));
+        dy(last_row, x) = 0.5f * (src(last_row, x) - src(last_row - 1, x));
+    }
+    // compute the gradient on the first and last columns
+    for (int y = 1; y < last_row; ++y)
+    {
+        dx(y, 0) = 0.5f * (src(y, 1) - src(y, 0));
+        dy(y, 0) = 0.5f * (src(y + 1, 0) - src(y - 1, 0));
+        dx(y, last_col) = 0.5f * (src(y, last_col) - src(y, last_col - 1));
+        dy(y, last_col) = 0.5f * (src(y + 1, last_col) - src(y - 1, last_col));
+    }
+    // compute the gradient at the four corners
+    dx(0, 0) = 0.5f * (src(0, 1) - src(0, 0));
+    dy(0, 0) = 0.5f * (src(1, 0) - src(0, 0));
+    dx(0, last_col) = 0.5f * (src(0, last_col) - src(0, last_col - 1));
+    dy(0, last_col) = 0.5f * (src(1, last_col) - src(0, last_col));
+    dx(last_row, 0) = 0.5f * (src(last_row, 1) - src(last_row, 0));
+    dy(last_row, 0) = 0.5f * (src(last_row, 0) - src(last_row - 1, 0));
+    dx(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row, last_col - 1));
+    dy(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row - 1, last_col));
+}
+////////////////////////////////////////////////////////////
+// forwardGradient
+struct ForwardGradientBody : ParallelLoopBody
+{
+    void operator() (const Range& range) const CV_OVERRIDE;
+    Mat_<float> src;
+    mutable Mat_<float> dx;
+    mutable Mat_<float> dy;
+};
+void ForwardGradientBody::operator() (const Range& range) const
+{
+    const int last_col = src.cols - 1;
+    for (int y = range.start; y < range.end; ++y)
+    {
+        const float* srcCurRow = src[y];
+        const float* srcNextRow = src[y + 1];
+        float* dxRow = dx[y];
+        float* dyRow = dy[y];
+        for (int x = 0; x < last_col; ++x)
+        {
+            dxRow[x] = srcCurRow[x + 1] - srcCurRow[x];
+            dyRow[x] = srcNextRow[x] - srcCurRow[x];
+        }
+    }
+}
+static void forwardGradient(const Mat_<float>& src, Mat_<float>& dx, Mat_<float>& dy)
+{
+    CV_DbgAssert( src.rows > 2 && src.cols > 2 );
+    CV_DbgAssert( dx.size() == src.size() );
+    CV_DbgAssert( dy.size() == src.size() );
+    const int last_row = src.rows - 1;
+    const int last_col = src.cols - 1;
+    // compute the gradient on the central body of the image
+    {
+        ForwardGradientBody body;
+        body.src = src;
+        body.dx = dx;
+        body.dy = dy;
+        parallel_for_(Range(0, last_row), body);
+    }
+    // compute the gradient on the last row
+    for (int x = 0; x < last_col; ++x)
+    {
+        dx(last_row, x) = src(last_row, x + 1) - src(last_row, x);
+        dy(last_row, x) = 0.0f;
+    }
+    // compute the gradient on the last column
+    for (int y = 0; y < last_row; ++y)
+    {
+        dx(y, last_col) = 0.0f;
+        dy(y, last_col) = src(y + 1, last_col) - src(y, last_col);
+    }
+    dx(last_row, last_col) = 0.0f;
+    dy(last_row, last_col) = 0.0f;
+}
+////////////////////////////////////////////////////////////
+// divergence
+struct DivergenceBody : ParallelLoopBody
+{
+    void operator() (const Range& range) const CV_OVERRIDE;
+    Mat_<float> v1;
+    Mat_<float> v2;
+    mutable Mat_<float> div;
+};
+void DivergenceBody::operator() (const Range& range) const
+{
+    for (int y = range.start; y < range.end; ++y)
+    {
+        const float* v1Row = v1[y];
+        const float* v2PrevRow = v2[y - 1];
+        const float* v2CurRow = v2[y];
+        float* divRow = div[y];
+        for(int x = 1; x < v1.cols; ++x)
+        {
+            const float v1x = v1Row[x] - v1Row[x - 1];
+            const float v2y = v2CurRow[x] - v2PrevRow[x];
+            divRow[x] = v1x + v2y;
+        }
+    }
+}
+static void divergence(const Mat_<float>& v1, const Mat_<float>& v2, Mat_<float>& div)
+{
+    CV_DbgAssert( v1.rows > 2 && v1.cols > 2 );
+    CV_DbgAssert( v2.size() == v1.size() );
+    CV_DbgAssert( div.size() == v1.size() );
+    {
+        DivergenceBody body;
+        body.v1 = v1;
+        body.v2 = v2;
+        body.div = div;
+        parallel_for_(Range(1, v1.rows), body);
+    }
+    // compute the divergence on the first row
+    for(int x = 1; x < v1.cols; ++x)
+        div(0, x) = v1(0, x) - v1(0, x - 1) + v2(0, x);
+    // compute the divergence on the first column
+    for (int y = 1; y < v1.rows; ++y)
+        div(y, 0) = v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
+    div(0, 0) = v1(0, 0) + v2(0, 0);
+}
+////////////////////////////////////////////////////////////
+// calcGradRho
+struct CalcGradRhoBody : ParallelLoopBody
+{
+    void operator() (const Range& range) const CV_OVERRIDE;
+    Mat_<float> I0;
+    Mat_<float> I1w;
+    Mat_<float> I1wx;
+    Mat_<float> I1wy;
+    Mat_<float> u1;
+    Mat_<float> u2;
+    mutable Mat_<float> grad;
+    mutable Mat_<float> rho_c;
+};
+void CalcGradRhoBody::operator() (const Range& range) const
+{
+    for (int y = range.start; y < range.end; ++y)
+    {
+        const float* I0Row = I0[y];
+        const float* I1wRow = I1w[y];
+        const float* I1wxRow = I1wx[y];
+        const float* I1wyRow = I1wy[y];
+        const float* u1Row = u1[y];
+        const float* u2Row = u2[y];
+        float* gradRow = grad[y];
+        float* rhoRow = rho_c[y];
+        for (int x = 0; x < I0.cols; ++x)
+        {
+            const float Ix2 = I1wxRow[x] * I1wxRow[x];
+            const float Iy2 = I1wyRow[x] * I1wyRow[x];
+            // store the |Grad(I1)|^2
+            gradRow[x] = Ix2 + Iy2;
+            // compute the constant part of the rho function
+            rhoRow[x] = (I1wRow[x] - I1wxRow[x] * u1Row[x] - I1wyRow[x] * u2Row[x] - I0Row[x]);
+        }
+    }
+}
+static void calcGradRho(const Mat_<float>& I0, const Mat_<float>& I1w, const Mat_<float>& I1wx, const Mat_<float>& I1wy, const Mat_<float>& u1, const Mat_<float>& u2,
+    Mat_<float>& grad, Mat_<float>& rho_c)
+{
+    CV_DbgAssert( I1w.size() == I0.size() );
+    CV_DbgAssert( I1wx.size() == I0.size() );
+    CV_DbgAssert( I1wy.size() == I0.size() );
+    CV_DbgAssert( u1.size() == I0.size() );
+    CV_DbgAssert( u2.size() == I0.size() );
+    CV_DbgAssert( grad.size() == I0.size() );
+    CV_DbgAssert( rho_c.size() == I0.size() );
+    CalcGradRhoBody body;
+    body.I0 = I0;
+    body.I1w = I1w;
+    body.I1wx = I1wx;
+    body.I1wy = I1wy;
+    body.u1 = u1;
+    body.u2 = u2;
+    body.grad = grad;
+    body.rho_c = rho_c;
+    parallel_for_(Range(0, I0.rows), body);
+}
+////////////////////////////////////////////////////////////
+// estimateV
+struct EstimateVBody : ParallelLoopBody
+{
+    void operator() (const Range& range) const CV_OVERRIDE;
+    Mat_<float> I1wx;
+    Mat_<float> I1wy;
+    Mat_<float> u1;
+    Mat_<float> u2;
+    Mat_<float> u3;
+    Mat_<float> grad;
+    Mat_<float> rho_c;
+    mutable Mat_<float> v1;
+    mutable Mat_<float> v2;
+    mutable Mat_<float> v3;
+    float l_t;
+    float gamma;
+};
+void EstimateVBody::operator() (const Range& range) const
+{
+    bool use_gamma = gamma != 0;
+    for (int y = range.start; y < range.end; ++y)
+    {
+        const float* I1wxRow = I1wx[y];
+        const float* I1wyRow = I1wy[y];
+        const float* u1Row = u1[y];
+        const float* u2Row = u2[y];
+        const float* u3Row = use_gamma?u3[y]:NULL;
+        const float* gradRow = grad[y];
+        const float* rhoRow = rho_c[y];
+        float* v1Row = v1[y];
+        float* v2Row = v2[y];
+        float* v3Row = use_gamma ? v3[y]:NULL;
+        for (int x = 0; x < I1wx.cols; ++x)
+        {
+            const float rho = use_gamma ? rhoRow[x] + (I1wxRow[x] * u1Row[x] + I1wyRow[x] * u2Row[x]) + gamma * u3Row[x] :
+                                          rhoRow[x] + (I1wxRow[x] * u1Row[x] + I1wyRow[x] * u2Row[x]);
+            float d1 = 0.0f;
+            float d2 = 0.0f;
+            float d3 = 0.0f;
+            if (rho < -l_t * gradRow[x])
+            {
+                d1 = l_t * I1wxRow[x];
+                d2 = l_t * I1wyRow[x];
+                if (use_gamma) d3 = l_t * gamma;
+            }
+            else if (rho > l_t * gradRow[x])
+            {
+                d1 = -l_t * I1wxRow[x];
+                d2 = -l_t * I1wyRow[x];
+                if (use_gamma) d3 = -l_t * gamma;
+            }
+            else if (gradRow[x] > std::numeric_limits<float>::epsilon())
+            {
+                float fi = -rho / gradRow[x];
+                d1 = fi * I1wxRow[x];
+                d2 = fi * I1wyRow[x];
+                if (use_gamma) d3 = fi * gamma;
+            }
+            v1Row[x] = u1Row[x] + d1;
+            v2Row[x] = u2Row[x] + d2;
+            if (use_gamma) v3Row[x] = u3Row[x] + d3;
+        }
+    }
+}
+static void estimateV(const Mat_<float>& I1wx, const Mat_<float>& I1wy, const Mat_<float>& u1, const Mat_<float>& u2, const Mat_<float>& u3, const Mat_<float>& grad, const Mat_<float>& rho_c,
+   Mat_<float>& v1, Mat_<float>& v2, Mat_<float>& v3, float l_t, float gamma)
+{
+    CV_DbgAssert( I1wy.size() == I1wx.size() );
+    CV_DbgAssert( u1.size() == I1wx.size() );
+    CV_DbgAssert( u2.size() == I1wx.size() );
+    CV_DbgAssert( grad.size() == I1wx.size() );
+    CV_DbgAssert( rho_c.size() == I1wx.size() );
+    CV_DbgAssert( v1.size() == I1wx.size() );
+    CV_DbgAssert( v2.size() == I1wx.size() );
+    EstimateVBody body;
+    bool use_gamma = gamma != 0;
+    body.I1wx = I1wx;
+    body.I1wy = I1wy;
+    body.u1 = u1;
+    body.u2 = u2;
+    if (use_gamma) body.u3 = u3;
+    body.grad = grad;
+    body.rho_c = rho_c;
+    body.v1 = v1;
+    body.v2 = v2;
+    if (use_gamma) body.v3 = v3;
+    body.l_t = l_t;
+    body.gamma = gamma;
+    parallel_for_(Range(0, I1wx.rows), body);
+}
+////////////////////////////////////////////////////////////
+// estimateU
+static float estimateU(const Mat_<float>& v1, const Mat_<float>& v2, const Mat_<float>& v3,
+            const Mat_<float>& div_p1, const Mat_<float>& div_p2, const Mat_<float>& div_p3,
+            Mat_<float>& u1, Mat_<float>& u2, Mat_<float>& u3,
+            float theta, float gamma)
+{
+    CV_DbgAssert( v2.size() == v1.size() );
+    CV_DbgAssert( div_p1.size() == v1.size() );
+    CV_DbgAssert( div_p2.size() == v1.size() );
+    CV_DbgAssert( u1.size() == v1.size() );
+    CV_DbgAssert( u2.size() == v1.size() );
+    float error = 0.0f;
+    bool use_gamma = gamma != 0;
+    for (int y = 0; y < v1.rows; ++y)
+    {
+        const float* v1Row = v1[y];
+        const float* v2Row = v2[y];
+        const float* v3Row = use_gamma?v3[y]:NULL;
+        const float* divP1Row = div_p1[y];
+        const float* divP2Row = div_p2[y];
+        const float* divP3Row = use_gamma?div_p3[y]:NULL;
+        float* u1Row = u1[y];
+        float* u2Row = u2[y];
+        float* u3Row = use_gamma?u3[y]:NULL;
+        for (int x = 0; x < v1.cols; ++x)
+        {
+            const float u1k = u1Row[x];
+            const float u2k = u2Row[x];
+            const float u3k = use_gamma?u3Row[x]:0;
+            u1Row[x] = v1Row[x] + theta * divP1Row[x];
+            u2Row[x] = v2Row[x] + theta * divP2Row[x];
+            if (use_gamma) u3Row[x] = v3Row[x] + theta * divP3Row[x];
+            error += use_gamma?(u1Row[x] - u1k) * (u1Row[x] - u1k) + (u2Row[x] - u2k) * (u2Row[x] - u2k) + (u3Row[x] - u3k) * (u3Row[x] - u3k):
+                               (u1Row[x] - u1k) * (u1Row[x] - u1k) + (u2Row[x] - u2k) * (u2Row[x] - u2k);
+        }
+    }
+    return error;
+}
+////////////////////////////////////////////////////////////
+// estimateDualVariables
+struct EstimateDualVariablesBody : ParallelLoopBody
+{
+    void operator() (const Range& range) const CV_OVERRIDE;
+    Mat_<float> u1x;
+    Mat_<float> u1y;
+    Mat_<float> u2x;
+    Mat_<float> u2y;
+    Mat_<float> u3x;
+    Mat_<float> u3y;
+    mutable Mat_<float> p11;
+    mutable Mat_<float> p12;
+    mutable Mat_<float> p21;
+    mutable Mat_<float> p22;
+    mutable Mat_<float> p31;
+    mutable Mat_<float> p32;
+    float taut;
+    bool use_gamma;
+};
+void EstimateDualVariablesBody::operator() (const Range& range) const
+{
+    for (int y = range.start; y < range.end; ++y)
+    {
+        const float* u1xRow = u1x[y];
+        const float* u1yRow = u1y[y];
+        const float* u2xRow = u2x[y];
+        const float* u2yRow = u2y[y];
+        const float* u3xRow = u3x[y];
+        const float* u3yRow = u3y[y];
+        float* p11Row = p11[y];
+        float* p12Row = p12[y];
+        float* p21Row = p21[y];
+        float* p22Row = p22[y];
+        float* p31Row = p31[y];
+        float* p32Row = p32[y];
+        for (int x = 0; x < u1x.cols; ++x)
+        {
+            const float g1 = static_cast<float>(hypot(u1xRow[x], u1yRow[x]));
+            const float g2 = static_cast<float>(hypot(u2xRow[x], u2yRow[x]));
+            const float ng1  = 1.0f + taut * g1;
+            const float ng2 =  1.0f + taut * g2;
+            p11Row[x] = (p11Row[x] + taut * u1xRow[x]) / ng1;
+            p12Row[x] = (p12Row[x] + taut * u1yRow[x]) / ng1;
+            p21Row[x] = (p21Row[x] + taut * u2xRow[x]) / ng2;
+            p22Row[x] = (p22Row[x] + taut * u2yRow[x]) / ng2;
+            if (use_gamma)
+            {
+                const float g3 = static_cast<float>(hypot(u3xRow[x], u3yRow[x]));
+                const float ng3 = 1.0f + taut * g3;
+                p31Row[x] = (p31Row[x] + taut * u3xRow[x]) / ng3;
+                p32Row[x] = (p32Row[x] + taut * u3yRow[x]) / ng3;
+            }
+        }
+    }
+}
+static void estimateDualVariables(const Mat_<float>& u1x, const Mat_<float>& u1y,
+                     const Mat_<float>& u2x, const Mat_<float>& u2y,
+                     const Mat_<float>& u3x, const Mat_<float>& u3y,
+                           Mat_<float>& p11, Mat_<float>& p12,
+                     Mat_<float>& p21, Mat_<float>& p22,
+                     Mat_<float>& p31, Mat_<float>& p32,
+                     float taut, bool use_gamma)
+{
+    CV_DbgAssert( u1y.size() == u1x.size() );
+    CV_DbgAssert( u2x.size() == u1x.size() );
+    CV_DbgAssert( u3x.size() == u1x.size() );
+    CV_DbgAssert( u2y.size() == u1x.size() );
+    CV_DbgAssert( u3y.size() == u1x.size() );
+    CV_DbgAssert( p11.size() == u1x.size() );
+    CV_DbgAssert( p12.size() == u1x.size() );
+    CV_DbgAssert( p21.size() == u1x.size() );
+    CV_DbgAssert( p22.size() == u1x.size() );
+    CV_DbgAssert( p31.size() == u1x.size() );
+    CV_DbgAssert( p32.size() == u1x.size() );
+    EstimateDualVariablesBody body;
+    body.u1x = u1x;
+    body.u1y = u1y;
+    body.u2x = u2x;
+    body.u2y = u2y;
+    body.u3x = u3x;
+    body.u3y = u3y;
+    body.p11 = p11;
+    body.p12 = p12;
+    body.p21 = p21;
+    body.p22 = p22;
+    body.p31 = p31;
+    body.p32 = p32;
+    body.taut = taut;
+    body.use_gamma = use_gamma;
+    parallel_for_(Range(0, u1x.rows), body);
+}
+#ifdef HAVE_OPENCL
+bool OpticalFlowDual_TVL1::procOneScale_ocl(const UMat& I0, const UMat& I1, UMat& u1, UMat& u2)
+{
+    using namespace cv_ocl_tvl1flow;
+    const double scaledEpsilon = epsilon * epsilon * I0.size().area();
+    CV_DbgAssert(I1.size() == I0.size());
+    CV_DbgAssert(I1.type() == I0.type());
+    CV_DbgAssert(u1.empty() || u1.size() == I0.size());
+    CV_DbgAssert(u2.size() == u1.size());
+    if (u1.empty())
+    {
+        u1.create(I0.size(), CV_32FC1);
+        u1.setTo(Scalar::all(0));
+        u2.create(I0.size(), CV_32FC1);
+        u2.setTo(Scalar::all(0));
+    }
+    UMat I1x = dum.I1x_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat I1y = dum.I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+    if (!centeredGradient(I1, I1x, I1y))
+        return false;
+    UMat I1w = dum.I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat I1wx = dum.I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat I1wy = dum.I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat grad = dum.grad_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat rho_c = dum.rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat p11 = dum.p11_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat p12 = dum.p12_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat p21 = dum.p21_buf(Rect(0, 0, I0.cols, I0.rows));
+    UMat p22 = dum.p22_buf(Rect(0, 0, I0.cols, I0.rows));
+    p11.setTo(Scalar::all(0));
+    p12.setTo(Scalar::all(0));
+    p21.setTo(Scalar::all(0));
+    p22.setTo(Scalar::all(0));
+    UMat diff = dum.diff_buf(Rect(0, 0, I0.cols, I0.rows));
+    const float l_t = static_cast<float>(lambda * theta);
+    const float taut = static_cast<float>(tau / theta);
+    int n;
+    for (int warpings = 0; warpings < warps; ++warpings)
+    {
+        if (!warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c))
+            return false;
+        double error = std::numeric_limits<double>::max();
+        double prev_error = 0;
+        for (int n_outer = 0; error > scaledEpsilon && n_outer < outerIterations; ++n_outer)
+        {
+            if (medianFiltering > 1) {
+                cv::medianBlur(u1, u1, medianFiltering);
+                cv::medianBlur(u2, u2, medianFiltering);
+            }
+            for (int n_inner = 0; error > scaledEpsilon && n_inner < innerIterations; ++n_inner)
+            {
+                // some tweaks to make sum operation less frequently
+                n = n_inner + n_outer*innerIterations;
+                char calc_error = (n & 0x1) && (prev_error < scaledEpsilon);
+                if (!estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22,
+                    u1, u2, diff, l_t, static_cast<float>(theta), calc_error))
+                    return false;
+                if (calc_error)
+                {
+                    error = cv::sum(diff)[0];
+                    prev_error = error;
+                }
+                else
+                {
+                    error = std::numeric_limits<double>::max();
+                    prev_error -= scaledEpsilon;
+                }
+                if (!estimateDualVariables(u1, u2, p11, p12, p21, p22, taut))
+                    return false;
+            }
+        }
+    }
+    return true;
+}
+#endif
+void OpticalFlowDual_TVL1::procOneScale(const Mat_<float>& I0, const Mat_<float>& I1, Mat_<float>& u1, Mat_<float>& u2, Mat_<float>& u3)
+{
+    const float scaledEpsilon = static_cast<float>(epsilon * epsilon * I0.size().area());
+    CV_DbgAssert( I1.size() == I0.size() );
+    CV_DbgAssert( I1.type() == I0.type() );
+    CV_DbgAssert( u1.size() == I0.size() );
+    CV_DbgAssert( u2.size() == u1.size() );
+    Mat_<float> I1x = dm.I1x_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1y = dm.I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+    centeredGradient(I1, I1x, I1y);
+    Mat_<float> flowMap1 = dm.flowMap1_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> flowMap2 = dm.flowMap2_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1w = dm.I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1wx = dm.I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> I1wy = dm.I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> grad = dm.grad_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> rho_c = dm.rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> v1 = dm.v1_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> v2 = dm.v2_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> v3 = dm.v3_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p11 = dm.p11_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p12 = dm.p12_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p21 = dm.p21_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p22 = dm.p22_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p31 = dm.p31_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> p32 = dm.p32_buf(Rect(0, 0, I0.cols, I0.rows));
+    p11.setTo(Scalar::all(0));
+    p12.setTo(Scalar::all(0));
+    p21.setTo(Scalar::all(0));
+    p22.setTo(Scalar::all(0));
+    bool use_gamma = gamma != 0.;
+    if (use_gamma) p31.setTo(Scalar::all(0));
+    if (use_gamma) p32.setTo(Scalar::all(0));
+    Mat_<float> div_p1 = dm.div_p1_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> div_p2 = dm.div_p2_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> div_p3 = dm.div_p3_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u1x = dm.u1x_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u1y = dm.u1y_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u2x = dm.u2x_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u2y = dm.u2y_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u3x = dm.u3x_buf(Rect(0, 0, I0.cols, I0.rows));
+    Mat_<float> u3y = dm.u3y_buf(Rect(0, 0, I0.cols, I0.rows));
+    const float l_t = static_cast<float>(lambda * theta);
+    const float taut = static_cast<float>(tau / theta);
+    for (int warpings = 0; warpings < warps; ++warpings)
+    {
+        // compute the warping of the target image and its derivatives
+        buildFlowMap(u1, u2, flowMap1, flowMap2);
+        remap(I1, I1w, flowMap1, flowMap2, INTER_CUBIC);
+        remap(I1x, I1wx, flowMap1, flowMap2, INTER_CUBIC);
+        remap(I1y, I1wy, flowMap1, flowMap2, INTER_CUBIC);
+        //calculate I1(x+u0) and its gradient
+        calcGradRho(I0, I1w, I1wx, I1wy, u1, u2, grad, rho_c);
+        float error = std::numeric_limits<float>::max();
+        for (int n_outer = 0; error > scaledEpsilon && n_outer < outerIterations; ++n_outer)
+        {
+            if (medianFiltering > 1) {
+                cv::medianBlur(u1, u1, medianFiltering);
+                cv::medianBlur(u2, u2, medianFiltering);
+            }
+            for (int n_inner = 0; error > scaledEpsilon && n_inner < innerIterations; ++n_inner)
+            {
+                // estimate the values of the variable (v1, v2) (thresholding operator TH)
+                estimateV(I1wx, I1wy, u1, u2, u3, grad, rho_c, v1, v2, v3, l_t, static_cast<float>(gamma));
+                // compute the divergence of the dual variable (p1, p2, p3)
+                divergence(p11, p12, div_p1);
+                divergence(p21, p22, div_p2);
+                if (use_gamma) divergence(p31, p32, div_p3);
+                // estimate the values of the optical flow (u1, u2)
+                error = estimateU(v1, v2, v3, div_p1, div_p2, div_p3, u1, u2, u3, static_cast<float>(theta), static_cast<float>(gamma));
+                // compute the gradient of the optical flow (Du1, Du2)
+                forwardGradient(u1, u1x, u1y);
+                forwardGradient(u2, u2x, u2y);
+                if (use_gamma) forwardGradient(u3, u3x, u3y);
+                // estimate the values of the dual variable (p1, p2, p3)
+                estimateDualVariables(u1x, u1y, u2x, u2y, u3x, u3y, p11, p12, p21, p22, p31, p32, taut, use_gamma);
+            }
+        }
+    }
+}
+void OpticalFlowDual_TVL1::collectGarbage()
+{
+    //dataMat structure dm
+    dm.I0s.clear();
+    dm.I1s.clear();
+    dm.u1s.clear();
+    dm.u2s.clear();
+    dm.I1x_buf.release();
+    dm.I1y_buf.release();
+    dm.flowMap1_buf.release();
+    dm.flowMap2_buf.release();
+    dm.I1w_buf.release();
+    dm.I1wx_buf.release();
+    dm.I1wy_buf.release();
+    dm.grad_buf.release();
+    dm.rho_c_buf.release();
+    dm.v1_buf.release();
+    dm.v2_buf.release();
+    dm.p11_buf.release();
+    dm.p12_buf.release();
+    dm.p21_buf.release();
+    dm.p22_buf.release();
+    dm.div_p1_buf.release();
+    dm.div_p2_buf.release();
+    dm.u1x_buf.release();
+    dm.u1y_buf.release();
+    dm.u2x_buf.release();
+    dm.u2y_buf.release();
+#ifdef HAVE_OPENCL
+    //dataUMat structure dum
+    dum.I0s.clear();
+    dum.I1s.clear();
+    dum.u1s.clear();
+    dum.u2s.clear();
+    dum.I1x_buf.release();
+    dum.I1y_buf.release();
+    dum.I1w_buf.release();
+    dum.I1wx_buf.release();
+    dum.I1wy_buf.release();
+    dum.grad_buf.release();
+    dum.rho_c_buf.release();
+    dum.p11_buf.release();
+    dum.p12_buf.release();
+    dum.p21_buf.release();
+    dum.p22_buf.release();
+    dum.diff_buf.release();
+    dum.norm_buf.release();
+#endif
+}
+Ptr<DualTVL1OpticalFlow> createOptFlow_DualTVL1()
+{
+    return makePtr<OpticalFlowDual_TVL1>();
+}
+Ptr<DualTVL1OpticalFlow> DualTVL1OpticalFlow::create(
+    double tau, double lambda, double theta, int nscales, int warps,
+    double epsilon, int innerIterations, int outerIterations, double scaleStep,
+    double gamma, int medianFilter, bool useInitialFlow)
+{
+    return makePtr<OpticalFlowDual_TVL1>(tau, lambda, theta, nscales, warps,
+                                         epsilon, innerIterations, outerIterations,
+                                         scaleStep, gamma, medianFilter, useInitialFlow);
+}
+}
+}
--- a/modules/optflow/src/variational_refinement.cpp
+++ b/modules/optflow/src/variational_refinement.cpp
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-#include "opencv2/core/hal/intrin.hpp"
-using namespace std;
-namespace cv
-{
-namespace optflow
-{
-class VariationalRefinementImpl CV_FINAL : public VariationalRefinement
-{
-  public:
-    VariationalRefinementImpl();
-    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
-    void calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v) CV_OVERRIDE;
-    void collectGarbage() CV_OVERRIDE;
-  protected: //!< algorithm parameters
-    int fixedPointIterations, sorIterations;
-    float omega;
-    float alpha, delta, gamma;
-    float zeta, epsilon;
-  public:
-    int getFixedPointIterations() const CV_OVERRIDE { return fixedPointIterations; }
-    void setFixedPointIterations(int val) CV_OVERRIDE { fixedPointIterations = val; }
-    int getSorIterations() const CV_OVERRIDE { return sorIterations; }
-    void setSorIterations(int val) CV_OVERRIDE { sorIterations = val; }
-    float getOmega() const CV_OVERRIDE { return omega; }
-    void setOmega(float val) CV_OVERRIDE { omega = val; }
-    float getAlpha() const CV_OVERRIDE { return alpha; }
-    void setAlpha(float val) CV_OVERRIDE { alpha = val; }
-    float getDelta() const CV_OVERRIDE { return delta; }
-    void setDelta(float val) CV_OVERRIDE { delta = val; }
-    float getGamma() const CV_OVERRIDE { return gamma; }
-    void setGamma(float val) CV_OVERRIDE { gamma = val; }
-  protected: //!< internal buffers
-    /* This struct defines a special data layout for Mat_<float>. Original buffer is split into two: one for "red"
-     * elements (sum of indices is even) and one for "black" (sum of indices is odd) in a checkerboard pattern. It
-     * allows for more efficient processing in SOR iterations, more natural SIMD vectorization and parallelization
-     * (Red-Black SOR). Additionally, it simplifies border handling by adding repeated borders to both red and
-     * black buffers.
-     */
-    struct RedBlackBuffer
-    {
-        Mat_<float> red;   //!< (i+j)%2==0
-        Mat_<float> black; //!< (i+j)%2==1
-        /* Width of even and odd rows may be different */
-        int red_even_len, red_odd_len;
-        int black_even_len, black_odd_len;
-        void create(Size s);
-        void release();
-    };
-    Mat_<float> Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz;                            //!< image derivative buffers
-    RedBlackBuffer Ix_rb, Iy_rb, Iz_rb, Ixx_rb, Ixy_rb, Iyy_rb, Ixz_rb, Iyz_rb; //!< corresponding red-black buffers
-    RedBlackBuffer A11, A12, A22, b1, b2; //!< main linear system coefficients
-    RedBlackBuffer weights;               //!< smoothness term weights in the current fixed point iteration
-    Mat_<float> mapX, mapY; //!< auxiliary buffers for remapping
-    RedBlackBuffer tempW_u, tempW_v; //!< flow buffers that are modified in each fixed point iteration
-    RedBlackBuffer dW_u, dW_v;       //!< optical flow increment
-    RedBlackBuffer W_u_rb, W_v_rb;   //!< red-black-buffer version of the input flow
-  private: //!< private methods and parallel sections
-    void splitCheckerboard(RedBlackBuffer &dst, Mat &src);
-    void mergeCheckerboard(Mat &dst, RedBlackBuffer &src);
-    void updateRepeatedBorders(RedBlackBuffer &dst);
-    void warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &flow_v);
-    void prepareBuffers(Mat &I0, Mat &I1, Mat &W_u, Mat &W_v);
-    /* Parallelizing arbitrary operations with 3 input/output arguments */
-    typedef void (VariationalRefinementImpl::*Op)(void *op1, void *op2, void *op3);
-    struct ParallelOp_ParBody : public ParallelLoopBody
-    {
-        VariationalRefinementImpl *var;
-        vector<Op> ops;
-        vector<void *> op1s;
-        vector<void *> op2s;
-        vector<void *> op3s;
-        ParallelOp_ParBody(VariationalRefinementImpl &_var, vector<Op> _ops, vector<void *> &_op1s,
-                           vector<void *> &_op2s, vector<void *> &_op3s);
-        void operator()(const Range &range) const CV_OVERRIDE;
-    };
-    void gradHorizAndSplitOp(void *src, void *dst, void *dst_split)
-    {
-        Sobel(*(Mat *)src, *(Mat *)dst, -1, 1, 0, 1, 1, 0.00, BORDER_REPLICATE);
-        splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst);
-    }
-    void gradVertAndSplitOp(void *src, void *dst, void *dst_split)
-    {
-        Sobel(*(Mat *)src, *(Mat *)dst, -1, 0, 1, 1, 1, 0.00, BORDER_REPLICATE);
-        splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst);
-    }
-    void averageOp(void *src1, void *src2, void *dst)
-    {
-        addWeighted(*(Mat *)src1, 0.5, *(Mat *)src2, 0.5, 0.0, *(Mat *)dst, CV_32F);
-    }
-    void subtractOp(void *src1, void *src2, void *dst)
-    {
-        subtract(*(Mat *)src1, *(Mat *)src2, *(Mat *)dst, noArray(), CV_32F);
-    }
-    struct ComputeDataTerm_ParBody : public ParallelLoopBody
-    {
-        VariationalRefinementImpl *var;
-        int nstripes, stripe_sz;
-        int h;
-        RedBlackBuffer *dW_u, *dW_v;
-        bool red_pass;
-        ComputeDataTerm_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_dW_u,
-                                RedBlackBuffer &_dW_v, bool _red_pass);
-        void operator()(const Range &range) const CV_OVERRIDE;
-    };
-    struct ComputeSmoothnessTermHorPass_ParBody : public ParallelLoopBody
-    {
-        VariationalRefinementImpl *var;
-        int nstripes, stripe_sz;
-        int h;
-        RedBlackBuffer *W_u, *W_v, *curW_u, *curW_v;
-        bool red_pass;
-        ComputeSmoothnessTermHorPass_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h,
-                                             RedBlackBuffer &_W_u, RedBlackBuffer &_W_v, RedBlackBuffer &_tempW_u,
-                                             RedBlackBuffer &_tempW_v, bool _red_pass);
-        void operator()(const Range &range) const CV_OVERRIDE;
-    };
-    struct ComputeSmoothnessTermVertPass_ParBody : public ParallelLoopBody
-    {
-        VariationalRefinementImpl *var;
-        int nstripes, stripe_sz;
-        int h;
-        RedBlackBuffer *W_u, *W_v;
-        bool red_pass;
-        ComputeSmoothnessTermVertPass_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h,
-                                              RedBlackBuffer &W_u, RedBlackBuffer &_W_v, bool _red_pass);
-        void operator()(const Range &range) const CV_OVERRIDE;
-    };
-    struct RedBlackSOR_ParBody : public ParallelLoopBody
-    {
-        VariationalRefinementImpl *var;
-        int nstripes, stripe_sz;
-        int h;
-        RedBlackBuffer *dW_u, *dW_v;
-        bool red_pass;
-        RedBlackSOR_ParBody(VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_dW_u,
-                            RedBlackBuffer &_dW_v, bool _red_pass);
-        void operator()(const Range &range) const CV_OVERRIDE;
-    };
-};
-VariationalRefinementImpl::VariationalRefinementImpl()
-{
-    fixedPointIterations = 5;
-    sorIterations = 5;
-    alpha = 20.0f;
-    delta = 5.0f;
-    gamma = 10.0f;
-    omega = 1.6f;
-    zeta = 0.1f;
-    epsilon = 0.001f;
-}
-/* This function converts an input Mat into the RedBlackBuffer format, which involves
- * splitting the input buffer into two and adding repeated borders. Assumes that enough
- * memory in dst is already allocated.
- */
-void VariationalRefinementImpl::splitCheckerboard(RedBlackBuffer &dst, Mat &src)
-{
-    int buf_j, j;
-    int buf_w = (int)ceil(src.cols / 2.0) + 2; //!< max width of red/black buffers with borders
-    /* Rows of red and black buffers can have different actual width, some extra repeated values are
-     * added for padding in such cases.
-     */
-    for (int i = 0; i < src.rows; i++)
-    {
-        float *src_buf = src.ptr<float>(i);
-        float *r_buf = dst.red.ptr<float>(i + 1);
-        float *b_buf = dst.black.ptr<float>(i + 1);
-        r_buf[0] = b_buf[0] = src_buf[0];
-        buf_j = 1;
-        if (i % 2 == 0)
-        {
-            for (j = 0; j < src.cols - 1; j += 2)
-            {
-                r_buf[buf_j] = src_buf[j];
-                b_buf[buf_j] = src_buf[j + 1];
-                buf_j++;
-            }
-            if (j < src.cols)
-                r_buf[buf_j] = b_buf[buf_j] = src_buf[j];
-            else
-                j--;
-        }
-        else
-        {
-            for (j = 0; j < src.cols - 1; j += 2)
-            {
-                b_buf[buf_j] = src_buf[j];
-                r_buf[buf_j] = src_buf[j + 1];
-                buf_j++;
-            }
-            if (j < src.cols)
-                r_buf[buf_j] = b_buf[buf_j] = src_buf[j];
-            else
-                j--;
-        }
-        r_buf[buf_w - 1] = b_buf[buf_w - 1] = src_buf[j];
-    }
-    /* Fill top and bottom borders: */
-    {
-        float *r_buf_border = dst.red.ptr<float>(dst.red.rows - 1);
-        float *b_buf_border = dst.black.ptr<float>(dst.black.rows - 1);
-        float *r_buf = dst.red.ptr<float>(dst.red.rows - 2);
-        float *b_buf = dst.black.ptr<float>(dst.black.rows - 2);
-        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
-        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
-    }
-    {
-        float *r_buf_border = dst.red.ptr<float>(0);
-        float *b_buf_border = dst.black.ptr<float>(0);
-        float *r_buf = dst.red.ptr<float>(1);
-        float *b_buf = dst.black.ptr<float>(1);
-        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
-        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
-    }
-}
-/* The inverse of splitCheckerboard, i.e. converting the RedBlackBuffer back into Mat.
- * Assumes that enough memory in dst is already allocated.
- */
-void VariationalRefinementImpl::mergeCheckerboard(Mat &dst, RedBlackBuffer &src)
-{
-    int buf_j, j;
-    for (int i = 0; i < dst.rows; i++)
-    {
-        float *src_r_buf = src.red.ptr<float>(i + 1);
-        float *src_b_buf = src.black.ptr<float>(i + 1);
-        float *dst_buf = dst.ptr<float>(i);
-        buf_j = 1;
-        if (i % 2 == 0)
-        {
-            for (j = 0; j < dst.cols - 1; j += 2)
-            {
-                dst_buf[j] = src_r_buf[buf_j];
-                dst_buf[j + 1] = src_b_buf[buf_j];
-                buf_j++;
-            }
-            if (j < dst.cols)
-                dst_buf[j] = src_r_buf[buf_j];
-        }
-        else
-        {
-            for (j = 0; j < dst.cols - 1; j += 2)
-            {
-                dst_buf[j] = src_b_buf[buf_j];
-                dst_buf[j + 1] = src_r_buf[buf_j];
-                buf_j++;
-            }
-            if (j < dst.cols)
-                dst_buf[j] = src_b_buf[buf_j];
-        }
-    }
-}
-/* An auxiliary function that updates the borders. Used to enforce that border values repeat
- * the ones adjacent to the border.
- */
-void VariationalRefinementImpl::updateRepeatedBorders(RedBlackBuffer &dst)
-{
-    int buf_w = dst.red.cols;
-    for (int i = 0; i < dst.red.rows - 2; i++)
-    {
-        float *r_buf = dst.red.ptr<float>(i + 1);
-        float *b_buf = dst.black.ptr<float>(i + 1);
-        if (i % 2 == 0)
-        {
-            b_buf[0] = r_buf[1];
-            if (dst.red_even_len > dst.black_even_len)
-                b_buf[dst.black_even_len + 1] = r_buf[dst.red_even_len];
-            else
-                r_buf[dst.red_even_len + 1] = b_buf[dst.black_even_len];
-        }
-        else
-        {
-            r_buf[0] = b_buf[1];
-            if (dst.red_odd_len < dst.black_odd_len)
-                r_buf[dst.red_odd_len + 1] = b_buf[dst.black_odd_len];
-            else
-                b_buf[dst.black_odd_len + 1] = r_buf[dst.red_odd_len];
-        }
-    }
-    {
-        float *r_buf_border = dst.red.ptr<float>(dst.red.rows - 1);
-        float *b_buf_border = dst.black.ptr<float>(dst.black.rows - 1);
-        float *r_buf = dst.red.ptr<float>(dst.red.rows - 2);
-        float *b_buf = dst.black.ptr<float>(dst.black.rows - 2);
-        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
-        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
-    }
-    {
-        float *r_buf_border = dst.red.ptr<float>(0);
-        float *b_buf_border = dst.black.ptr<float>(0);
-        float *r_buf = dst.red.ptr<float>(1);
-        float *b_buf = dst.black.ptr<float>(1);
-        memcpy(r_buf_border, b_buf, buf_w * sizeof(float));
-        memcpy(b_buf_border, r_buf, buf_w * sizeof(float));
-    }
-}
-void VariationalRefinementImpl::RedBlackBuffer::create(Size s)
-{
-    /* Allocate enough memory to include borders */
-    int w = (int)ceil(s.width / 2.0) + 2;
-    red.create(s.height + 2, w);
-    black.create(s.height + 2, w);
-    if (s.width % 2 == 0)
-        red_even_len = red_odd_len = black_even_len = black_odd_len = w - 2;
-    else
-    {
-        red_even_len = black_odd_len = w - 2;
-        red_odd_len = black_even_len = w - 3;
-    }
-}
-void VariationalRefinementImpl::RedBlackBuffer::release()
-{
-    red.release();
-    black.release();
-}
-VariationalRefinementImpl::ParallelOp_ParBody::ParallelOp_ParBody(VariationalRefinementImpl &_var, vector<Op> _ops,
-                                                                  vector<void *> &_op1s, vector<void *> &_op2s,
-                                                                  vector<void *> &_op3s)
-    : var(&_var), ops(_ops), op1s(_op1s), op2s(_op2s), op3s(_op3s)
-{
-}
-void VariationalRefinementImpl::ParallelOp_ParBody::operator()(const Range &range) const
-{
-    for (int i = range.start; i < range.end; i++)
-        (var->*ops[i])(op1s[i], op2s[i], op3s[i]);
-}
-void VariationalRefinementImpl::warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &flow_v)
-{
-    for (int i = 0; i < flow_u.rows; i++)
-    {
-        float *pFlowU = flow_u.ptr<float>(i);
-        float *pFlowV = flow_v.ptr<float>(i);
-        float *pMapX = mapX.ptr<float>(i);
-        float *pMapY = mapY.ptr<float>(i);
-        for (int j = 0; j < flow_u.cols; j++)
-        {
-            pMapX[j] = j + pFlowU[j];
-            pMapY[j] = i + pFlowV[j];
-        }
-    }
-    remap(src, dst, mapX, mapY, INTER_LINEAR, BORDER_REPLICATE);
-}
-void VariationalRefinementImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &W_u, Mat &W_v)
-{
-    Size s = I0.size();
-    A11.create(s);
-    A12.create(s);
-    A22.create(s);
-    b1.create(s);
-    b2.create(s);
-    weights.create(s);
-    weights.red.setTo(0.0f);
-    weights.black.setTo(0.0f);
-    tempW_u.create(s);
-    tempW_v.create(s);
-    dW_u.create(s);
-    dW_v.create(s);
-    W_u_rb.create(s);
-    W_v_rb.create(s);
-    Ix.create(s);
-    Iy.create(s);
-    Iz.create(s);
-    Ixx.create(s);
-    Ixy.create(s);
-    Iyy.create(s);
-    Ixz.create(s);
-    Iyz.create(s);
-    Ix_rb.create(s);
-    Iy_rb.create(s);
-    Iz_rb.create(s);
-    Ixx_rb.create(s);
-    Ixy_rb.create(s);
-    Iyy_rb.create(s);
-    Ixz_rb.create(s);
-    Iyz_rb.create(s);
-    mapX.create(s);
-    mapY.create(s);
-    /* Floating point warps work significantly better than fixed-point */
-    Mat I1flt, warpedI;
-    I1.convertTo(I1flt, CV_32F);
-    warpImage(warpedI, I1flt, W_u, W_v);
-    /* Computing an average of the current and warped next frames (to compute the derivatives on) and
-     * temporal derivative Iz
-     */
-    Mat averagedI;
-    {
-        vector<void *> op1s;
-        op1s.push_back((void *)&I0);
-        op1s.push_back((void *)&warpedI);
-        vector<void *> op2s;
-        op2s.push_back((void *)&warpedI);
-        op2s.push_back((void *)&I0);
-        vector<void *> op3s;
-        op3s.push_back((void *)&averagedI);
-        op3s.push_back((void *)&Iz);
-        vector<Op> ops;
-        ops.push_back(&VariationalRefinementImpl::averageOp);
-        ops.push_back(&VariationalRefinementImpl::subtractOp);
-        parallel_for_(Range(0, 2), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
-    }
-    splitCheckerboard(Iz_rb, Iz);
-    /* Computing first-order derivatives */
-    {
-        vector<void *> op1s;
-        op1s.push_back((void *)&averagedI);
-        op1s.push_back((void *)&averagedI);
-        op1s.push_back((void *)&Iz);
-        op1s.push_back((void *)&Iz);
-        vector<void *> op2s;
-        op2s.push_back((void *)&Ix);
-        op2s.push_back((void *)&Iy);
-        op2s.push_back((void *)&Ixz);
-        op2s.push_back((void *)&Iyz);
-        vector<void *> op3s;
-        op3s.push_back((void *)&Ix_rb);
-        op3s.push_back((void *)&Iy_rb);
-        op3s.push_back((void *)&Ixz_rb);
-        op3s.push_back((void *)&Iyz_rb);
-        vector<Op> ops;
-        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
-        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
-        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
-        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
-        parallel_for_(Range(0, 4), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
-    }
-    /* Computing second-order derivatives */
-    {
-        vector<void *> op1s;
-        op1s.push_back((void *)&Ix);
-        op1s.push_back((void *)&Ix);
-        op1s.push_back((void *)&Iy);
-        vector<void *> op2s;
-        op2s.push_back((void *)&Ixx);
-        op2s.push_back((void *)&Ixy);
-        op2s.push_back((void *)&Iyy);
-        vector<void *> op3s;
-        op3s.push_back((void *)&Ixx_rb);
-        op3s.push_back((void *)&Ixy_rb);
-        op3s.push_back((void *)&Iyy_rb);
-        vector<Op> ops;
-        ops.push_back(&VariationalRefinementImpl::gradHorizAndSplitOp);
-        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
-        ops.push_back(&VariationalRefinementImpl::gradVertAndSplitOp);
-        parallel_for_(Range(0, 3), ParallelOp_ParBody(*this, ops, op1s, op2s, op3s));
-    }
-}
-VariationalRefinementImpl::ComputeDataTerm_ParBody::ComputeDataTerm_ParBody(VariationalRefinementImpl &_var,
-                                                                            int _nstripes, int _h,
-                                                                            RedBlackBuffer &_dW_u,
-                                                                            RedBlackBuffer &_dW_v, bool _red_pass)
-    : var(&_var), nstripes(_nstripes), h(_h), dW_u(&_dW_u), dW_v(&_dW_v), red_pass(_red_pass)
-{
-    stripe_sz = (int)ceil(h / (double)nstripes);
-}
-/* This function computes parts of the main linear system coefficients A11,A12,A22,b1,b1
- * that correspond to the data term, which includes color and gradient constancy assumptions.
- */
-void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range &range) const
-{
-    int start_i = min(range.start * stripe_sz, h);
-    int end_i = min(range.end * stripe_sz, h);
-    float zeta_squared = var->zeta * var->zeta;
-    float epsilon_squared = var->epsilon * var->epsilon;
-    float gamma2 = var->gamma / 2;
-    float delta2 = var->delta / 2;
-    float *pIx, *pIy, *pIz;
-    float *pIxx, *pIxy, *pIyy, *pIxz, *pIyz;
-    float *pdU, *pdV;
-    float *pa11, *pa12, *pa22, *pb1, *pb2;
-    float derivNorm, derivNorm2;
-    float Ik1z, Ik1zx, Ik1zy;
-    float weight;
-    int len;
-    for (int i = start_i; i < end_i; i++)
-    {
-#define INIT_ROW_POINTERS(color)                                                                                       \
-    pIx = var->Ix_rb.color.ptr<float>(i + 1) + 1;                                                                      \
-    pIy = var->Iy_rb.color.ptr<float>(i + 1) + 1;                                                                      \
-    pIz = var->Iz_rb.color.ptr<float>(i + 1) + 1;                                                                      \
-    pIxx = var->Ixx_rb.color.ptr<float>(i + 1) + 1;                                                                    \
-    pIxy = var->Ixy_rb.color.ptr<float>(i + 1) + 1;                                                                    \
-    pIyy = var->Iyy_rb.color.ptr<float>(i + 1) + 1;                                                                    \
-    pIxz = var->Ixz_rb.color.ptr<float>(i + 1) + 1;                                                                    \
-    pIyz = var->Iyz_rb.color.ptr<float>(i + 1) + 1;                                                                    \
-    pa11 = var->A11.color.ptr<float>(i + 1) + 1;                                                                       \
-    pa12 = var->A12.color.ptr<float>(i + 1) + 1;                                                                       \
-    pa22 = var->A22.color.ptr<float>(i + 1) + 1;                                                                       \
-    pb1 = var->b1.color.ptr<float>(i + 1) + 1;                                                                         \
-    pb2 = var->b2.color.ptr<float>(i + 1) + 1;                                                                         \
-    pdU = dW_u->color.ptr<float>(i + 1) + 1;                                                                           \
-    pdV = dW_v->color.ptr<float>(i + 1) + 1;                                                                           \
-    if (i % 2 == 0)                                                                                                    \
-        len = var->Ix_rb.color##_even_len;                                                                             \
-    else                                                                                                               \
-        len = var->Ix_rb.color##_odd_len;
-        if (red_pass)
-        {
-            INIT_ROW_POINTERS(red);
-        }
-        else
-        {
-            INIT_ROW_POINTERS(black);
-        }
-#undef INIT_ROW_POINTERS
-        int j = 0;
-#ifdef CV_SIMD128
-        v_float32x4 zeta_vec = v_setall_f32(zeta_squared);
-        v_float32x4 eps_vec = v_setall_f32(epsilon_squared);
-        v_float32x4 delta_vec = v_setall_f32(delta2);
-        v_float32x4 gamma_vec = v_setall_f32(gamma2);
-        v_float32x4 zero_vec = v_setall_f32(0.0f);
-        v_float32x4 pIx_vec, pIy_vec, pIz_vec, pdU_vec, pdV_vec;
-        v_float32x4 pIxx_vec, pIxy_vec, pIyy_vec, pIxz_vec, pIyz_vec;
-        v_float32x4 derivNorm_vec, derivNorm2_vec, weight_vec;
-        v_float32x4 Ik1z_vec, Ik1zx_vec, Ik1zy_vec;
-        v_float32x4 pa11_vec, pa12_vec, pa22_vec, pb1_vec, pb2_vec;
-        for (; j < len - 3; j += 4)
-        {
-            pIx_vec = v_load(pIx + j);
-            pIy_vec = v_load(pIy + j);
-            pIz_vec = v_load(pIz + j);
-            pdU_vec = v_load(pdU + j);
-            pdV_vec = v_load(pdV + j);
-            derivNorm_vec = pIx_vec * pIx_vec + pIy_vec * pIy_vec + zeta_vec;
-            Ik1z_vec = pIz_vec + pIx_vec * pdU_vec + pIy_vec * pdV_vec;
-            weight_vec = (delta_vec / v_sqrt(Ik1z_vec * Ik1z_vec / derivNorm_vec + eps_vec)) / derivNorm_vec;
-            pa11_vec = weight_vec * (pIx_vec * pIx_vec) + zeta_vec;
-            pa12_vec = weight_vec * (pIx_vec * pIy_vec);
-            pa22_vec = weight_vec * (pIy_vec * pIy_vec) + zeta_vec;
-            pb1_vec = zero_vec - weight_vec * (pIz_vec * pIx_vec);
-            pb2_vec = zero_vec - weight_vec * (pIz_vec * pIy_vec);
-            pIxx_vec = v_load(pIxx + j);
-            pIxy_vec = v_load(pIxy + j);
-            pIyy_vec = v_load(pIyy + j);
-            pIxz_vec = v_load(pIxz + j);
-            pIyz_vec = v_load(pIyz + j);
-            derivNorm_vec = pIxx_vec * pIxx_vec + pIxy_vec * pIxy_vec + zeta_vec;
-            derivNorm2_vec = pIyy_vec * pIyy_vec + pIxy_vec * pIxy_vec + zeta_vec;
-            Ik1zx_vec = pIxz_vec + pIxx_vec * pdU_vec + pIxy_vec * pdV_vec;
-            Ik1zy_vec = pIyz_vec + pIxy_vec * pdU_vec + pIyy_vec * pdV_vec;
-            weight_vec = gamma_vec / v_sqrt(Ik1zx_vec * Ik1zx_vec / derivNorm_vec +
-                                            Ik1zy_vec * Ik1zy_vec / derivNorm2_vec + eps_vec);
-            pa11_vec += weight_vec * (pIxx_vec * pIxx_vec / derivNorm_vec + pIxy_vec * pIxy_vec / derivNorm2_vec);
-            pa12_vec += weight_vec * (pIxx_vec * pIxy_vec / derivNorm_vec + pIxy_vec * pIyy_vec / derivNorm2_vec);
-            pa22_vec += weight_vec * (pIxy_vec * pIxy_vec / derivNorm_vec + pIyy_vec * pIyy_vec / derivNorm2_vec);
-            pb1_vec -= weight_vec * (pIxx_vec * pIxz_vec / derivNorm_vec + pIxy_vec * pIyz_vec / derivNorm2_vec);
-            pb2_vec -= weight_vec * (pIxy_vec * pIxz_vec / derivNorm_vec + pIyy_vec * pIyz_vec / derivNorm2_vec);
-            v_store(pa11 + j, pa11_vec);
-            v_store(pa12 + j, pa12_vec);
-            v_store(pa22 + j, pa22_vec);
-            v_store(pb1 + j, pb1_vec);
-            v_store(pb2 + j, pb2_vec);
-        }
-#endif
-        for (; j < len; j++)
-        {
-            /* Step 1: Compute color constancy terms */
-            /* Normalization factor:*/
-            derivNorm = pIx[j] * pIx[j] + pIy[j] * pIy[j] + zeta_squared;
-            /* Color constancy penalty (computed by Taylor expansion):*/
-            Ik1z = pIz[j] + pIx[j] * pdU[j] + pIy[j] * pdV[j];
-            /* Weight of the color constancy term in the current fixed-point iteration divided by derivNorm: */
-            weight = (delta2 / sqrt(Ik1z * Ik1z / derivNorm + epsilon_squared)) / derivNorm;
-            /* Add respective color constancy terms to the linear system coefficients: */
-            pa11[j] = weight * (pIx[j] * pIx[j]) + zeta_squared;
-            pa12[j] = weight * (pIx[j] * pIy[j]);
-            pa22[j] = weight * (pIy[j] * pIy[j]) + zeta_squared;
-            pb1[j] = -weight * (pIz[j] * pIx[j]);
-            pb2[j] = -weight * (pIz[j] * pIy[j]);
-            /* Step 2: Compute gradient constancy terms */
-            /* Normalization factor for x gradient: */
-            derivNorm = pIxx[j] * pIxx[j] + pIxy[j] * pIxy[j] + zeta_squared;
-            /* Normalization factor for y gradient: */
-            derivNorm2 = pIyy[j] * pIyy[j] + pIxy[j] * pIxy[j] + zeta_squared;
-            /* Gradient constancy penalties (computed by Taylor expansion): */
-            Ik1zx = pIxz[j] + pIxx[j] * pdU[j] + pIxy[j] * pdV[j];
-            Ik1zy = pIyz[j] + pIxy[j] * pdU[j] + pIyy[j] * pdV[j];
-            /* Weight of the gradient constancy term in the current fixed-point iteration: */
-            weight = gamma2 / sqrt(Ik1zx * Ik1zx / derivNorm + Ik1zy * Ik1zy / derivNorm2 + epsilon_squared);
-            /* Add respective gradient constancy components to the linear system coefficients: */
-            pa11[j] += weight * (pIxx[j] * pIxx[j] / derivNorm + pIxy[j] * pIxy[j] / derivNorm2);
-            pa12[j] += weight * (pIxx[j] * pIxy[j] / derivNorm + pIxy[j] * pIyy[j] / derivNorm2);
-            pa22[j] += weight * (pIxy[j] * pIxy[j] / derivNorm + pIyy[j] * pIyy[j] / derivNorm2);
-            pb1[j] += -weight * (pIxx[j] * pIxz[j] / derivNorm + pIxy[j] * pIyz[j] / derivNorm2);
-            pb2[j] += -weight * (pIxy[j] * pIxz[j] / derivNorm + pIyy[j] * pIyz[j] / derivNorm2);
-        }
-    }
-}
-VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::ComputeSmoothnessTermHorPass_ParBody(
-  VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_W_u, RedBlackBuffer &_W_v,
-  RedBlackBuffer &_tempW_u, RedBlackBuffer &_tempW_v, bool _red_pass)
-    : var(&_var), nstripes(_nstripes), h(_h), W_u(&_W_u), W_v(&_W_v), curW_u(&_tempW_u), curW_v(&_tempW_v),
-      red_pass(_red_pass)
-{
-    stripe_sz = (int)ceil(h / (double)nstripes);
-}
-/* This function updates the linear system coefficients A11,A22,b1,b1 according to the
- * flow smoothness term and computes corresponding weights for the current fixed point iteration.
- * A11,A22,b1,b1 are updated only partially (horizontal pass). Doing both horizontal and vertical
- * passes in one loop complicates parallelization (different threads write to the same elements).
- */
-void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator()(const Range &range) const
-{
-    int start_i = min(range.start * stripe_sz, h);
-    int end_i = min(range.end * stripe_sz, h);
-    float epsilon_squared = var->epsilon * var->epsilon;
-    float alpha2 = var->alpha / 2;
-    float *pWeight;
-    float *pA_u, *pA_u_next, *pA_v, *pA_v_next;
-    float *pB_u, *pB_u_next, *pB_v, *pB_v_next;
-    float *cW_u, *cW_u_next, *cW_u_next_row;
-    float *cW_v, *cW_v_next, *cW_v_next_row;
-    float *pW_u, *pW_u_next;
-    float *pW_v, *pW_v_next;
-    float ux, uy, vx, vy;
-    int len;
-    bool touches_right_border = true;
-#define INIT_ROW_POINTERS(cur_color, next_color, next_offs_even, next_offs_odd, bool_default)                          \
-    pWeight = var->weights.cur_color.ptr<float>(i + 1) + 1;                                                            \
-    pA_u = var->A11.cur_color.ptr<float>(i + 1) + 1;                                                                   \
-    pB_u = var->b1.cur_color.ptr<float>(i + 1) + 1;                                                                    \
-    cW_u = curW_u->cur_color.ptr<float>(i + 1) + 1;                                                                    \
-    pW_u = W_u->cur_color.ptr<float>(i + 1) + 1;                                                                       \
-    pA_v = var->A22.cur_color.ptr<float>(i + 1) + 1;                                                                   \
-    pB_v = var->b2.cur_color.ptr<float>(i + 1) + 1;                                                                    \
-    cW_v = curW_v->cur_color.ptr<float>(i + 1) + 1;                                                                    \
-    pW_v = W_v->cur_color.ptr<float>(i + 1) + 1;                                                                       \
-                                                                                                                       \
-    cW_u_next_row = curW_u->next_color.ptr<float>(i + 2) + 1;                                                          \
-    cW_v_next_row = curW_v->next_color.ptr<float>(i + 2) + 1;                                                          \
-                                                                                                                       \
-    if (i % 2 == 0)                                                                                                    \
-    {                                                                                                                  \
-        pA_u_next = var->A11.next_color.ptr<float>(i + 1) + next_offs_even;                                            \
-        pB_u_next = var->b1.next_color.ptr<float>(i + 1) + next_offs_even;                                             \
-        cW_u_next = curW_u->next_color.ptr<float>(i + 1) + next_offs_even;                                             \
-        pW_u_next = W_u->next_color.ptr<float>(i + 1) + next_offs_even;                                                \
-        pA_v_next = var->A22.next_color.ptr<float>(i + 1) + next_offs_even;                                            \
-        pB_v_next = var->b2.next_color.ptr<float>(i + 1) + next_offs_even;                                             \
-        cW_v_next = curW_v->next_color.ptr<float>(i + 1) + next_offs_even;                                             \
-        pW_v_next = W_v->next_color.ptr<float>(i + 1) + next_offs_even;                                                \
-        len = var->A11.cur_color##_even_len;                                                                           \
-        if (var->A11.cur_color##_even_len != var->A11.cur_color##_odd_len)                                             \
-            touches_right_border = bool_default;                                                                       \
-        else                                                                                                           \
-            touches_right_border = !bool_default;                                                                      \
-    }                                                                                                                  \
-    else                                                                                                               \
-    {                                                                                                                  \
-        pA_u_next = var->A11.next_color.ptr<float>(i + 1) + next_offs_odd;                                             \
-        pB_u_next = var->b1.next_color.ptr<float>(i + 1) + next_offs_odd;                                              \
-        cW_u_next = curW_u->next_color.ptr<float>(i + 1) + next_offs_odd;                                              \
-        pW_u_next = W_u->next_color.ptr<float>(i + 1) + next_offs_odd;                                                 \
-        pA_v_next = var->A22.next_color.ptr<float>(i + 1) + next_offs_odd;                                             \
-        pB_v_next = var->b2.next_color.ptr<float>(i + 1) + next_offs_odd;                                              \
-        cW_v_next = curW_v->next_color.ptr<float>(i + 1) + next_offs_odd;                                              \
-        pW_v_next = W_v->next_color.ptr<float>(i + 1) + next_offs_odd;                                                 \
-        len = var->A11.cur_color##_odd_len;                                                                            \
-        if (var->A11.cur_color##_even_len != var->A11.cur_color##_odd_len)                                             \
-            touches_right_border = !bool_default;                                                                      \
-        else                                                                                                           \
-            touches_right_border = bool_default;                                                                       \
-    }
-    for (int i = start_i; i < end_i; i++)
-    {
-        if (red_pass)
-        {
-            INIT_ROW_POINTERS(red, black, 1, 2, true);
-        }
-        else
-        {
-            INIT_ROW_POINTERS(black, red, 2, 1, false);
-        }
-#undef INIT_ROW_POINTERS
-#define COMPUTE                                                                                                        \
-    /* Gradients for the flow on the current fixed-point iteration: */                                                 \
-    ux = cW_u_next[j] - cW_u[j];                                                                                       \
-    vx = cW_v_next[j] - cW_v[j];                                                                                       \
-    uy = cW_u_next_row[j] - cW_u[j];                                                                                   \
-    vy = cW_v_next_row[j] - cW_v[j];                                                                                   \
-    /* Weight of the smoothness term in the current fixed-point iteration: */                                          \
-    pWeight[j] = alpha2 / sqrt(ux * ux + vx * vx + uy * uy + vy * vy + epsilon_squared);                               \
-    /* Gradients for initial raw flow multiplied by weight:*/                                                          \
-    ux = pWeight[j] * (pW_u_next[j] - pW_u[j]);                                                                        \
-    vx = pWeight[j] * (pW_v_next[j] - pW_v[j]);
-#define UPDATE                                                                                                         \
-    pB_u[j] += ux;                                                                                                     \
-    pA_u[j] += pWeight[j];                                                                                             \
-    pB_v[j] += vx;                                                                                                     \
-    pA_v[j] += pWeight[j];                                                                                             \
-    pB_u_next[j] -= ux;                                                                                                \
-    pA_u_next[j] += pWeight[j];                                                                                        \
-    pB_v_next[j] -= vx;                                                                                                \
-    pA_v_next[j] += pWeight[j];
-        int j = 0;
-#ifdef CV_SIMD128
-        v_float32x4 alpha2_vec = v_setall_f32(alpha2);
-        v_float32x4 eps_vec = v_setall_f32(epsilon_squared);
-        v_float32x4 cW_u_vec, cW_v_vec;
-        v_float32x4 pWeight_vec, ux_vec, vx_vec, uy_vec, vy_vec;
-        for (; j < len - 4; j += 4)
-        {
-            cW_u_vec = v_load(cW_u + j);
-            cW_v_vec = v_load(cW_v + j);
-            ux_vec = v_load(cW_u_next + j) - cW_u_vec;
-            vx_vec = v_load(cW_v_next + j) - cW_v_vec;
-            uy_vec = v_load(cW_u_next_row + j) - cW_u_vec;
-            vy_vec = v_load(cW_v_next_row + j) - cW_v_vec;
-            pWeight_vec =
-              alpha2_vec / v_sqrt(ux_vec * ux_vec + vx_vec * vx_vec + uy_vec * uy_vec + vy_vec * vy_vec + eps_vec);
-            v_store(pWeight + j, pWeight_vec);
-            ux_vec = pWeight_vec * (v_load(pW_u_next + j) - v_load(pW_u + j));
-            vx_vec = pWeight_vec * (v_load(pW_v_next + j) - v_load(pW_v + j));
-            v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
-            v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
-            v_store(pB_u + j, v_load(pB_u + j) + ux_vec);
-            v_store(pB_v + j, v_load(pB_v + j) + vx_vec);
-            v_store(pA_u_next + j, v_load(pA_u_next + j) + pWeight_vec);
-            v_store(pA_v_next + j, v_load(pA_v_next + j) + pWeight_vec);
-            v_store(pB_u_next + j, v_load(pB_u_next + j) - ux_vec);
-            v_store(pB_v_next + j, v_load(pB_v_next + j) - vx_vec);
-        }
-#endif
-        for (; j < len - 1; j++)
-        {
-            COMPUTE;
-            UPDATE;
-        }
-        /* Omit the update on the rightmost elements */
-        if (touches_right_border)
-        {
-            COMPUTE;
-        }
-        else
-        {
-            COMPUTE;
-            UPDATE;
-        }
-    }
-#undef COMPUTE
-#undef UPDATE
-}
-VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::ComputeSmoothnessTermVertPass_ParBody(
-  VariationalRefinementImpl &_var, int _nstripes, int _h, RedBlackBuffer &_W_u, RedBlackBuffer &_W_v, bool _red_pass)
-    : var(&_var), nstripes(_nstripes), W_u(&_W_u), W_v(&_W_v), red_pass(_red_pass)
-{
-    /* Omit the last row in the vertical pass */
-    h = _h - 1;
-    stripe_sz = (int)ceil(h / (double)nstripes);
-}
-/* This function adds the last remaining terms to the linear system coefficients A11,A22,b1,b1. */
-void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator()(const Range &range) const
-{
-    int start_i = min(range.start * stripe_sz, h);
-    int end_i = min(range.end * stripe_sz, h);
-    float *pWeight;
-    float *pA_u, *pA_u_next_row, *pA_v, *pA_v_next_row;
-    float *pB_u, *pB_u_next_row, *pB_v, *pB_v_next_row;
-    float *pW_u, *pW_u_next_row, *pW_v, *pW_v_next_row;
-    float vy, uy;
-    int len;
-    for (int i = start_i; i < end_i; i++)
-    {
-#define INIT_ROW_POINTERS(cur_color, next_color)                                                                       \
-    pWeight = var->weights.cur_color.ptr<float>(i + 1) + 1;                                                            \
-    pA_u = var->A11.cur_color.ptr<float>(i + 1) + 1;                                                                   \
-    pB_u = var->b1.cur_color.ptr<float>(i + 1) + 1;                                                                    \
-    pW_u = W_u->cur_color.ptr<float>(i + 1) + 1;                                                                       \
-    pA_v = var->A22.cur_color.ptr<float>(i + 1) + 1;                                                                   \
-    pB_v = var->b2.cur_color.ptr<float>(i + 1) + 1;                                                                    \
-    pW_v = W_v->cur_color.ptr<float>(i + 1) + 1;                                                                       \
-                                                                                                                       \
-    pA_u_next_row = var->A11.next_color.ptr<float>(i + 2) + 1;                                                         \
-    pB_u_next_row = var->b1.next_color.ptr<float>(i + 2) + 1;                                                          \
-    pW_u_next_row = W_u->next_color.ptr<float>(i + 2) + 1;                                                             \
-    pA_v_next_row = var->A22.next_color.ptr<float>(i + 2) + 1;                                                         \
-    pB_v_next_row = var->b2.next_color.ptr<float>(i + 2) + 1;                                                          \
-    pW_v_next_row = W_v->next_color.ptr<float>(i + 2) + 1;                                                             \
-                                                                                                                       \
-    if (i % 2 == 0)                                                                                                    \
-        len = var->A11.cur_color##_even_len;                                                                           \
-    else                                                                                                               \
-        len = var->A11.cur_color##_odd_len;
-        if (red_pass)
-        {
-            INIT_ROW_POINTERS(red, black);
-        }
-        else
-        {
-            INIT_ROW_POINTERS(black, red);
-        }
-#undef INIT_ROW_POINTERS
-        int j = 0;
-#ifdef CV_SIMD128
-        v_float32x4 pWeight_vec, uy_vec, vy_vec;
-        for (; j < len - 3; j += 4)
-        {
-            pWeight_vec = v_load(pWeight + j);
-            uy_vec = pWeight_vec * (v_load(pW_u_next_row + j) - v_load(pW_u + j));
-            vy_vec = pWeight_vec * (v_load(pW_v_next_row + j) - v_load(pW_v + j));
-            v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
-            v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
-            v_store(pB_u + j, v_load(pB_u + j) + uy_vec);
-            v_store(pB_v + j, v_load(pB_v + j) + vy_vec);
-            v_store(pA_u_next_row + j, v_load(pA_u_next_row + j) + pWeight_vec);
-            v_store(pA_v_next_row + j, v_load(pA_v_next_row + j) + pWeight_vec);
-            v_store(pB_u_next_row + j, v_load(pB_u_next_row + j) - uy_vec);
-            v_store(pB_v_next_row + j, v_load(pB_v_next_row + j) - vy_vec);
-        }
-#endif
-        for (; j < len; j++)
-        {
-            uy = pWeight[j] * (pW_u_next_row[j] - pW_u[j]);
-            vy = pWeight[j] * (pW_v_next_row[j] - pW_v[j]);
-            pB_u[j] += uy;
-            pA_u[j] += pWeight[j];
-            pB_v[j] += vy;
-            pA_v[j] += pWeight[j];
-            pB_u_next_row[j] -= uy;
-            pA_u_next_row[j] += pWeight[j];
-            pB_v_next_row[j] -= vy;
-            pA_v_next_row[j] += pWeight[j];
-        }
-    }
-}
-VariationalRefinementImpl::RedBlackSOR_ParBody::RedBlackSOR_ParBody(VariationalRefinementImpl &_var, int _nstripes,
-                                                                    int _h, RedBlackBuffer &_dW_u,
-                                                                    RedBlackBuffer &_dW_v, bool _red_pass)
-    : var(&_var), nstripes(_nstripes), h(_h), dW_u(&_dW_u), dW_v(&_dW_v), red_pass(_red_pass)
-{
-    stripe_sz = (int)ceil(h / (double)nstripes);
-}
-/* This function implements the Red-Black SOR (successive-over relaxation) method for solving the main
- * linear system in the current fixed-point iteration.
- */
-void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &range) const
-{
-    int start = min(range.start * stripe_sz, h);
-    int end = min(range.end * stripe_sz, h);
-    float *pa11, *pa12, *pa22, *pb1, *pb2, *pW, *pdu, *pdv;
-    float *pW_next, *pdu_next, *pdv_next;
-    float *pW_prev_row, *pdu_prev_row, *pdv_prev_row;
-    float *pdu_next_row, *pdv_next_row;
-    float sigmaU, sigmaV;
-    int j, len;
-    for (int i = start; i < end; i++)
-    {
-#define INIT_ROW_POINTERS(cur_color, next_color, next_offs_even, next_offs_odd)                                        \
-    pW = var->weights.cur_color.ptr<float>(i + 1) + 1;                                                                 \
-    pa11 = var->A11.cur_color.ptr<float>(i + 1) + 1;                                                                   \
-    pa12 = var->A12.cur_color.ptr<float>(i + 1) + 1;                                                                   \
-    pa22 = var->A22.cur_color.ptr<float>(i + 1) + 1;                                                                   \
-    pb1 = var->b1.cur_color.ptr<float>(i + 1) + 1;                                                                     \
-    pb2 = var->b2.cur_color.ptr<float>(i + 1) + 1;                                                                     \
-    pdu = dW_u->cur_color.ptr<float>(i + 1) + 1;                                                                       \
-    pdv = dW_v->cur_color.ptr<float>(i + 1) + 1;                                                                       \
-                                                                                                                       \
-    pdu_next_row = dW_u->next_color.ptr<float>(i + 2) + 1;                                                             \
-    pdv_next_row = dW_v->next_color.ptr<float>(i + 2) + 1;                                                             \
-                                                                                                                       \
-    pW_prev_row = var->weights.next_color.ptr<float>(i) + 1;                                                           \
-    pdu_prev_row = dW_u->next_color.ptr<float>(i) + 1;                                                                 \
-    pdv_prev_row = dW_v->next_color.ptr<float>(i) + 1;                                                                 \
-                                                                                                                       \
-    if (i % 2 == 0)                                                                                                    \
-    {                                                                                                                  \
-        pW_next = var->weights.next_color.ptr<float>(i + 1) + next_offs_even;                                          \
-        pdu_next = dW_u->next_color.ptr<float>(i + 1) + next_offs_even;                                                \
-        pdv_next = dW_v->next_color.ptr<float>(i + 1) + next_offs_even;                                                \
-        len = var->A11.cur_color##_even_len;                                                                           \
-    }                                                                                                                  \
-    else                                                                                                               \
-    {                                                                                                                  \
-        pW_next = var->weights.next_color.ptr<float>(i + 1) + next_offs_odd;                                           \
-        pdu_next = dW_u->next_color.ptr<float>(i + 1) + next_offs_odd;                                                 \
-        pdv_next = dW_v->next_color.ptr<float>(i + 1) + next_offs_odd;                                                 \
-        len = var->A11.cur_color##_odd_len;                                                                            \
-    }
-        if (red_pass)
-        {
-            INIT_ROW_POINTERS(red, black, 1, 2);
-        }
-        else
-        {
-            INIT_ROW_POINTERS(black, red, 2, 1);
-        }
-#undef INIT_ROW_POINTERS
-        j = 0;
-#ifdef CV_SIMD128
-        v_float32x4 pW_prev_vec = v_setall_f32(pW_next[-1]);
-        v_float32x4 pdu_prev_vec = v_setall_f32(pdu_next[-1]);
-        v_float32x4 pdv_prev_vec = v_setall_f32(pdv_next[-1]);
-        v_float32x4 omega_vec = v_setall_f32(var->omega);
-        v_float32x4 pW_vec, pW_next_vec, pW_prev_row_vec;
-        v_float32x4 pdu_next_vec, pdu_prev_row_vec, pdu_next_row_vec;
-        v_float32x4 pdv_next_vec, pdv_prev_row_vec, pdv_next_row_vec;
-        v_float32x4 pW_shifted_vec, pdu_shifted_vec, pdv_shifted_vec;
-        v_float32x4 pa12_vec, sigmaU_vec, sigmaV_vec, pdu_vec, pdv_vec;
-        for (; j < len - 3; j += 4)
-        {
-            pW_vec = v_load(pW + j);
-            pW_next_vec = v_load(pW_next + j);
-            pW_prev_row_vec = v_load(pW_prev_row + j);
-            pdu_next_vec = v_load(pdu_next + j);
-            pdu_prev_row_vec = v_load(pdu_prev_row + j);
-            pdu_next_row_vec = v_load(pdu_next_row + j);
-            pdv_next_vec = v_load(pdv_next + j);
-            pdv_prev_row_vec = v_load(pdv_prev_row + j);
-            pdv_next_row_vec = v_load(pdv_next_row + j);
-            pa12_vec = v_load(pa12 + j);
-            pW_shifted_vec = v_reinterpret_as_f32(
-              v_extract<3>(v_reinterpret_as_s32(pW_prev_vec), v_reinterpret_as_s32(pW_next_vec)));
-            pdu_shifted_vec = v_reinterpret_as_f32(
-              v_extract<3>(v_reinterpret_as_s32(pdu_prev_vec), v_reinterpret_as_s32(pdu_next_vec)));
-            pdv_shifted_vec = v_reinterpret_as_f32(
-              v_extract<3>(v_reinterpret_as_s32(pdv_prev_vec), v_reinterpret_as_s32(pdv_next_vec)));
-            sigmaU_vec = pW_shifted_vec * pdu_shifted_vec + pW_vec * pdu_next_vec + pW_prev_row_vec * pdu_prev_row_vec +
-                         pW_vec * pdu_next_row_vec;
-            sigmaV_vec = pW_shifted_vec * pdv_shifted_vec + pW_vec * pdv_next_vec + pW_prev_row_vec * pdv_prev_row_vec +
-                         pW_vec * pdv_next_row_vec;
-            pdu_vec = v_load(pdu + j);
-            pdv_vec = v_load(pdv + j);
-            pdu_vec += omega_vec * ((sigmaU_vec + v_load(pb1 + j) - pdv_vec * pa12_vec) / v_load(pa11 + j) - pdu_vec);
-            pdv_vec += omega_vec * ((sigmaV_vec + v_load(pb2 + j) - pdu_vec * pa12_vec) / v_load(pa22 + j) - pdv_vec);
-            v_store(pdu + j, pdu_vec);
-            v_store(pdv + j, pdv_vec);
-            pW_prev_vec = pW_next_vec;
-            pdu_prev_vec = pdu_next_vec;
-            pdv_prev_vec = pdv_next_vec;
-        }
-#endif
-        for (; j < len; j++)
-        {
-            sigmaU = pW_next[j - 1] * pdu_next[j - 1] + pW[j] * pdu_next[j] + pW_prev_row[j] * pdu_prev_row[j] +
-                     pW[j] * pdu_next_row[j];
-            sigmaV = pW_next[j - 1] * pdv_next[j - 1] + pW[j] * pdv_next[j] + pW_prev_row[j] * pdv_prev_row[j] +
-                     pW[j] * pdv_next_row[j];
-            pdu[j] += var->omega * ((sigmaU + pb1[j] - pdv[j] * pa12[j]) / pa11[j] - pdu[j]);
-            pdv[j] += var->omega * ((sigmaV + pb2[j] - pdu[j] * pa12[j]) / pa22[j] - pdv[j]);
-        }
-    }
-}
-void VariationalRefinementImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
-{
-    CV_Assert(!I0.empty() && I0.channels() == 1);
-    CV_Assert(!I1.empty() && I1.channels() == 1);
-    CV_Assert(I0.sameSize(I1));
-    CV_Assert((I0.depth() == CV_8U && I1.depth() == CV_8U) || (I0.depth() == CV_32F && I1.depth() == CV_32F));
-    CV_Assert(!flow.empty() && flow.depth() == CV_32F && flow.channels() == 2);
-    CV_Assert(I0.sameSize(flow));
-    Mat uv[2];
-    Mat &flowMat = flow.getMatRef();
-    split(flowMat, uv);
-    calcUV(I0, I1, uv[0], uv[1]);
-    merge(uv, 2, flowMat);
-}
-void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v)
-{
-    CV_Assert(!I0.empty() && I0.channels() == 1);
-    CV_Assert(!I1.empty() && I1.channels() == 1);
-    CV_Assert(I0.sameSize(I1));
-    CV_Assert((I0.depth() == CV_8U && I1.depth() == CV_8U) || (I0.depth() == CV_32F && I1.depth() == CV_32F));
-    CV_Assert(!flow_u.empty() && flow_u.depth() == CV_32F && flow_u.channels() == 1);
-    CV_Assert(!flow_v.empty() && flow_v.depth() == CV_32F && flow_v.channels() == 1);
-    CV_Assert(I0.sameSize(flow_u));
-    CV_Assert(flow_u.sameSize(flow_v));
-    int num_stripes = getNumThreads();
-    Mat I0Mat = I0.getMat();
-    Mat I1Mat = I1.getMat();
-    Mat &W_u = flow_u.getMatRef();
-    Mat &W_v = flow_v.getMatRef();
-    prepareBuffers(I0Mat, I1Mat, W_u, W_v);
-    splitCheckerboard(W_u_rb, W_u);
-    splitCheckerboard(W_v_rb, W_v);
-    W_u_rb.red.copyTo(tempW_u.red);
-    W_u_rb.black.copyTo(tempW_u.black);
-    W_v_rb.red.copyTo(tempW_v.red);
-    W_v_rb.black.copyTo(tempW_v.black);
-    dW_u.red.setTo(0.0f);
-    dW_u.black.setTo(0.0f);
-    dW_v.red.setTo(0.0f);
-    dW_v.black.setTo(0.0f);
-    for (int i = 0; i < fixedPointIterations; i++)
-    {
-        parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true));
-        parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false));
-        parallel_for_(Range(0, num_stripes), ComputeSmoothnessTermHorPass_ParBody(
-                                               *this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, tempW_u, tempW_v, true));
-        parallel_for_(Range(0, num_stripes), ComputeSmoothnessTermHorPass_ParBody(
-                                               *this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, tempW_u, tempW_v, false));
-        parallel_for_(Range(0, num_stripes),
-                      ComputeSmoothnessTermVertPass_ParBody(*this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, true));
-        parallel_for_(Range(0, num_stripes),
-                      ComputeSmoothnessTermVertPass_ParBody(*this, num_stripes, I0Mat.rows, W_u_rb, W_v_rb, false));
-        for (int j = 0; j < sorIterations; j++)
-        {
-            parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true));
-            parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false));
-        }
-        tempW_u.red = W_u_rb.red + dW_u.red;
-        tempW_u.black = W_u_rb.black + dW_u.black;
-        updateRepeatedBorders(tempW_u);
-        tempW_v.red = W_v_rb.red + dW_v.red;
-        tempW_v.black = W_v_rb.black + dW_v.black;
-        updateRepeatedBorders(tempW_v);
-    }
-    mergeCheckerboard(W_u, tempW_u);
-    mergeCheckerboard(W_v, tempW_v);
-}
-void VariationalRefinementImpl::collectGarbage()
-{
-    Ix.release();
-    Iy.release();
-    Iz.release();
-    Ixx.release();
-    Ixy.release();
-    Iyy.release();
-    Ixz.release();
-    Iyz.release();
-    Ix_rb.release();
-    Iy_rb.release();
-    Iz_rb.release();
-    Ixx_rb.release();
-    Ixy_rb.release();
-    Iyy_rb.release();
-    Ixz_rb.release();
-    Iyz_rb.release();
-    A11.release();
-    A12.release();
-    A22.release();
-    b1.release();
-    b2.release();
-    weights.release();
-    mapX.release();
-    mapY.release();
-    tempW_u.release();
-    tempW_v.release();
-    dW_u.release();
-    dW_v.release();
-    W_u_rb.release();
-    W_v_rb.release();
-}
-Ptr<VariationalRefinement> createVariationalFlowRefinement() { return makePtr<VariationalRefinementImpl>(); }
-}
-}
--- a/modules/optflow/test/ocl/test_dis.cpp
+++ b/modules/optflow/test/ocl/test_dis.cpp
@@ -7,10 +7,12 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -23,7 +25,7 @@
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
@@ -44,53 +46,72 @@
 #ifdef HAVE_OPENCL
-namespace opencv_test { namespace {
+namespace opencv_test {
+namespace ocl {
-PARAM_TEST_CASE(OCL_DenseOpticalFlow_DIS, int)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Optical_flow_tvl1
+namespace
 {
-    int preset;
+    IMPLEMENT_PARAM_CLASS(UseInitFlow, bool)
+    IMPLEMENT_PARAM_CLASS(MedianFiltering, int)
+    IMPLEMENT_PARAM_CLASS(ScaleStep, double)
+}
+PARAM_TEST_CASE(OpticalFlowTVL1, UseInitFlow, MedianFiltering, ScaleStep)
+{
+    bool useInitFlow;
+    int medianFiltering;
+    double scaleStep;
    virtual void SetUp()
    {
-        preset = GET_PARAM(0);
+        useInitFlow = GET_PARAM(0);
+        medianFiltering = GET_PARAM(1);
+        scaleStep = GET_PARAM(2);
    }
 };
-OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat)
+OCL_TEST_P(OpticalFlowTVL1, Mat)
 {
-    Mat frame1, frame2, GT;
+    Mat frame0 = readImage("optflow/RubberWhale1.png", IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
-    frame1 = imread(TS::ptr()->get_data_path() + "optflow/RubberWhale1.png");
+    Mat frame1 = readImage("optflow/RubberWhale2.png", IMREAD_GRAYSCALE);
-    frame2 = imread(TS::ptr()->get_data_path() + "optflow/RubberWhale2.png");
+    ASSERT_FALSE(frame1.empty());
-    CV_Assert(!frame1.empty() && !frame2.empty());
+    Mat flow; UMat uflow;
-    cvtColor(frame1, frame1, COLOR_BGR2GRAY);
+    //create algorithm
-    cvtColor(frame2, frame2, COLOR_BGR2GRAY);
+    Ptr<DualTVL1OpticalFlow> alg = createOptFlow_DualTVL1();
-    Ptr<DenseOpticalFlow> algo;
+    //set parameters
+    alg->setScaleStep(scaleStep);
+    alg->setMedianFiltering(medianFiltering);
-    // iterate over presets:
+    //create initial flow as result of algorithm calculation
-    for (int i = 0; i < cvtest::ocl::test_loop_times; i++)
+    if (useInitFlow)
    {
-        Mat flow;
+        OCL_ON(alg->calc(frame0, frame1, uflow));
-        UMat ocl_flow;
+        uflow.copyTo(flow);
+    }
-        algo = createOptFlow_DIS(preset);
+    //set flag to use initial flow as it is ready to use
-        OCL_OFF(algo->calc(frame1, frame2, flow));
+    alg->setUseInitialFlow(useInitFlow);
-        OCL_ON(algo->calc(frame1, frame2, ocl_flow));
-        ASSERT_EQ(flow.rows, ocl_flow.rows);
-        ASSERT_EQ(flow.cols, ocl_flow.cols);
-        EXPECT_MAT_SIMILAR(flow, ocl_flow, 6e-3);
+    OCL_OFF(alg->calc(frame0, frame1, flow));
-    }
+    OCL_ON(alg->calc(frame0, frame1, uflow));
+    EXPECT_MAT_SIMILAR(flow, uflow, 1e-2);
 }
-OCL_INSTANTIATE_TEST_CASE_P(Contrib, OCL_DenseOpticalFlow_DIS,
+OCL_INSTANTIATE_TEST_CASE_P(Contrib, OpticalFlowTVL1,
-                            Values(DISOpticalFlow::PRESET_ULTRAFAST,
+    Combine(
-                                   DISOpticalFlow::PRESET_FAST,
+    Values(UseInitFlow(false), UseInitFlow(true)),
-                                   DISOpticalFlow::PRESET_MEDIUM));
+    Values(MedianFiltering(3), MedianFiltering(-1)),
+    Values(ScaleStep(0.3),ScaleStep(0.5))
+    )
+    );
-}} // namespace
+} } // namespace opencv_test::ocl
 #endif // HAVE_OPENCL
--- a/modules/optflow/test/test_OF_accuracy.cpp
+++ b/modules/optflow/test/test_OF_accuracy.cpp
@@ -172,52 +172,6 @@ TEST(DenseOpticalFlow_SparseToDenseFlow, ReferenceAccuracy)
    EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
 }
-TEST(DenseOpticalFlow_DIS, ReferenceAccuracy)
-{
-    Mat frame1, frame2, GT;
-    ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));
-    int presets[] = {DISOpticalFlow::PRESET_ULTRAFAST, DISOpticalFlow::PRESET_FAST, DISOpticalFlow::PRESET_MEDIUM};
-    float target_RMSE[] = {0.86f, 0.74f, 0.49f};
-    cvtColor(frame1, frame1, COLOR_BGR2GRAY);
-    cvtColor(frame2, frame2, COLOR_BGR2GRAY);
-    Ptr<DenseOpticalFlow> algo;
-    // iterate over presets:
-    for (int i = 0; i < 3; i++)
-    {
-        Mat flow;
-        algo = createOptFlow_DIS(presets[i]);
-        algo->calc(frame1, frame2, flow);
-        ASSERT_EQ(GT.rows, flow.rows);
-        ASSERT_EQ(GT.cols, flow.cols);
-        EXPECT_LE(calcRMSE(GT, flow), target_RMSE[i]);
-    }
-}
-TEST(DenseOpticalFlow_VariationalRefinement, ReferenceAccuracy)
-{
-    Mat frame1, frame2, GT;
-    ASSERT_TRUE(readRubberWhale(frame1, frame2, GT));
-    float target_RMSE = 0.86f;
-    cvtColor(frame1, frame1, COLOR_BGR2GRAY);
-    cvtColor(frame2, frame2, COLOR_BGR2GRAY);
-    Ptr<VariationalRefinement> var_ref;
-    var_ref = createVariationalFlowRefinement();
-    var_ref->setAlpha(20.0f);
-    var_ref->setDelta(5.0f);
-    var_ref->setGamma(10.0f);
-    var_ref->setSorIterations(25);
-    var_ref->setFixedPointIterations(25);
-    Mat flow(frame1.size(), CV_32FC2);
-    flow.setTo(0.0f);
-    var_ref->calc(frame1, frame2, flow);
-    ASSERT_EQ(GT.rows, flow.rows);
-    ASSERT_EQ(GT.cols, flow.cols);
-    EXPECT_LE(calcRMSE(GT, flow), target_RMSE);
-}
 TEST(DenseOpticalFlow_PCAFlow, ReferenceAccuracy)
 {
    Mat frame1, frame2, GT;

--- a/modules/optflow/test/test_OF_reproducibility.cpp
+++ b/modules/optflow/test/test_OF_reproducibility.cpp
@@ -43,118 +43,131 @@
 namespace opencv_test { namespace {
-typedef tuple<Size> OFParams;
+//#define DUMP
-typedef TestWithParam<OFParams> DenseOpticalFlow_DIS;
-typedef TestWithParam<OFParams> DenseOpticalFlow_VariationalRefinement;
-TEST_P(DenseOpticalFlow_DIS, MultithreadReproducibility)
+    // first four bytes, should be the same in little endian
-{
+    const float FLO_TAG_FLOAT = 202021.25f;  // check for this when READING the file
-    double MAX_DIF = 0.01;
-    double MAX_MEAN_DIF = 0.001;
+#ifdef DUMP
-    int loopsCount = 2;
+    // binary file format for flow data specified here:
-    RNG rng(0);
+    // http://vision.middlebury.edu/flow/data/
+    void writeOpticalFlowToFile(const Mat_<Point2f>& flow, const string& fileName)
-    OFParams params = GetParam();
-    Size size = get<0>(params);
-    int nThreads = cv::getNumThreads();
-    if (nThreads == 1)
-        throw SkipTestException("Single thread environment");
-    for (int iter = 0; iter <= loopsCount; iter++)
    {
-        Mat frame1(size, CV_8U);
+        const char FLO_TAG_STRING[] = "PIEH";    // use this when WRITING the file
-        randu(frame1, 0, 255);
+        ofstream file(fileName.c_str(), ios_base::binary);
-        Mat frame2(size, CV_8U);
-        randu(frame2, 0, 255);
+        file << FLO_TAG_STRING;
-        Ptr<DISOpticalFlow> algo = createOptFlow_DIS();
+        file.write((const char*) &flow.cols, sizeof(int));
-        int psz = rng.uniform(4, 16);
+        file.write((const char*) &flow.rows, sizeof(int));
-        int pstr = rng.uniform(1, psz - 1);
-        int grad_iter = rng.uniform(1, 64);
+        for (int i = 0; i < flow.rows; ++i)
-        int var_iter = rng.uniform(0, 10);
+        {
-        bool use_mean_normalization = !!rng.uniform(0, 2);
+            for (int j = 0; j < flow.cols; ++j)
-        bool use_spatial_propagation = !!rng.uniform(0, 2);
+            {
-        algo->setFinestScale(0);
+                const Point2f u = flow(i, j);
-        algo->setPatchSize(psz);
-        algo->setPatchStride(pstr);
+                file.write((const char*) &u.x, sizeof(float));
-        algo->setGradientDescentIterations(grad_iter);
+                file.write((const char*) &u.y, sizeof(float));
-        algo->setVariationalRefinementIterations(var_iter);
+            }
-        algo->setUseMeanNormalization(use_mean_normalization);
+        }
-        algo->setUseSpatialPropagation(use_spatial_propagation);
-        cv::setNumThreads(nThreads);
-        Mat resMultiThread;
-        algo->calc(frame1, frame2, resMultiThread);
-        cv::setNumThreads(1);
-        Mat resSingleThread;
-        algo->calc(frame1, frame2, resSingleThread);
-        EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_INF), MAX_DIF);
-        EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_L1), MAX_MEAN_DIF * frame1.total());
-        // resulting flow should be within the frame bounds:
-        double min_val, max_val;
-        minMaxLoc(resMultiThread, &min_val, &max_val);
-        EXPECT_LE(abs(min_val), sqrt( static_cast<double>(size.height * size.height + size.width * size.width)) );
-        EXPECT_LE(abs(max_val), sqrt( static_cast<double>(size.height * size.height + size.width * size.width)) );
    }
-}
+#endif
-INSTANTIATE_TEST_CASE_P(FullSet, DenseOpticalFlow_DIS, Values(szODD, szQVGA));
+    // binary file format for flow data specified here:
+    // http://vision.middlebury.edu/flow/data/
+    void readOpticalFlowFromFile(Mat_<Point2f>& flow, const string& fileName)
+    {
+        std::ifstream file(fileName.c_str(), std::ios_base::binary);
-TEST_P(DenseOpticalFlow_VariationalRefinement, MultithreadReproducibility)
+        float tag;
-{
+        file.read((char*) &tag, sizeof(float));
-    double MAX_DIF = 0.01;
+        CV_Assert( tag == FLO_TAG_FLOAT );
-    double MAX_MEAN_DIF = 0.001;
-    float input_flow_rad = 5.0;
+        Size size;
-    int loopsCount = 2;
-    RNG rng(0);
+        file.read((char*) &size.width, sizeof(int));
+        file.read((char*) &size.height, sizeof(int));
-    OFParams params = GetParam();
-    Size size = get<0>(params);
+        flow.create(size);
-    int nThreads = cv::getNumThreads();
+        for (int i = 0; i < flow.rows; ++i)
-    if (nThreads == 1)
+        {
-        throw SkipTestException("Single thread environment");
+            for (int j = 0; j < flow.cols; ++j)
-    for (int iter = 0; iter <= loopsCount; iter++)
+            {
+                Point2f u;
+                file.read((char*) &u.x, sizeof(float));
+                file.read((char*) &u.y, sizeof(float));
+                flow(i, j) = u;
+            }
+        }
+        file.close();
+    }
+    bool isFlowCorrect(Point2f u)
    {
-        Mat frame1(size, CV_8U);
+        return !cvIsNaN(u.x) && !cvIsNaN(u.y) && (fabs(u.x) < 1e9) && (fabs(u.y) < 1e9);
-        randu(frame1, 0, 255);
-        Mat frame2(size, CV_8U);
-        randu(frame2, 0, 255);
-        Mat flow(size, CV_32FC2);
-        randu(flow, -input_flow_rad, input_flow_rad);
-        Ptr<VariationalRefinement> var = createVariationalFlowRefinement();
-        var->setAlpha(rng.uniform(1.0f, 100.0f));
-        var->setGamma(rng.uniform(0.1f, 10.0f));
-        var->setDelta(rng.uniform(0.1f, 10.0f));
-        var->setSorIterations(rng.uniform(1, 20));
-        var->setFixedPointIterations(rng.uniform(1, 20));
-        var->setOmega(rng.uniform(1.01f, 1.99f));
-        cv::setNumThreads(nThreads);
-        Mat resMultiThread;
-        flow.copyTo(resMultiThread);
-        var->calc(frame1, frame2, resMultiThread);
-        cv::setNumThreads(1);
-        Mat resSingleThread;
-        flow.copyTo(resSingleThread);
-        var->calc(frame1, frame2, resSingleThread);
-        EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_INF), MAX_DIF);
-        EXPECT_LE(cv::norm(resSingleThread, resMultiThread, NORM_L1), MAX_MEAN_DIF * frame1.total());
-        // resulting flow should be within the frame bounds:
-        double min_val, max_val;
-        minMaxLoc(resMultiThread, &min_val, &max_val);
-        EXPECT_LE(abs(min_val), sqrt( static_cast<double>(size.height * size.height + size.width * size.width)) );
-        EXPECT_LE(abs(max_val), sqrt( static_cast<double>(size.height * size.height + size.width * size.width)) );
    }
-}
-INSTANTIATE_TEST_CASE_P(FullSet, DenseOpticalFlow_VariationalRefinement, Values(szODD, szQVGA));
+    void check(const Mat_<Point2f>& gold, const Mat_<Point2f>& flow, double threshold = 0.1, double expectedAccuracy = 0.95)
+    {
+        threshold = threshold*threshold;
+        size_t gold_counter = 0;
+        size_t valid_counter = 0;
+        for (int i = 0; i < gold.rows; ++i)
+        {
+            for (int j = 0; j < gold.cols; ++j)
+            {
+                const Point2f u1 = gold(i, j);
+                const Point2f u2 = flow(i, j);
+                if (isFlowCorrect(u1))
+                {
+                    gold_counter++;
+                    if (isFlowCorrect(u2))
+                    {
+                        const Point2f diff = u1 - u2;
+                        double err = diff.ddot(diff);
+                        if (err <= threshold)
+                            valid_counter++;
+                    }
+                }
+            }
+        }
+        EXPECT_GE(valid_counter, expectedAccuracy * gold_counter);
+    }
+TEST(Contrib_calcOpticalFlowDual_TVL1, Regression)
+{
+    const string frame1_path = TS::ptr()->get_data_path() + "optflow/RubberWhale1.png";
+    const string frame2_path = TS::ptr()->get_data_path() + "optflow/RubberWhale2.png";
+    const string gold_flow_path = TS::ptr()->get_data_path() + "optflow/tvl1_flow.flo";
+    Mat frame1 = imread(frame1_path, IMREAD_GRAYSCALE);
+    Mat frame2 = imread(frame2_path, IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+    ASSERT_FALSE(frame2.empty());
+    Mat_<Point2f> flow;
+    Ptr<DualTVL1OpticalFlow> tvl1 = cv::optflow::DualTVL1OpticalFlow::create();
+    tvl1->calc(frame1, frame2, flow);
+#ifdef DUMP
+    writeOpticalFlowToFile(flow, gold_flow_path);
+#else
+    Mat_<Point2f> gold;
+    readOpticalFlowFromFile(gold, gold_flow_path);
+    ASSERT_EQ(gold.rows, flow.rows);
+    ASSERT_EQ(gold.cols, flow.cols);
+    check(gold, flow);
+#endif
+}
 }} // namespace
--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -6,6 +6,6 @@ set(the_description "Super Resolution")
 if(HAVE_CUDA)
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wshadow)
 endif()
-ocv_define_module(superres opencv_imgproc opencv_video
+ocv_define_module(superres opencv_imgproc opencv_video opencv_optflow
                  OPTIONAL opencv_videoio opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc opencv_cudaoptflow opencv_cudacodec
                  WRAP python)
--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@@ -41,6 +41,7 @@
 //M*/
 #include "precomp.hpp"
+#include "opencv2/optflow.hpp"
 #include "opencv2/core/opencl/ocl_defs.hpp"
 using namespace cv;
@@ -371,12 +372,12 @@ namespace
        void impl(InputArray input0, InputArray input1, OutputArray dst) CV_OVERRIDE;
    private:
-        Ptr<cv::DualTVL1OpticalFlow> alg_;
+        Ptr<optflow::DualTVL1OpticalFlow> alg_;
    };
    DualTVL1::DualTVL1() : CpuOpticalFlow(CV_8UC1)
    {
-        alg_ = cv::createOptFlow_DualTVL1();
+        alg_ = optflow::createOptFlow_DualTVL1();
    }
    void DualTVL1::calc(InputArray frame0, InputArray frame1, OutputArray flow1, OutputArray flow2)

--- a/samples/python2/dis_opt_flow.py
+++ b/samples/python2/dis_opt_flow.py
-#!/usr/bin/env python
-'''
-example to show optical flow estimation using DISOpticalFlow
-USAGE: dis_opt_flow.py [<video_source>]
-Keys:
- 1  - toggle HSV flow visualization
- 2  - toggle glitch
- 3  - toggle spatial propagation of flow vectors
- 4  - toggle temporal propagation of flow vectors
-ESC - exit
-'''
-# Python 2/3 compatibility
-from __future__ import print_function
-import numpy as np
-import cv2 as cv
-import video
-def draw_flow(img, flow, step=16):
-    h, w = img.shape[:2]
-    y, x = np.mgrid[step/2:h:step, step/2:w:step].reshape(2,-1).astype(int)
-    fx, fy = flow[y,x].T
-    lines = np.vstack([x, y, x+fx, y+fy]).T.reshape(-1, 2, 2)
-    lines = np.int32(lines + 0.5)
-    vis = cv.cvtColor(img, cv.COLOR_GRAY2BGR)
-    cv.polylines(vis, lines, 0, (0, 255, 0))
-    for (x1, y1), (x2, y2) in lines:
-        cv.circle(vis, (x1, y1), 1, (0, 255, 0), -1)
-    return vis
-def draw_hsv(flow):
-    h, w = flow.shape[:2]
-    fx, fy = flow[:,:,0], flow[:,:,1]
-    ang = np.arctan2(fy, fx) + np.pi
-    v = np.sqrt(fx*fx+fy*fy)
-    hsv = np.zeros((h, w, 3), np.uint8)
-    hsv[...,0] = ang*(180/np.pi/2)
-    hsv[...,1] = 255
-    hsv[...,2] = np.minimum(v*4, 255)
-    bgr = cv.cvtColor(hsv, cv.COLOR_HSV2BGR)
-    return bgr
-def warp_flow(img, flow):
-    h, w = flow.shape[:2]
-    flow = -flow
-    flow[:,:,0] += np.arange(w)
-    flow[:,:,1] += np.arange(h)[:,np.newaxis]
-    res = cv.remap(img, flow, None, cv.INTER_LINEAR)
-    return res
-if __name__ == '__main__':
-    import sys
-    print(__doc__)
-    try:
-        fn = sys.argv[1]
-    except IndexError:
-        fn = 0
-    cam = video.create_capture(fn)
-    ret, prev = cam.read()
-    prevgray = cv.cvtColor(prev, cv.COLOR_BGR2GRAY)
-    show_hsv = False
-    show_glitch = False
-    use_spatial_propagation = False
-    use_temporal_propagation = True
-    cur_glitch = prev.copy()
-    inst = cv.optflow.createOptFlow_DIS(cv.optflow.DISOPTICAL_FLOW_PRESET_MEDIUM)
-    inst.setUseSpatialPropagation(use_spatial_propagation)
-    flow = None
-    while True:
-        ret, img = cam.read()
-        gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
-        if flow is not None and use_temporal_propagation:
-            #warp previous flow to get an initial approximation for the current flow:
-            flow = inst.calc(prevgray, gray, warp_flow(flow,flow))
-        else:
-            flow = inst.calc(prevgray, gray, None)
-        prevgray = gray
-        cv.imshow('flow', draw_flow(gray, flow))
-        if show_hsv:
-            cv.imshow('flow HSV', draw_hsv(flow))
-        if show_glitch:
-            cur_glitch = warp_flow(cur_glitch, flow)
-            cv.imshow('glitch', cur_glitch)
-        ch = 0xFF & cv.waitKey(5)
-        if ch == 27:
-            break
-        if ch == ord('1'):
-            show_hsv = not show_hsv
-            print('HSV flow visualization is', ['off', 'on'][show_hsv])
-        if ch == ord('2'):
-            show_glitch = not show_glitch
-            if show_glitch:
-                cur_glitch = img.copy()
-            print('glitch is', ['off', 'on'][show_glitch])
-        if ch == ord('3'):
-            use_spatial_propagation = not use_spatial_propagation
-            inst.setUseSpatialPropagation(use_spatial_propagation)
-            print('spatial propagation is', ['off', 'on'][use_spatial_propagation])
-        if ch == ord('4'):
-            use_temporal_propagation = not use_temporal_propagation
-            print('temporal propagation is', ['off', 'on'][use_temporal_propagation])
-    cv.destroyAllWindows()