Commit 5e0783e1 authored by Vishal Chiluka, committed by Vishal Bhaskar Chiluka

NVIDIA Optical Flow Integration in OpenCV

parent f0d30f2c
@@ -7,3 +7,22 @@ set(the_description "CUDA-accelerated Optical Flow")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudaoptflow opencv_video opencv_optflow opencv_cudaarithm opencv_cudawarping opencv_cudaimgproc OPTIONAL opencv_cudalegacy WRAP python)
set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT "79c6cee80a2df9a196f20afd6b598a9810964c32")
set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_MD5 "ca5acedee6cb45d0ec610a6732de5c15")
set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH "${OpenCV_BINARY_DIR}/3rdparty/NVIDIAOpticalFlowSDK_1_0_Headers")
ocv_download(FILENAME "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT}.zip"
HASH ${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_MD5}
URL
"https://github.com/NVIDIA/NVIDIAOpticalFlowSDK/archive/"
DESTINATION_DIR "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH}"
STATUS NVIDIA_OPTICAL_FLOW_1_0_HEADERS_DOWNLOAD_SUCCESS
ID "NVIDIA_OPTICAL_FLOW"
UNPACK RELATIVE_URL)
if(NOT NVIDIA_OPTICAL_FLOW_1_0_HEADERS_DOWNLOAD_SUCCESS)
message(STATUS "Failed to download NVIDIA_Optical_Flow_1_0 Headers")
else()
add_definitions(-DHAVE_NVIDIA_OPTFLOW=1)
ocv_include_directories(SYSTEM "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH}/NVIDIAOpticalFlowSDK-${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT}")
endif()
\ No newline at end of file
@@ -102,6 +102,47 @@ public:
OutputArray err = cv::noArray(),
Stream& stream = Stream::Null()) = 0;
};
/** @brief Base interface for optical flow algorithms using the NVIDIA Optical Flow SDK.
*/
class CV_EXPORTS_W NvidiaHWOpticalFlow : public Algorithm
{
public:
/** @brief Calculates optical flow using the NVIDIA Optical Flow SDK.
* NVIDIA GPUs starting with Turing contain a dedicated hardware accelerator for computing optical flow vectors between pairs of images.
* The optical flow hardware accelerator generates block-based optical flow vectors.
* The size of the block depends on the hardware in use and can be queried using the function getGridSize().
* The block-based flow vectors generated by the hardware can be converted to a dense representation (i.e. per-pixel flow vectors) using the upSampler() helper function, if needed.
* The flow vectors are stored in CV_16SC2 format, with the x and y components of each flow vector in 16-bit signed fixed-point representation S10.5 (see the decoding sketch after this interface).
@param inputImage Input image.
@param referenceImage Reference image of the same size and the same type as input image.
@param flow A buffer consisting of inputImage.size() / getGridSize() flow vectors in CV_16SC2 format.
@param stream Stream for the asynchronous version.
@param hint Hint buffer if the client provides external hints. Must have the same size as the flow buffer.
The caller can provide flow vectors as hints for the optical flow calculation.
@param cost Cost buffer containing values that indicate the confidence associated with each of the generated flow vectors.
The higher the cost, the lower the confidence. The cost buffer is of type CV_32SC1.
@note
- The client must protect each calc() call with a critical section when calling it from multiple threads.
*/
CV_WRAP virtual void calc(
InputArray inputImage,
InputArray referenceImage,
InputOutputArray flow,
Stream& stream = Stream::Null(),
InputArray hint = cv::noArray(),
OutputArray cost = cv::noArray()) = 0;
/** @brief Releases all buffers, contexts and device pointers.
*/
CV_WRAP virtual void collectGarbage() = 0;
/** @brief Returns the grid size of the output buffer, as determined by the hardware's capability.
*/
CV_WRAP virtual int getGridSize() const = 0;
};
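To make the fixed-point layout above concrete, here is a minimal CPU-side sketch of decoding the block-based output of calc(). It assumes the CV_16SC2 flow buffer has already been downloaded into a cv::Mat; the helper name decodeBlockFlow is illustrative and not part of this API (the upSampler() helper in NvidiaOpticalFlow_1_0 performs the equivalent nearest-neighbour expansion for you).

#include <opencv2/core.hpp>

// Hypothetical helper: expand block-based S10.5 flow (CV_16SC2, one vector per
// gridSize x gridSize block) into a dense CV_32FC2 map of size width x height.
static cv::Mat decodeBlockFlow(const cv::Mat& blockFlow, int width, int height, int gridSize)
{
    CV_Assert(blockFlow.type() == CV_16SC2);
    cv::Mat dense(height, width, CV_32FC2);
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            // Each pixel takes the vector of the block it belongs to (nearest neighbour).
            const cv::Vec2s v = blockFlow.at<cv::Vec2s>(y / gridSize, x / gridSize);
            // S10.5 fixed point has 5 fractional bits, so divide by 32 to get float pixels.
            dense.at<cv::Vec2f>(y, x) = cv::Vec2f(v[0] / 32.0f, v[1] / 32.0f);
        }
    }
    return dense;
}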
//
// BroxOpticalFlow
@@ -342,6 +383,70 @@ public:
bool useInitialFlow = false);
};
//
// NvidiaOpticalFlow
//
/** @brief Class for computing the optical flow vectors between two images using NVIDIA Optical Flow hardware and Optical Flow SDK 1.0.
@note
- A sample application demonstrating the use of NVIDIA Optical Flow can be found at
opencv_source_code/samples/gpu/nvidia_optical_flow.cpp
- An example application comparing accuracy and performance of NVIDIA Optical Flow with other optical flow algorithms in OpenCV can be found at
opencv_source_code/samples/gpu/optical_flow.cpp
*/
class CV_EXPORTS_W NvidiaOpticalFlow_1_0 : public NvidiaHWOpticalFlow
{
public:
/**
* Supported optical flow performance levels.
*/
enum NVIDIA_OF_PERF_LEVEL
{
NV_OF_PERF_LEVEL_UNDEFINED,
NV_OF_PERF_LEVEL_SLOW = 5, /**< Slow perf level results in lowest performance and best quality */
NV_OF_PERF_LEVEL_MEDIUM = 10, /**< Medium perf level results in low performance and medium quality */
NV_OF_PERF_LEVEL_FAST = 20, /**< Fast perf level results in high performance and low quality */
NV_OF_PERF_LEVEL_MAX
};
/** @brief The NVIDIA optical flow hardware generates flow vectors at the granularity gridSize, which can be queried via the function getGridSize().
* The upSampler() helper function converts the hardware-generated flow vectors to a dense representation (one flow vector per pixel)
* using nearest-neighbour upsampling.
@param flow Buffer of type CV_16SC2 containing the flow vectors generated by calc().
@param width Width of the input image in pixels for which these flow vectors were generated.
@param height Height of the input image in pixels for which these flow vectors were generated.
@param gridSize Granularity of the optical flow vectors returned by calc(). Can be queried using getGridSize().
@param upsampledFlow Buffer of type CV_32FC2 containing the upsampled flow vectors, one flow vector per pixel, in pitch-linear layout.
*/
CV_WRAP virtual void upSampler(InputArray flow, int width, int height,
int gridSize, InputOutputArray upsampledFlow) = 0;
/** @brief Instantiate NVIDIA Optical Flow. A minimal usage sketch is shown after this class declaration.
@param width Width of input image in pixels.
@param height Height of input image in pixels.
@param perfPreset Optional parameter. Refer to the [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about presets.
Defaults to NV_OF_PERF_LEVEL_SLOW.
@param enableTemporalHints Optional parameter. Flag to enable temporal hints. When set to true, the hardware uses the flow vectors
generated in the previous call to calc() as internal hints for the current call to calc().
Useful when computing flow vectors between successive video frames. Defaults to false.
@param enableExternalHints Optional parameter. Flag to enable passing an external hints buffer to calc(). Defaults to false.
@param enableCostBuffer Optional parameter. Flag to enable the cost buffer output from calc(). Defaults to false.
@param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0.
*/
CV_WRAP static Ptr<NvidiaOpticalFlow_1_0> create(
int width,
int height,
cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL perfPreset
= cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
bool enableTemporalHints = false,
bool enableExternalHints = false,
bool enableCostBuffer = false,
int gpuId = 0);
};
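A minimal usage sketch of this class, condensing what the bundled samples do. It assumes OpenCV was built with HAVE_NVIDIA_OPTFLOW and runs on a Turing-or-later GPU; frame0.png and frame1.png are placeholder paths.

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudaoptflow.hpp>

int main()
{
    // Hypothetical input pair; any two grayscale frames of equal size work.
    cv::Mat frame0 = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE);
    cv::Mat frame1 = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE);
    if (frame0.empty() || frame1.empty() || frame0.size() != frame1.size())
        return -1;

    // Create an NVOF 1.0 instance for this resolution (throws if unsupported).
    cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(
        frame0.cols, frame0.rows,
        cv::cuda::NvidiaOpticalFlow_1_0::NV_OF_PERF_LEVEL_SLOW);

    // Block-based CV_16SC2 flow, then upsample to dense CV_32FC2 per-pixel flow.
    cv::Mat blockFlow, denseFlow;
    nvof->calc(frame0, frame1, blockFlow);
    nvof->upSampler(blockFlow, frame0.cols, frame0.rows, nvof->getGridSize(), denseFlow);

    nvof->collectGarbage();
    return 0;
}

Passing cv::cuda::GpuMat inputs instead of cv::Mat avoids the implicit host-to-device upload that calc() otherwise performs.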
//! @}
}} // namespace cv { namespace cuda {
@@ -326,4 +326,57 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1,
}
}
//////////////////////////////////////////////////////
// NvidiaOpticalFlow_1_0
PERF_TEST_P(ImagePair, NvidiaOpticalFlow_1_0,
Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
{
declare.time(10);
const cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
const int width = frame0.size().width;
const int height = frame0.size().height;
const bool enableTemporalHints = false;
const bool enableExternalHints = false;
const bool enableCostBuffer = false;
const int gpuid = 0;
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_frame0(frame0);
const cv::cuda::GpuMat d_frame1(frame1);
cv::cuda::GpuMat d_flow;
cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
try
{
d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST,
enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
}
catch (const cv::Exception& e)
{
if(e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
throw SkipTestException("Current configuration is not supported");
throw;
}
TEST_CYCLE() d_nvof->calc(d_frame0, d_frame1, d_flow);
cv::cuda::GpuMat flow[2];
cv::cuda::split(d_flow, flow);
cv::cuda::GpuMat u = flow[0];
cv::cuda::GpuMat v = flow[1];
CUDA_SANITY_CHECK(u, 1e-10);
CUDA_SANITY_CHECK(v, 1e-10);
}
}
}} // namespace
#include <unordered_map>
#include <iostream>
#include <fstream>
#include <iomanip>
#include "opencv2/core.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/video/tracking.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
//this function is taken from opencv/samples/gpu/optical_flow.cpp
inline bool isFlowCorrect(Point2f u)
{
return !cvIsNaN(u.x) && !cvIsNaN(u.y) && fabs(u.x) < 1e9 && fabs(u.y) < 1e9;
}
//this function is taken from opencv/samples/gpu/optical_flow.cpp
static Vec3b computeColor(float fx, float fy)
{
static bool first = true;
// relative lengths of color transitions:
// these are chosen based on perceptual similarity
// (e.g. one can distinguish more shades between red and yellow
// than between yellow and green)
const int RY = 15;
const int YG = 6;
const int GC = 4;
const int CB = 11;
const int BM = 13;
const int MR = 6;
const int NCOLS = RY + YG + GC + CB + BM + MR;
static Vec3i colorWheel[NCOLS];
if (first)
{
int k = 0;
for (int i = 0; i < RY; ++i, ++k)
colorWheel[k] = Vec3i(255, 255 * i / RY, 0);
for (int i = 0; i < YG; ++i, ++k)
colorWheel[k] = Vec3i(255 - 255 * i / YG, 255, 0);
for (int i = 0; i < GC; ++i, ++k)
colorWheel[k] = Vec3i(0, 255, 255 * i / GC);
for (int i = 0; i < CB; ++i, ++k)
colorWheel[k] = Vec3i(0, 255 - 255 * i / CB, 255);
for (int i = 0; i < BM; ++i, ++k)
colorWheel[k] = Vec3i(255 * i / BM, 0, 255);
for (int i = 0; i < MR; ++i, ++k)
colorWheel[k] = Vec3i(255, 0, 255 - 255 * i / MR);
first = false;
}
const float rad = sqrt(fx * fx + fy * fy);
const float a = atan2(-fy, -fx) / (float)CV_PI;
const float fk = (a + 1.0f) / 2.0f * (NCOLS - 1);
const int k0 = static_cast<int>(fk);
const int k1 = (k0 + 1) % NCOLS;
const float f = fk - k0;
Vec3b pix;
for (int b = 0; b < 3; b++)
{
const float col0 = colorWheel[k0][b] / 255.0f;
const float col1 = colorWheel[k1][b] / 255.0f;
float col = (1 - f) * col0 + f * col1;
if (rad <= 1)
col = 1 - rad * (1 - col); // increase saturation with radius
else
col *= .75; // out of range
pix[2 - b] = static_cast<uchar>(255.0 * col);
}
return pix;
}
//this function is taken from opencv/samples/gpu/optical_flow.cpp
static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy
, Mat& dst, float maxmotion = -1)
{
dst.create(flowx.size(), CV_8UC3);
dst.setTo(Scalar::all(0));
// determine motion range:
float maxrad = maxmotion;
if (maxmotion <= 0)
{
maxrad = 1;
for (int y = 0; y < flowx.rows; ++y)
{
for (int x = 0; x < flowx.cols; ++x)
{
Point2f u(flowx(y, x), flowy(y, x));
if (!isFlowCorrect(u))
continue;
maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
}
}
}
for (int y = 0; y < flowx.rows; ++y)
{
for (int x = 0; x < flowx.cols; ++x)
{
Point2f u(flowx(y, x), flowy(y, x));
if (isFlowCorrect(u))
dst.at<Vec3b>(y, x) = computeColor(u.x / maxrad, u.y / maxrad);
}
}
}
int main(int argc, char **argv)
{
std::unordered_map<std::string, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL> presetMap = {
{ "slow", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW },
{ "medium", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_MEDIUM },
{ "fast", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST } };
try
{
CommandLineParser cmd(argc, argv,
"{ l left | ../data/basketball1.png | specify left image }"
"{ r right | ../data/basketball2.png | specify right image }"
"{ g gpuid | 0 | cuda device index}"
"{ p preset | slow | perf preset for OF algo [ options : slow, medium, fast ]}"
"{ o output | OpenCVNvOF.flo | output flow vector file in middlebury format}"
"{ th enableTemporalHints | false | Enable temporal hints}"
"{ eh enableExternalHints | false | Enable external hints}"
"{ cb enableCostBuffer | false | Enable output cost buffer}"
"{ h help | | print help message }");
cmd.about("Nvidia's optical flow sample.");
if (cmd.has("help") || !cmd.check())
{
cmd.printMessage();
cmd.printErrors();
return 0;
}
string pathL = cmd.get<string>("left");
string pathR = cmd.get<string>("right");
string preset = cmd.get<string>("preset");
string output = cmd.get<string>("output");
bool enableExternalHints = cmd.get<bool>("enableExternalHints");
bool enableTemporalHints = cmd.get<bool>("enableTemporalHints");
bool enableCostBuffer = cmd.get<bool>("enableCostBuffer");
int gpuId = cmd.get<int>("gpuid");
if (pathL.empty()) cout << "Specify left image path\n";
if (pathR.empty()) cout << "Specify right image path\n";
if (preset.empty()) cout << "Specify perf preset for OpticalFlow algo\n";
if (pathL.empty() || pathR.empty()) return 0;
auto search = presetMap.find(preset);
if (search == presetMap.end())
{
std::cout << "Invalid preset level : " << preset << std::endl;
return 0;
}
NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL perfPreset = search->second;
Mat frameL = imread(pathL, IMREAD_GRAYSCALE);
Mat frameR = imread(pathR, IMREAD_GRAYSCALE);
if (frameL.empty()) cout << "Can't open '" << pathL << "'\n";
if (frameR.empty()) cout << "Can't open '" << pathR << "'\n";
if (frameL.empty() || frameR.empty()) return -1;
Ptr<NvidiaOpticalFlow_1_0> nvof = NvidiaOpticalFlow_1_0::create(
frameL.size().width, frameL.size().height, perfPreset,
enableTemporalHints, enableExternalHints, enableCostBuffer, gpuId);
Mat flowx, flowy, flowxy, upsampledFlowXY, image;
nvof->calc(frameL, frameR, flowxy);
nvof->upSampler(flowxy, frameL.size().width, frameL.size().height,
nvof->getGridSize(), upsampledFlowXY);
if (output.size() != 0)
{
if (!writeOpticalFlow(output, upsampledFlowXY))
cout << "Failed to save Flow Vector" << endl;
else
cout << "Flow vector saved as '" << output << "'\n";
}
Mat planes[] = { flowx, flowy };
split(upsampledFlowXY, planes);
flowx = planes[0]; flowy = planes[1];
drawOpticalFlow(flowx, flowy, image, 10);
imshow("Colorize image",image);
waitKey(0);
nvof->collectGarbage();
}
catch (const std::exception &ex)
{
std::cout << ex.what() << std::endl;
return 1;
}
return 0;
}
\ No newline at end of file
#include <iostream>
#include <fstream>
#include "opencv2/core.hpp"
#include <opencv2/core/utility.hpp>
#include "opencv2/highgui.hpp"
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/cudaarithm.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
inline bool isFlowCorrect(Point2f u)
{
return !cvIsNaN(u.x) && !cvIsNaN(u.y) && fabs(u.x) < 1e9 && fabs(u.y) < 1e9;
}
static Vec3b computeColor(float fx, float fy)
{
static bool first = true;
// relative lengths of color transitions:
// these are chosen based on perceptual similarity
// (e.g. one can distinguish more shades between red and yellow
// than between yellow and green)
const int RY = 15;
const int YG = 6;
const int GC = 4;
const int CB = 11;
const int BM = 13;
const int MR = 6;
const int NCOLS = RY + YG + GC + CB + BM + MR;
static Vec3i colorWheel[NCOLS];
if (first)
{
int k = 0;
for (int i = 0; i < RY; ++i, ++k)
colorWheel[k] = Vec3i(255, 255 * i / RY, 0);
for (int i = 0; i < YG; ++i, ++k)
colorWheel[k] = Vec3i(255 - 255 * i / YG, 255, 0);
for (int i = 0; i < GC; ++i, ++k)
colorWheel[k] = Vec3i(0, 255, 255 * i / GC);
for (int i = 0; i < CB; ++i, ++k)
colorWheel[k] = Vec3i(0, 255 - 255 * i / CB, 255);
for (int i = 0; i < BM; ++i, ++k)
colorWheel[k] = Vec3i(255 * i / BM, 0, 255);
for (int i = 0; i < MR; ++i, ++k)
colorWheel[k] = Vec3i(255, 0, 255 - 255 * i / MR);
first = false;
}
const float rad = sqrt(fx * fx + fy * fy);
const float a = atan2(-fy, -fx) / (float)CV_PI;
const float fk = (a + 1.0f) / 2.0f * (NCOLS - 1);
const int k0 = static_cast<int>(fk);
const int k1 = (k0 + 1) % NCOLS;
const float f = fk - k0;
Vec3b pix;
for (int b = 0; b < 3; b++)
{
const float col0 = colorWheel[k0][b] / 255.0f;
const float col1 = colorWheel[k1][b] / 255.0f;
float col = (1 - f) * col0 + f * col1;
if (rad <= 1)
col = 1 - rad * (1 - col); // increase saturation with radius
else
col *= .75; // out of range
pix[2 - b] = static_cast<uchar>(255.0 * col);
}
return pix;
}
static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy, Mat& dst, float maxmotion = -1)
{
dst.create(flowx.size(), CV_8UC3);
dst.setTo(Scalar::all(0));
// determine motion range:
float maxrad = maxmotion;
if (maxmotion <= 0)
{
maxrad = 1;
for (int y = 0; y < flowx.rows; ++y)
{
for (int x = 0; x < flowx.cols; ++x)
{
Point2f u(flowx(y, x), flowy(y, x));
if (!isFlowCorrect(u))
continue;
maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
}
}
}
for (int y = 0; y < flowx.rows; ++y)
{
for (int x = 0; x < flowx.cols; ++x)
{
Point2f u(flowx(y, x), flowy(y, x));
if (isFlowCorrect(u))
dst.at<Vec3b>(y, x) = computeColor(u.x / maxrad, u.y / maxrad);
}
}
}
static void showFlow(const char* name, const GpuMat& d_flow)
{
GpuMat planes[2];
cuda::split(d_flow, planes);
Mat flowx(planes[0]);
Mat flowy(planes[1]);
Mat out;
drawOpticalFlow(flowx, flowy, out, 10);
imshow(name, out);
}
int main(int argc, const char* argv[])
{
string filename1, filename2;
if (argc < 3)
{
cerr << "Usage : " << argv[0] << " <frame0> <frame1>" << endl;
filename1 = "../data/basketball1.png";
filename2 = "../data/basketball2.png";
}
else
{
filename1 = argv[1];
filename2 = argv[2];
}
Mat frame0 = imread(filename1, IMREAD_GRAYSCALE);
Mat frame1 = imread(filename2, IMREAD_GRAYSCALE);
if (frame0.empty())
{
cerr << "Can't open image [" << filename1 << "]" << endl;
return -1;
}
if (frame1.empty())
{
cerr << "Can't open image [" << filename2 << "]" << endl;
return -1;
}
if (frame1.size() != frame0.size())
{
cerr << "Images should be of equal sizes" << endl;
return -1;
}
GpuMat d_frame0(frame0);
GpuMat d_frame1(frame1);
GpuMat d_flow(frame0.size(), CV_32FC2), d_flowxy;
Ptr<cuda::BroxOpticalFlow> brox = cuda::BroxOpticalFlow::create(0.197f, 50.0f, 0.8f, 10, 77, 10);
Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();
Ptr<cuda::NvidiaOpticalFlow_1_0> nvof = cuda::NvidiaOpticalFlow_1_0::create(
frame0.size().width, frame0.size().height, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST);
{
GpuMat d_frame0f;
GpuMat d_frame1f;
d_frame0.convertTo(d_frame0f, CV_32F, 1.0 / 255.0);
d_frame1.convertTo(d_frame1f, CV_32F, 1.0 / 255.0);
const int64 start = getTickCount();
brox->calc(d_frame0f, d_frame1f, d_flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "Brox : " << timeSec << " sec" << endl;
showFlow("Brox", d_flow);
}
{
const int64 start = getTickCount();
lk->calc(d_frame0, d_frame1, d_flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "LK : " << timeSec << " sec" << endl;
showFlow("LK", d_flow);
}
{
const int64 start = getTickCount();
farn->calc(d_frame0, d_frame1, d_flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "Farn : " << timeSec << " sec" << endl;
showFlow("Farn", d_flow);
}
{
const int64 start = getTickCount();
tvl1->calc(d_frame0, d_frame1, d_flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "TVL1 : " << timeSec << " sec" << endl;
showFlow("TVL1", d_flow);
}
{
//The timing displayed below includes the time taken to copy the input buffers to the OF CUDA input buffers
//and to copy the output buffers from the OF CUDA output buffer to the output buffer.
//Hence it is expected to be more than what is displayed in the NVIDIA Optical Flow SDK documentation.
const int64 start = getTickCount();
nvof->calc(d_frame0, d_frame1, d_flowxy);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "NVIDIAOpticalFlow : " << timeSec << " sec" << endl;
nvof->upSampler(d_flowxy, frame0.size().width, frame0.size().height,
nvof->getGridSize(), d_flow);
showFlow("NVIDIAOpticalFlow", d_flow);
}
imshow("Frame 0", frame0);
imshow("Frame 1", frame1);
waitKey();
return 0;
}
//
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
Ptr<NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, int, NVIDIA_OF_PERF_LEVEL, bool, bool) { throw_no_cuda(); return Ptr<NvidiaOpticalFlow_1_0>(); }
#elif !defined HAVE_NVIDIA_OPTFLOW
CV_Error(cv::Error::HeaderIsNull, "Nvidia Optical Flow headers not found. Make sure cmake downloads it properly");
#else
#include "nvOpticalFlowCommon.h"
#include "nvOpticalFlowCuda.h"
#if defined(_WIN32) || defined(_WIN64)
#include <Windows.h>
#else
#define HMODULE void *
#define _stricmp strcasecmp
#include <dlfcn.h>
#endif
//macro for dll loading
#if defined(_WIN64)
#define MODULENAME TEXT("nvofapi64.dll")
#elif defined(_WIN32)
#define MODULENAME TEXT("nvofapi.dll")
#else
#define MODULENAME "libnvidia-opticalflow.so.1"
#endif
#define NVOF_API_CALL(nvOFAPI) \
do \
{ \
NV_OF_STATUS errorCode = nvOFAPI; \
std::ostringstream errorLog; \
if(errorCode != NV_OF_SUCCESS) \
{ \
switch (errorCode) \
{ \
case 1: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_OF_NOT_AVAILABLE"; \
CV_Error(Error::StsBadFunc, errorLog.str()); \
break; \
case 2: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_UNSUPPORTED_DEVICE"; \
CV_Error(Error::StsBadArg, errorLog.str()); \
break; \
case 3: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_DEVICE_DOES_NOT_EXIST"; \
CV_Error(Error::StsBadArg, errorLog.str()); \
break; \
case 4: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_INVALID_PTR"; \
CV_Error(Error::StsNullPtr, errorLog.str()); \
break; \
case 5: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_INVALID_PARAM"; \
CV_Error(Error::StsBadArg, errorLog.str()); \
break; \
case 6: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_INVALID_CALL"; \
CV_Error(Error::BadCallBack, errorLog.str()); \
break; \
case 7: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_INVALID_VERSION"; \
CV_Error(Error::StsError, errorLog.str()); \
break; \
case 8: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_OUT_OF_MEMORY"; \
CV_Error(Error::StsNoMem, errorLog.str()); \
break; \
case 9: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_NOT_INITIALIZED"; \
CV_Error(Error::StsBadArg, errorLog.str()); \
break; \
case 10: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_UNSUPPORTED_FEATURE"; \
CV_Error(Error::StsBadArg, errorLog.str()); \
break; \
case 11: \
errorLog << #nvOFAPI << " returned error " << (unsigned int)errorCode; \
errorLog << ":NV_OF_ERR_GENERIC"; \
CV_Error(Error::StsInternal, errorLog.str()); \
break; \
default: \
break; \
} \
} \
} while (0)
using namespace std;
using namespace cv;
using namespace cv::cuda;
namespace
{
class NvidiaOpticalFlowImpl : public cv::cuda::NvidiaOpticalFlow_1_0
{
private:
int m_width;
int m_height;
NV_OF_PERF_LEVEL m_preset;
bool m_enableTemporalHints;
bool m_enableExternalHints;
bool m_enableCostBuffer;
int m_gpuId;
CUcontext m_cuContext;
NV_OF_BUFFER_FORMAT m_format;
NV_OF_OUTPUT_VECTOR_GRID_SIZE m_gridSize;
NV_OF_BUFFER_DESCRIPTOR m_inputBufferDesc;
NV_OF_BUFFER_DESCRIPTOR m_outputBufferDesc;
NV_OF_BUFFER_DESCRIPTOR m_hintBufferDesc;
NV_OF_BUFFER_DESCRIPTOR m_costBufferDesc;
uint32_t m_outputElementSize;
uint32_t m_costBufElementSize;
uint32_t m_hintBufElementSize;
NV_OF_INIT_PARAMS m_initParams;
std::unique_ptr<NV_OF_CUDA_API_FUNCTION_LIST> m_ofAPI;
NvOFHandle m_hOF; //nvof handle
NvOFGPUBufferHandle m_hInputBuffer;
NvOFGPUBufferHandle m_hReferenceBuffer;
NvOFGPUBufferHandle m_hOutputBuffer;
NvOFGPUBufferHandle m_hHintBuffer;
NvOFGPUBufferHandle m_hCostBuffer;
CUdeviceptr m_frame0cuDevPtr;
CUdeviceptr m_frame1cuDevPtr;
CUdeviceptr m_flowXYcuDevPtr;
CUdeviceptr m_hintcuDevPtr;
CUdeviceptr m_costcuDevPtr;
NV_OF_CUDA_BUFFER_STRIDE_INFO m_inputBufferStrideInfo;
NV_OF_CUDA_BUFFER_STRIDE_INFO m_referenceBufferStrideInfo;
NV_OF_CUDA_BUFFER_STRIDE_INFO m_outputBufferStrideInfo;
NV_OF_CUDA_BUFFER_STRIDE_INFO m_hintBufferStrideInfo;
NV_OF_CUDA_BUFFER_STRIDE_INFO m_costBufferStrideInfo;
NV_OF_CUDA_API_FUNCTION_LIST* GetAPI()
{
std::lock_guard<std::mutex> lock(m_lock);
return m_ofAPI.get();
}
NvOFHandle GetHandle() { return m_hOF; }
protected:
HMODULE m_hModule; //module handle to load nvof dll
std::mutex m_lock;
public:
NvidiaOpticalFlowImpl(int width, int height, NV_OF_PERF_LEVEL perfPreset,
bool bEnableTemporalHints, bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId);
virtual void calc(InputArray inputImage, InputArray referenceImage,
InputOutputArray flow, Stream& stream = Stream::Null(),
InputArray hint = cv::noArray(), OutputArray cost = cv::noArray());
virtual void collectGarbage();
virtual void upSampler(InputArray flow, int width, int height,
int gridSize, InputOutputArray upsampledFlow);
virtual int getGridSize() const { return m_gridSize; }
};
NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(
int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId) :
m_width(width), m_height(height), m_preset(perfPreset),
m_enableTemporalHints((NV_OF_BOOL)bEnableTemporalHints),
m_enableExternalHints((NV_OF_BOOL)bEnableExternalHints),
m_enableCostBuffer((NV_OF_BOOL)bEnableCostBuffer), m_gpuId(gpuId),
m_cuContext(nullptr), m_format(NV_OF_BUFFER_FORMAT_GRAYSCALE8),
m_gridSize(NV_OF_OUTPUT_VECTOR_GRID_SIZE_4)
{
int nGpu = 0;
cuSafeCall(cudaGetDeviceCount(&nGpu));
if (m_gpuId < 0 || m_gpuId >= nGpu)
{
CV_Error(Error::StsBadArg, "Invalid GPU Ordinal");
}
cuSafeCall(cudaSetDevice(m_gpuId));
cuSafeCall(cudaFree(m_cuContext));
cuSafeCall(cuCtxGetCurrent(&m_cuContext));
if (m_gridSize != NV_OF_OUTPUT_VECTOR_GRID_SIZE_4)
{
CV_Error(Error::StsBadArg, "Unsupported grid size");
}
auto nOutWidth = (m_width + m_gridSize - 1) / m_gridSize;
auto nOutHeight = (m_height + m_gridSize - 1) / m_gridSize;
auto outBufFmt = NV_OF_BUFFER_FORMAT_SHORT2;
memset(&m_inputBufferDesc, 0, sizeof(m_inputBufferDesc));
m_inputBufferDesc.width = m_width;
m_inputBufferDesc.height = m_height;
m_inputBufferDesc.bufferFormat = m_format;
m_inputBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_INPUT;
memset(&m_outputBufferDesc, 0, sizeof(m_outputBufferDesc));
m_outputBufferDesc.width = nOutWidth;
m_outputBufferDesc.height = nOutHeight;
m_outputBufferDesc.bufferFormat = outBufFmt;
m_outputBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_OUTPUT;
m_outputElementSize = sizeof(NV_OF_FLOW_VECTOR);
if (m_enableExternalHints)
{
memset(&m_hintBufferDesc, 0, sizeof(m_hintBufferDesc));
m_hintBufferDesc.width = nOutWidth;
m_hintBufferDesc.height = nOutHeight;
m_hintBufferDesc.bufferFormat = outBufFmt;
m_hintBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_HINT;
m_hintBufElementSize = m_outputElementSize;
}
if (m_enableCostBuffer)
{
memset(&m_costBufferDesc, 0, sizeof(m_costBufferDesc));
m_costBufferDesc.width = nOutWidth;
m_costBufferDesc.height = nOutHeight;
m_costBufferDesc.bufferFormat = NV_OF_BUFFER_FORMAT_UINT;
m_costBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_COST;
m_costBufElementSize = sizeof(uint32_t);
}
#if defined(_WIN32) || defined(_WIN64)
HMODULE hModule = LoadLibrary(MODULENAME);
#else
void *hModule = dlopen(MODULENAME, RTLD_LAZY);
#endif
if (hModule == NULL)
{
CV_Error(Error::StsBadFunc,
"Cannot find NvOF library.");
}
m_hModule = hModule;
typedef NV_OF_STATUS(NVOFAPI *PFNNvOFAPICreateInstanceCuda)
(uint32_t apiVer, NV_OF_CUDA_API_FUNCTION_LIST* cudaOf);
#if defined(_WIN32)
PFNNvOFAPICreateInstanceCuda NvOFAPICreateInstanceCuda
= (PFNNvOFAPICreateInstanceCuda)GetProcAddress(m_hModule, "NvOFAPICreateInstanceCuda");
#else
PFNNvOFAPICreateInstanceCuda NvOFAPICreateInstanceCuda
= (PFNNvOFAPICreateInstanceCuda)dlsym(m_hModule, "NvOFAPICreateInstanceCuda");
#endif
if (!NvOFAPICreateInstanceCuda)
{
CV_Error(Error::StsBadFunc,
"Cannot find NvOFAPICreateInstanceCuda() entry in NVOF library");
}
m_ofAPI.reset(new NV_OF_CUDA_API_FUNCTION_LIST());
NVOF_API_CALL(NvOFAPICreateInstanceCuda(NV_OF_API_VERSION, m_ofAPI.get()));
NVOF_API_CALL(GetAPI()->nvCreateOpticalFlowCuda(m_cuContext, &m_hOF));
memset(&m_initParams, 0, sizeof(m_initParams));
m_initParams.width = m_inputBufferDesc.width;
m_initParams.height = m_inputBufferDesc.height;
m_initParams.enableExternalHints = (NV_OF_BOOL)m_enableExternalHints;
m_initParams.enableOutputCost = (NV_OF_BOOL)m_enableCostBuffer;
m_initParams.hintGridSize = (NV_OF_BOOL)m_enableExternalHints == NV_OF_TRUE ?
NV_OF_HINT_VECTOR_GRID_SIZE_4 : NV_OF_HINT_VECTOR_GRID_SIZE_UNDEFINED;
m_initParams.outGridSize = m_gridSize;
m_initParams.mode = NV_OF_MODE_OPTICALFLOW;
m_initParams.perfLevel = m_preset;
NVOF_API_CALL(GetAPI()->nvOFInit(GetHandle(), &m_initParams));
//Input Buffer 1
NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
&m_inputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hInputBuffer));
m_frame0cuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hInputBuffer);
NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
m_hInputBuffer, &m_inputBufferStrideInfo));
//Input Buffer 2
NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
&m_inputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hReferenceBuffer));
m_frame1cuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hReferenceBuffer);
NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
m_hReferenceBuffer, &m_referenceBufferStrideInfo));
//Output Buffer
NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
&m_outputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hOutputBuffer));
m_flowXYcuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hOutputBuffer);
NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
m_hOutputBuffer, &m_outputBufferStrideInfo));
//Hint Buffer
if (m_enableExternalHints)
{
NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
&m_hintBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hHintBuffer));
m_hintcuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hHintBuffer);
NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
m_hHintBuffer, &m_hintBufferStrideInfo));
}
//Cost Buffer
if (m_enableCostBuffer)
{
NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
&m_costBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hCostBuffer));
m_costcuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hCostBuffer);
NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
m_hCostBuffer, &m_costBufferStrideInfo));
}
}
void NvidiaOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow,
Stream& stream, InputArray hint, OutputArray cost)
{
Stream inputStream = {};
Stream outputStream = {};
if (stream)
inputStream = stream;
NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(),
StreamAccessor::getStream(inputStream), StreamAccessor::getStream(outputStream)));
GpuMat frame0GpuMat(_frame0.size(), _frame0.type(), (void*)m_frame0cuDevPtr,
m_inputBufferStrideInfo.strideInfo[0].strideXInBytes);
GpuMat frame1GpuMat(_frame1.size(), _frame1.type(), (void*)m_frame1cuDevPtr,
m_referenceBufferStrideInfo.strideInfo[0].strideXInBytes);
GpuMat flowXYGpuMat(Size((m_width + m_gridSize - 1) / m_gridSize,
(m_height + m_gridSize - 1) / m_gridSize), CV_16SC2,
(void*)m_flowXYcuDevPtr, m_outputBufferStrideInfo.strideInfo[0].strideXInBytes);
//check whether frame0 is Mat or GpuMat
if (_frame0.isMat())
{
//Get Mats from InputArrays
frame0GpuMat.upload(_frame0);
}
else if (_frame0.isGpuMat())
{
//Get GpuMats from InputArrays
_frame0.copyTo(frame0GpuMat);
}
else
{
CV_Error(Error::StsBadArg,
"Incorrect input. Pass input image (frame0) as Mat or GpuMat");
}
//check whether frame1 is Mat or GpuMat
if (_frame1.isMat())
{
//Get Mats from InputArrays
frame1GpuMat.upload(_frame1);
}
else if (_frame1.isGpuMat())
{
//Get GpuMats from InputArrays
_frame1.copyTo(frame1GpuMat);
}
else
{
CV_Error(Error::StsBadArg,
"Incorrect input. Pass reference image (frame1) as Mat or GpuMat");
}
if (m_enableExternalHints)
{
GpuMat hintGpuMat(hint.size(), hint.type(), (void*)m_hintcuDevPtr,
m_hintBufferStrideInfo.strideInfo[0].strideXInBytes);
if (hint.isMat())
{
//Get Mat from InputArray hint
hintGpuMat.upload(hint);
}
else if(hint.isGpuMat())
{
//Get GpuMat from InputArray hint
hint.copyTo(hintGpuMat);
}
else
{
CV_Error(Error::StsBadArg,"Incorrect hint buffer passed. Pass Mat or GpuMat");
}
}
cuSafeCall(cuCtxPushCurrent(m_cuContext));
inputStream.waitForCompletion();
cuSafeCall(cuCtxPopCurrent(&m_cuContext));
//Execute Call
NV_OF_EXECUTE_INPUT_PARAMS exeInParams;
NV_OF_EXECUTE_OUTPUT_PARAMS exeOutParams;
memset(&exeInParams, 0, sizeof(exeInParams));
exeInParams.inputFrame = m_hInputBuffer;
exeInParams.referenceFrame = m_hReferenceBuffer;
exeInParams.disableTemporalHints = (NV_OF_BOOL)m_enableTemporalHints == NV_OF_TRUE ?
NV_OF_FALSE : NV_OF_TRUE;
exeInParams.externalHints = m_initParams.enableExternalHints == NV_OF_TRUE ?
m_hHintBuffer : nullptr;
memset(&exeOutParams, 0, sizeof(exeOutParams));
exeOutParams.outputBuffer = m_hOutputBuffer;
exeOutParams.outputCostBuffer = m_initParams.enableOutputCost == NV_OF_TRUE ?
m_hCostBuffer : nullptr;
NVOF_API_CALL(GetAPI()->nvOFExecute(GetHandle(), &exeInParams, &exeOutParams));
cuSafeCall(cuCtxPushCurrent(m_cuContext));
outputStream.waitForCompletion();
cuSafeCall(cuCtxPopCurrent(&m_cuContext));
if (_flow.isMat())
flowXYGpuMat.download(_flow);
else if(_flow.isGpuMat())
flowXYGpuMat.copyTo(_flow);
else
CV_Error(Error::StsBadArg, "Incorrect flow buffer passed. Pass Mat or GpuMat");
if (m_enableCostBuffer)
{
GpuMat costGpuMat(Size((m_width + m_gridSize - 1) / m_gridSize,
(m_height + m_gridSize - 1) / m_gridSize), CV_32SC1, (void*)m_costcuDevPtr,
m_costBufferStrideInfo.strideInfo[0].strideXInBytes);
if (cost.isMat())
costGpuMat.download(cost);
else if(cost.isGpuMat())
costGpuMat.copyTo(cost);
else
CV_Error(Error::StsBadArg, "Incorrect cost buffer passed. Pass Mat or GpuMat");
}
cuSafeCall(cuCtxSynchronize());
}
void NvidiaOpticalFlowImpl::collectGarbage()
{
if (m_hInputBuffer)
{
NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hInputBuffer));
}
if (m_hReferenceBuffer)
{
NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hReferenceBuffer));
}
if (m_hOutputBuffer)
{
NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hOutputBuffer));
}
if (m_enableExternalHints)
{
if (m_hHintBuffer)
{
NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hHintBuffer));
}
}
if (m_enableCostBuffer)
{
if (m_hCostBuffer)
{
NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hCostBuffer));
}
}
if (m_hOF)
{
NVOF_API_CALL(GetAPI()->nvOFDestroy(m_hOF));
}
if (m_cuContext)
{
cuSafeCall(cudaDeviceReset());
m_cuContext = nullptr;
}
}
void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height,
int gridSize, InputOutputArray upsampledFlow)
{
Mat flow;
if (_flow.isMat())
{
_flow.copyTo(flow);
}
else if (_flow.isGpuMat())
{
GpuMat __flow = _flow.getGpuMat();
__flow.download(flow);
}
else
{
CV_Error(Error::StsBadArg,
"Incorrect flow buffer passed. Pass either Mat or GpuMat");
}
std::unique_ptr<float[]> flowVectors = nullptr;
const NV_OF_FLOW_VECTOR* _flowVectors = static_cast<const NV_OF_FLOW_VECTOR*>((const void*)flow.data);
flowVectors.reset(new float[2 * width * height]);
for (int y = 0; y < height; ++y)
{
for (int x = 0; x < width; ++x)
{
uint32_t blockIdX = x / gridSize;
uint32_t blockIdY = y / gridSize;
uint32_t widthInBlocks = ((width + gridSize - 1) / gridSize);
uint32_t heightInBlocks = ((height + gridSize - 1) / gridSize);
if ((blockIdX < widthInBlocks) && (blockIdY < heightInBlocks))
{
flowVectors[(y * 2 * width) + 2 * x] = (float)
(_flowVectors[blockIdX + (blockIdY * widthInBlocks)].flowx / (float)(1 << 5));
flowVectors[(y * 2 * width) + 2 * x + 1] = (float)
(_flowVectors[blockIdX + (blockIdY * widthInBlocks)].flowy / (float)(1 << 5));
}
}
}
Mat output(Size(width, height), CV_32FC2, flowVectors.get());
if (upsampledFlow.isMat())
{
output.copyTo(upsampledFlow);
}
else if (upsampledFlow.isGpuMat())
{
GpuMat _output(output);
_output.copyTo(upsampledFlow);
}
else
{
CV_Error(Error::StsBadArg,
"Incorrect flow buffer passed for upsampled flow. Pass either Mat or GpuMat");
}
}
} // anonymous namespace
Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(
int width, int height, NVIDIA_OF_PERF_LEVEL perfPreset,
bool bEnableTemporalHints, bool bEnableExternalHints,
bool bEnableCostBuffer, int gpuId)
{
return makePtr<NvidiaOpticalFlowImpl>(
width,
height,
(NV_OF_PERF_LEVEL)perfPreset,
bEnableTemporalHints,
bEnableExternalHints,
bEnableCostBuffer,
gpuId);
}
#endif
\ No newline at end of file
@@ -409,6 +409,106 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
ALL_DEVICES,
testing::Values(Gamma(0.0), Gamma(1.0))));
//////////////////////////////////////////////////////
// NvidiaOpticalFlow_1_0
struct NvidiaOpticalFlow_1_0 : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
};
CUDA_TEST_P(NvidiaOpticalFlow_1_0, Regression)
{
cv::Mat frame0 = readImage("opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
const int width = frame0.size().width;
const int height = frame0.size().height;
const bool enableTemporalHints = false;
const bool enableExternalHints = false;
const bool enableCostBuffer = false;
const int gpuid = 0;
cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
try
{
d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
}
catch (const cv::Exception& e)
{
if (e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
throw SkipTestException("Current configuration is not supported");
throw;
}
const int gridSize = d_nvof->getGridSize();
Mat flow, upsampledFlow;
d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
d_nvof->upSampler(flow, width, height, gridSize, upsampledFlow);
std::string fname(cvtest::TS::ptr()->get_data_path());
fname += "opticalflow/nvofGolden.flo";
cv::Mat golden = cv::readOpticalFlow(fname.c_str());
ASSERT_FALSE(golden.empty());
EXPECT_MAT_SIMILAR(golden, upsampledFlow, 1e-10);
}
CUDA_TEST_P(NvidiaOpticalFlow_1_0, OpticalFlowNan)
{
cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
cv::Mat r_frame0, r_frame1;
const int width = frame0.size().width;
const int height = frame0.size().height;
const bool enableTemporalHints = false;
const bool enableExternalHints = false;
const bool enableCostBuffer = false;
const int gpuid = 0;
cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
try
{
d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
}
catch (const cv::Exception& e)
{
if (e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
throw SkipTestException("Current configuration is not supported");
throw;
}
Mat flow, flowx, flowy;
d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
Mat planes[] = { flowx, flowy };
split(flow, planes);
flowx = planes[0]; flowy = planes[1];
EXPECT_TRUE(cv::checkRange(flowx));
EXPECT_TRUE(cv::checkRange(flowy));
};
INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, NvidiaOpticalFlow_1_0, ALL_DEVICES);
}} // namespace
#endif // HAVE_CUDA