Commit 5e0783e1 authored by Vishal Chiluka, committed by Vishal Bhaskar Chiluka

NVIDIA Optical Flow Integration in OpenCV

parent f0d30f2c
@@ -7,3 +7,22 @@ set(the_description "CUDA-accelerated Optical Flow")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudaoptflow opencv_video opencv_optflow opencv_cudaarithm opencv_cudawarping opencv_cudaimgproc OPTIONAL opencv_cudalegacy WRAP python)
set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT "79c6cee80a2df9a196f20afd6b598a9810964c32")
set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_MD5 "ca5acedee6cb45d0ec610a6732de5c15")
set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH "${OpenCV_BINARY_DIR}/3rdparty/NVIDIAOpticalFlowSDK_1_0_Headers")
ocv_download(FILENAME "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT}.zip"
             HASH ${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_MD5}
             URL
               "https://github.com/NVIDIA/NVIDIAOpticalFlowSDK/archive/"
             DESTINATION_DIR "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH}"
             STATUS NVIDIA_OPTICAL_FLOW_1_0_HEADERS_DOWNLOAD_SUCCESS
             ID "NVIDIA_OPTICAL_FLOW"
             UNPACK RELATIVE_URL)
if(NOT NVIDIA_OPTICAL_FLOW_1_0_HEADERS_DOWNLOAD_SUCCESS)
  message(STATUS "Failed to download NVIDIA_Optical_Flow_1_0 Headers")
else()
  add_definitions(-DHAVE_NVIDIA_OPTFLOW=1)
  ocv_include_directories(SYSTEM "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH}/NVIDIAOpticalFlowSDK-${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT}")
endif()
\ No newline at end of file
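When the header download succeeds, the CMake above defines HAVE_NVIDIA_OPTFLOW for the module. A minimal sketch, assuming nothing beyond that definition, of how implementation code can be guarded so builds without the SDK headers still compile; the function name and message are illustrative, not taken from this commit:

#include <opencv2/core.hpp>

// Hedged sketch: guard SDK-dependent code on the HAVE_NVIDIA_OPTFLOW definition set by CMake above.
static void checkNvidiaOptFlowSupport()
{
#ifdef HAVE_NVIDIA_OPTFLOW
    // SDK headers are available; the real, hardware-backed implementation can be built here.
#else
    // Without the headers, fail gracefully at run time instead of breaking the build.
    CV_Error(cv::Error::StsNotImplemented,
             "OpenCV was built without the NVIDIA Optical Flow SDK headers");
#endif
}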
@@ -102,6 +102,47 @@ public:
OutputArray err = cv::noArray(),
Stream& stream = Stream::Null()) = 0;
};
/** @brief Base interface for optical flow algorithms using the NVIDIA Optical Flow SDK.
*/
class CV_EXPORTS_W NvidiaHWOpticalFlow : public Algorithm
{
public:
/** @brief Calculates Optical Flow using NVIDIA Optical Flow SDK.
* NVIDIA GPUs starting with Turing contain a dedicated hardware accelerator for computing optical flow vectors between pairs of images.
* The optical flow hardware accelerator generates block-based optical flow vectors.
* The size of the block depends on the hardware in use, and can be queried using the function getGridSize().
* The block-based flow vectors generated by the hardware can be converted to a dense representation (i.e. per-pixel flow vectors) using the upSampler() helper function, if needed.
* The flow vectors are stored in CV_16SC2 format with the x and y components of each flow vector in 16-bit signed fixed-point representation S10.5 (a small conversion sketch is given after this class declaration).
@param inputImage Input image.
@param referenceImage Reference image of the same size and the same type as the input image.
@param flow A buffer consisting of inputImage.Size() / getGridSize() flow vectors in CV_16SC2 format.
@param stream Stream for the asynchronous version.
@param hint Hint buffer if the client provides external hints. Must have the same size as the flow buffer.
The caller can provide flow vectors as hints for the optical flow calculation.
@param cost Cost buffer containing numbers indicating the confidence associated with each of the generated flow vectors.
The higher the cost, the lower the confidence. The cost buffer is of type CV_32SC1.
@note
- Clients must use critical sections around each calc() call if calling it from multiple threads.
*/
CV_WRAP virtual void calc(
InputArray inputImage,
InputArray referenceImage,
InputOutputArray flow,
Stream& stream = Stream::Null(),
InputArray hint = cv::noArray(),
OutputArray cost = cv::noArray()) = 0;
/** @brief Releases all buffers, contexts and device pointers.
*/
CV_WRAP virtual void collectGarbage() = 0;
/** @brief Returns grid size of output buffer as per the hardware's capability.
*/
CV_WRAP virtual int getGridSize() const = 0;
};
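The S10.5 fixed-point format documented above carries 5 fractional bits, so each component converts to a floating-point pixel displacement by dividing by 32. A minimal host-side sketch of that conversion, assuming the calc() output has already been downloaded into a cv::Mat (variable and function names are illustrative; upSampler() in the derived class performs this scaling, plus upsampling, for you):

#include <opencv2/core.hpp>

// Convert calc() output (CV_16SC2, S10.5 fixed point) into float flow components.
cv::Mat toFloatFlow(const cv::Mat& blockFlow)
{
    CV_Assert(blockFlow.type() == CV_16SC2);
    cv::Mat floatFlow;
    blockFlow.convertTo(floatFlow, CV_32F, 1.0 / 32.0); // 1/2^5 undoes the S10.5 scaling
    return floatFlow;                                    // CV_32FC2, still one vector per block
}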
//
// BroxOpticalFlow
@@ -342,6 +383,70 @@ public:
bool useInitialFlow = false);
};
//
// NvidiaOpticalFlow
//
/** @brief Class for computing the optical flow vectors between two images using NVIDIA Optical Flow hardware and Optical Flow SDK 1.0.
@note
- A sample application demonstrating the use of NVIDIA Optical Flow can be found at
opencv_source_code/samples/gpu/nvidia_optical_flow.cpp
- An example application comparing accuracy and performance of NVIDIA Optical Flow with other optical flow algorithms in OpenCV can be found at
opencv_source_code/samples/gpu/optical_flow.cpp
*/
class CV_EXPORTS_W NvidiaOpticalFlow_1_0 : public NvidiaHWOpticalFlow
{
public:
/**
* Supported optical flow performance levels.
*/
enum NVIDIA_OF_PERF_LEVEL
{
NV_OF_PERF_LEVEL_UNDEFINED,
NV_OF_PERF_LEVEL_SLOW = 5, /**< Slow perf level results in lowest performance and best quality */
NV_OF_PERF_LEVEL_MEDIUM = 10, /**< Medium perf level results in low performance and medium quality */
NV_OF_PERF_LEVEL_FAST = 20, /**< Fast perf level results in high performance and low quality */
NV_OF_PERF_LEVEL_MAX
};
/** @brief The NVIDIA optical flow hardware generates flow vectors at granularity gridSize, which can be queried via the function getGridSize().
* The upSampler() helper function converts the hardware-generated flow vectors to a dense representation (one flow vector for each pixel)
* using the nearest-neighbour upsampling method (a conceptual sketch of this upsampling follows the class declaration below).
@param flow Buffer of type CV_16SC2 containing the flow vectors generated by calc().
@param width Width of the input image in pixels for which these flow vectors were generated.
@param height Height of the input image in pixels for which these flow vectors were generated.
@param gridSize Granularity of the optical flow vectors returned by the calc() function. Can be queried using getGridSize().
@param upsampledFlow Buffer of type CV_32FC2 containing the upsampled flow vectors, one flow vector per pixel, in the pitch-linear layout.
*/
CV_WRAP virtual void upSampler(InputArray flow, int width, int height,
int gridSize, InputOutputArray upsampledFlow) = 0;
/** @brief Instantiate NVIDIA Optical Flow
@param width Width of the input image in pixels.
@param height Height of the input image in pixels.
@param perfPreset Optional parameter. Refer to the [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about presets.
Defaults to NV_OF_PERF_LEVEL_SLOW.
@param enableTemporalHints Optional parameter. Flag to enable temporal hints. When set to true, the hardware uses the flow vectors
generated in the previous call to calc() as internal hints for the current call to calc().
Useful when computing flow vectors between successive video frames. Defaults to false.
@param enableExternalHints Optional parameter. Flag to enable passing an external hints buffer to calc(). Defaults to false.
@param enableCostBuffer Optional parameter. Flag to enable the cost buffer output from calc(). Defaults to false.
@param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0.
*/
CV_WRAP static Ptr<NvidiaOpticalFlow_1_0> create(
int width,
int height,
cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL perfPreset
= cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
bool enableTemporalHints = false,
bool enableExternalHints = false,
bool enableCostBuffer = false,
int gpuId = 0);
};
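As referenced in the upSampler() documentation above, the following is a conceptual CPU-side sketch of what the helper produces, not the library implementation: each gridSize x gridSize block's vector is replicated to every pixel it covers, and the S10.5 components are scaled to float. The function name is illustrative.

#include <opencv2/core.hpp>

// Hedged sketch of nearest-neighbour upsampling of block-based flow (CV_16SC2, S10.5)
// into dense per-pixel float flow (CV_32FC2). Illustrative only.
static void upsampleFlowSketch(const cv::Mat& blockFlow, int width, int height,
                               int gridSize, cv::Mat& upsampledFlow)
{
    CV_Assert(blockFlow.type() == CV_16SC2 && gridSize > 0);
    upsampledFlow.create(height, width, CV_32FC2);
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            // Nearest-neighbour: every pixel in a block reuses that block's flow vector.
            const cv::Vec2s v = blockFlow.at<cv::Vec2s>(y / gridSize, x / gridSize);
            upsampledFlow.at<cv::Vec2f>(y, x) = cv::Vec2f(v[0] / 32.0f, v[1] / 32.0f);
        }
    }
}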
//! @}
}} // namespace cv { namespace cuda {
@@ -326,4 +326,57 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1,
}
}
//////////////////////////////////////////////////////
// NvidiaOpticalFlow_1_0
PERF_TEST_P(ImagePair, NvidiaOpticalFlow_1_0,
Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
{
declare.time(10);
const cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
const int width = frame0.size().width;
const int height = frame0.size().height;
const bool enableTemporalHints = false;
const bool enableExternalHints = false;
const bool enableCostBuffer = false;
const int gpuid = 0;
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_frame0(frame0);
const cv::cuda::GpuMat d_frame1(frame1);
cv::cuda::GpuMat d_flow;
cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
try
{
d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST,
enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
}
catch (const cv::Exception& e)
{
if(e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
throw SkipTestException("Current configuration is not supported");
throw;
}
TEST_CYCLE() d_nvof->calc(d_frame0, d_frame1, d_flow);
cv::cuda::GpuMat flow[2];
cv::cuda::split(d_flow, flow);
cv::cuda::GpuMat u = flow[0];
cv::cuda::GpuMat v = flow[1];
CUDA_SANITY_CHECK(u, 1e-10);
CUDA_SANITY_CHECK(v, 1e-10);
}
}
}} // namespace
#include <unordered_map>
#include <iostream>
#include <fstream>
#include <iomanip>
#include "opencv2/core.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/video/tracking.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
//this function is taken from opencv/samples/gpu/optical_flow.cpp
inline bool isFlowCorrect(Point2f u)
{
return !cvIsNaN(u.x) && !cvIsNaN(u.y) && fabs(u.x) < 1e9 && fabs(u.y) < 1e9;
}
//this function is taken from opencv/samples/gpu/optical_flow.cpp
static Vec3b computeColor(float fx, float fy)
{
static bool first = true;
// relative lengths of color transitions:
// these are chosen based on perceptual similarity
// (e.g. one can distinguish more shades between red and yellow
// than between yellow and green)
const int RY = 15;
const int YG = 6;
const int GC = 4;
const int CB = 11;
const int BM = 13;
const int MR = 6;
const int NCOLS = RY + YG + GC + CB + BM + MR;
static Vec3i colorWheel[NCOLS];
if (first)
{
int k = 0;
for (int i = 0; i < RY; ++i, ++k)
colorWheel[k] = Vec3i(255, 255 * i / RY, 0);
for (int i = 0; i < YG; ++i, ++k)
colorWheel[k] = Vec3i(255 - 255 * i / YG, 255, 0);
for (int i = 0; i < GC; ++i, ++k)
colorWheel[k] = Vec3i(0, 255, 255 * i / GC);
for (int i = 0; i < CB; ++i, ++k)
colorWheel[k] = Vec3i(0, 255 - 255 * i / CB, 255);
for (int i = 0; i < BM; ++i, ++k)
colorWheel[k] = Vec3i(255 * i / BM, 0, 255);
for (int i = 0; i < MR; ++i, ++k)
colorWheel[k] = Vec3i(255, 0, 255 - 255 * i / MR);
first = false;
}
const float rad = sqrt(fx * fx + fy * fy);
const float a = atan2(-fy, -fx) / (float)CV_PI;
const float fk = (a + 1.0f) / 2.0f * (NCOLS - 1);
const int k0 = static_cast<int>(fk);
const int k1 = (k0 + 1) % NCOLS;
const float f = fk - k0;
Vec3b pix;
for (int b = 0; b < 3; b++)
{
const float col0 = colorWheel[k0][b] / 255.0f;
const float col1 = colorWheel[k1][b] / 255.0f;
float col = (1 - f) * col0 + f * col1;
if (rad <= 1)
col = 1 - rad * (1 - col); // increase saturation with radius
else
col *= .75; // out of range
pix[2 - b] = static_cast<uchar>(255.0 * col);
}
return pix;
}
//this function is taken from opencv/samples/gpu/optical_flow.cpp
static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy
, Mat& dst, float maxmotion = -1)
{
dst.create(flowx.size(), CV_8UC3);
dst.setTo(Scalar::all(0));
// determine motion range:
float maxrad = maxmotion;
if (maxmotion <= 0)
{
maxrad = 1;
for (int y = 0; y < flowx.rows; ++y)
{
for (int x = 0; x < flowx.cols; ++x)
{
Point2f u(flowx(y, x), flowy(y, x));
if (!isFlowCorrect(u))
continue;
maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
}
}
}
for (int y = 0; y < flowx.rows; ++y)
{
for (int x = 0; x < flowx.cols; ++x)
{
Point2f u(flowx(y, x), flowy(y, x));
if (isFlowCorrect(u))
dst.at<Vec3b>(y, x) = computeColor(u.x / maxrad, u.y / maxrad);
}
}
}
int main(int argc, char **argv)
{
std::unordered_map<std::string, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL> presetMap = {
{ "slow", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW },
{ "medium", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_MEDIUM },
{ "fast", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST } };
try
{
CommandLineParser cmd(argc, argv,
"{ l left | ../data/basketball1.png | specify left image }"
"{ r right | ../data/basketball2.png | specify right image }"
"{ g gpuid | 0 | cuda device index}"
"{ p preset | slow | perf preset for OF algo [ options : slow, medium, fast ]}"
"{ o output | OpenCVNvOF.flo | output flow vector file in middlebury format}"
"{ th enableTemporalHints | false | Enable temporal hints}"
"{ eh enableExternalHints | false | Enable external hints}"
"{ cb enableCostBuffer | false | Enable output cost buffer}"
"{ h help | | print help message }");
cmd.about("Nvidia's optical flow sample.");
if (cmd.has("help") || !cmd.check())
{
cmd.printMessage();
cmd.printErrors();
return 0;
}
string pathL = cmd.get<string>("left");
string pathR = cmd.get<string>("right");
string preset = cmd.get<string>("preset");
string output = cmd.get<string>("output");
bool enableExternalHints = cmd.get<bool>("enableExternalHints");
bool enableTemporalHints = cmd.get<bool>("enableTemporalHints");
bool enableCostBuffer = cmd.get<bool>("enableCostBuffer");
int gpuId = cmd.get<int>("gpuid");
if (pathL.empty()) cout << "Specify left image path\n";
if (pathR.empty()) cout << "Specify right image path\n";
if (preset.empty()) cout << "Specify perf preset for OpticalFlow algo\n";
if (pathL.empty() || pathR.empty()) return 0;
auto search = presetMap.find(preset);
if (search == presetMap.end())
{
std::cout << "Invalid preset level : " << preset << std::endl;
return 0;
}
NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL perfPreset = search->second;
Mat frameL = imread(pathL, IMREAD_GRAYSCALE);
Mat frameR = imread(pathR, IMREAD_GRAYSCALE);
if (frameL.empty()) cout << "Can't open '" << pathL << "'\n";
if (frameR.empty()) cout << "Can't open '" << pathR << "'\n";
if (frameL.empty() || frameR.empty()) return -1;
Ptr<NvidiaOpticalFlow_1_0> nvof = NvidiaOpticalFlow_1_0::create(
frameL.size().width, frameL.size().height, perfPreset,
enableTemporalHints, enableExternalHints, enableCostBuffer, gpuId);
Mat flowx, flowy, flowxy, upsampledFlowXY, image;
nvof->calc(frameL, frameR, flowxy);
nvof->upSampler(flowxy, frameL.size().width, frameL.size().height,
nvof->getGridSize(), upsampledFlowXY);
if (output.size() != 0)
{
if (!writeOpticalFlow(output, upsampledFlowXY))
cout << "Failed to save Flow Vector" << endl;
else
cout << "Flow vector saved as '" << output << "'\n";
}
Mat planes[] = { flowx, flowy };
split(upsampledFlowXY, planes);
flowx = planes[0]; flowy = planes[1];
drawOpticalFlow(flowx, flowy, image, 10);
imshow("Colorized image", image);
waitKey(0);
nvof->collectGarbage();
}
catch (const std::exception &ex)
{
std::cout << ex.what() << std::endl;
return 1;
}
return 0;
}
\ No newline at end of file
#include <iostream>
#include <fstream>
#include "opencv2/core.hpp"
#include <opencv2/core/utility.hpp>
#include "opencv2/highgui.hpp"
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/cudaarithm.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
inline bool isFlowCorrect(Point2f u)
{
return !cvIsNaN(u.x) && !cvIsNaN(u.y) && fabs(u.x) < 1e9 && fabs(u.y) < 1e9;
}
static Vec3b computeColor(float fx, float fy)
{
static bool first = true;
// relative lengths of color transitions:
// these are chosen based on perceptual similarity
// (e.g. one can distinguish more shades between red and yellow
// than between yellow and green)
const int RY = 15;
const int YG = 6;
const int GC = 4;
const int CB = 11;
const int BM = 13;
const int MR = 6;
const int NCOLS = RY + YG + GC + CB + BM + MR;
static Vec3i colorWheel[NCOLS];
if (first)
{
int k = 0;
for (int i = 0; i < RY; ++i, ++k)
colorWheel[k] = Vec3i(255, 255 * i / RY, 0);
for (int i = 0; i < YG; ++i, ++k)
colorWheel[k] = Vec3i(255 - 255 * i / YG, 255, 0);
for (int i = 0; i < GC; ++i, ++k)
colorWheel[k] = Vec3i(0, 255, 255 * i / GC);
for (int i = 0; i < CB; ++i, ++k)
colorWheel[k] = Vec3i(0, 255 - 255 * i / CB, 255);
for (int i = 0; i < BM; ++i, ++k)
colorWheel[k] = Vec3i(255 * i / BM, 0, 255);
for (int i = 0; i < MR; ++i, ++k)
colorWheel[k] = Vec3i(255, 0, 255 - 255 * i / MR);
first = false;
}
const float rad = sqrt(fx * fx + fy * fy);
const float a = atan2(-fy, -fx) / (float)CV_PI;
const float fk = (a + 1.0f) / 2.0f * (NCOLS - 1);
const int k0 = static_cast<int>(fk);
const int k1 = (k0 + 1) % NCOLS;
const float f = fk - k0;
Vec3b pix;
for (int b = 0; b < 3; b++)
{
const float col0 = colorWheel[k0][b] / 255.0f;
const float col1 = colorWheel[k1][b] / 255.0f;
float col = (1 - f) * col0 + f * col1;
if (rad <= 1)
col = 1 - rad * (1 - col); // increase saturation with radius
else
col *= .75; // out of range
pix[2 - b] = static_cast<uchar>(255.0 * col);
}
return pix;
}
static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy, Mat& dst, float maxmotion = -1)
{
dst.create(flowx.size(), CV_8UC3);
dst.setTo(Scalar::all(0));
// determine motion range:
float maxrad = maxmotion;
if (maxmotion <= 0)
{
maxrad = 1;
for (int y = 0; y < flowx.rows; ++y)
{
for (int x = 0; x < flowx.cols; ++x)
{
Point2f u(flowx(y, x), flowy(y, x));
if (!isFlowCorrect(u))
continue;
maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
}
}
}
for (int y = 0; y < flowx.rows; ++y)
{
for (int x = 0; x < flowx.cols; ++x)
{
Point2f u(flowx(y, x), flowy(y, x));
if (isFlowCorrect(u))
dst.at<Vec3b>(y, x) = computeColor(u.x / maxrad, u.y / maxrad);
}
}
}
static void showFlow(const char* name, const GpuMat& d_flow)
{
GpuMat planes[2];
cuda::split(d_flow, planes);
Mat flowx(planes[0]);
Mat flowy(planes[1]);
Mat out;
drawOpticalFlow(flowx, flowy, out, 10);
imshow(name, out);
}
int main(int argc, const char* argv[])
{
string filename1, filename2;
if (argc < 3)
{
cerr << "Usage : " << argv[0] << " <frame0> <frame1>" << endl;
filename1 = "../data/basketball1.png";
filename2 = "../data/basketball2.png";
}
else
{
filename1 = argv[1];
filename2 = argv[2];
}
Mat frame0 = imread(filename1, IMREAD_GRAYSCALE);
Mat frame1 = imread(filename2, IMREAD_GRAYSCALE);
if (frame0.empty())
{
cerr << "Can't open image [" << filename1 << "]" << endl;
return -1;
}
if (frame1.empty())
{
cerr << "Can't open image [" << filename2 << "]" << endl;
return -1;
}
if (frame1.size() != frame0.size())
{
cerr << "Images should be of equal sizes" << endl;
return -1;
}
GpuMat d_frame0(frame0);
GpuMat d_frame1(frame1);
GpuMat d_flow(frame0.size(), CV_32FC2), d_flowxy;
Ptr<cuda::BroxOpticalFlow> brox = cuda::BroxOpticalFlow::create(0.197f, 50.0f, 0.8f, 10, 77, 10);
Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();
Ptr<cuda::NvidiaOpticalFlow_1_0> nvof = cuda::NvidiaOpticalFlow_1_0::create(
frame0.size().width, frame0.size().height, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST);
{
GpuMat d_frame0f;
GpuMat d_frame1f;
d_frame0.convertTo(d_frame0f, CV_32F, 1.0 / 255.0);
d_frame1.convertTo(d_frame1f, CV_32F, 1.0 / 255.0);
const int64 start = getTickCount();
brox->calc(d_frame0f, d_frame1f, d_flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "Brox : " << timeSec << " sec" << endl;
showFlow("Brox", d_flow);
}
{
const int64 start = getTickCount();
lk->calc(d_frame0, d_frame1, d_flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "LK : " << timeSec << " sec" << endl;
showFlow("LK", d_flow);
}
{
const int64 start = getTickCount();
farn->calc(d_frame0, d_frame1, d_flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "Farn : " << timeSec << " sec" << endl;
showFlow("Farn", d_flow);
}
{
const int64 start = getTickCount();
tvl1->calc(d_frame0, d_frame1, d_flow);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "TVL1 : " << timeSec << " sec" << endl;
showFlow("TVL1", d_flow);
}
{
//The timing displayed below includes the time taken to copy the input buffers to the OF CUDA input buffers
//and to copy the output buffers from the OF CUDA output buffer to the output buffer.
//Hence it is expected to be more than what is displayed in the NVIDIA Optical Flow SDK documentation.
const int64 start = getTickCount();
nvof->calc(d_frame0, d_frame1, d_flowxy);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "NVIDIAOpticalFlow : " << timeSec << " sec" << endl;
nvof->upSampler(d_flowxy, frame0.size().width, frame0.size().height,
nvof->getGridSize(), d_flow);
showFlow("NVIDIAOpticalFlow", d_flow);
}
imshow("Frame 0", frame0);
imshow("Frame 1", frame1);
waitKey();
return 0;
}
@@ -409,6 +409,106 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
ALL_DEVICES,
testing::Values(Gamma(0.0), Gamma(1.0))));
//////////////////////////////////////////////////////
// NvidiaOpticalFlow_1_0
struct NvidiaOpticalFlow_1_0 : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
};
CUDA_TEST_P(NvidiaOpticalFlow_1_0, Regression)
{
cv::Mat frame0 = readImage("opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
const int width = frame0.size().width;
const int height = frame0.size().height;
const bool enableTemporalHints = false;
const bool enableExternalHints = false;
const bool enableCostBuffer = false;
const int gpuid = 0;
cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
try
{
d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
}
catch (const cv::Exception& e)
{
if (e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
throw SkipTestException("Current configuration is not supported");
throw;
}
const int gridSize = d_nvof->getGridSize();
Mat flow, upsampledFlow;
d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
d_nvof->upSampler(flow, width, height, gridSize, upsampledFlow);
std::string fname(cvtest::TS::ptr()->get_data_path());
fname += "opticalflow/nvofGolden.flo";
cv::Mat golden = cv::readOpticalFlow(fname.c_str());
ASSERT_FALSE(golden.empty());
EXPECT_MAT_SIMILAR(golden, upsampledFlow, 1e-10);
}
CUDA_TEST_P(NvidiaOpticalFlow_1_0, OpticalFlowNan)
{
cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
cv::Mat r_frame0, r_frame1;
const int width = frame0.size().width;
const int height = frame0.size().height;
const bool enableTemporalHints = false;
const bool enableExternalHints = false;
const bool enableCostBuffer = false;
const int gpuid = 0;
cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
try
{
d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
}
catch (const cv::Exception& e)
{
if (e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
throw SkipTestException("Current configuration is not supported");
throw;
}
Mat flow, flowx, flowy;
d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
Mat planes[] = { flowx, flowy };
split(flow, planes);
flowx = planes[0]; flowy = planes[1];
EXPECT_TRUE(cv::checkRange(flowx));
EXPECT_TRUE(cv::checkRange(flowy));
};
INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, NvidiaOpticalFlow_1_0, ALL_DEVICES);
}} // namespace
#endif // HAVE_CUDA