Merge pull request #2022 from asmorkalov:ocv_dynamic_cuda

f22d9a23 · Roman Donchenko · OpenCV Buildbot · f9268d34 · bc72f4d2 · f22d9a23
Commit f22d9a23 authored Dec 23, 2013 by Roman Donchenko Committed by OpenCV Buildbot Dec 23, 2013
27 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,6 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi
 # OpenCV build options
 # ===================================================
+OCV_OPTION(ENABLE_DYNAMIC_CUDA        "Enabled dynamic CUDA linkage"                             ON   IF ANDROID )
 OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers"                                  ON   IF (NOT IOS) )
 OCV_OPTION(ENABLE_SOLUTION_FOLDERS    "Solution folder in Visual Studio or in other IDEs"        (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") )
 OCV_OPTION(ENABLE_PROFILING           "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF  IF CMAKE_COMPILER_IS_GNUCXX )
@@ -472,7 +473,11 @@ endif()
 # ----------------------------------------------------------------------------
 # Add CUDA libraries (needed for apps/tools, samples)
 # ----------------------------------------------------------------------------
-if(HAVE_CUDA)
+if(NOT HAVE_CUDA)
+  set(ENABLE_DYNAMIC_CUDA OFF)
+endif()
+if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
  set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
  if(HAVE_CUBLAS)
    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cublas_LIBRARY})
@@ -481,6 +486,7 @@ if(HAVE_CUDA)
    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY})
  endif()
 endif()
 # ----------------------------------------------------------------------------
 # Solution folders:
 # ----------------------------------------------------------------------------

--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -488,7 +488,7 @@ macro(ocv_glob_module_sources)
  file(GLOB lib_cuda_srcs "src/cuda/*.cu")
  set(cuda_objs "")
  set(lib_cuda_hdrs "")
-  if(HAVE_CUDA AND lib_cuda_srcs)
+  if(HAVE_CUDA)
    ocv_include_directories(${CUDA_INCLUDE_DIRS})
    file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
@@ -537,9 +537,6 @@ macro(ocv_create_module)
    target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS})
    target_link_libraries(${the_module} LINK_INTERFACE_LIBRARIES ${OPENCV_MODULE_${the_module}_DEPS})
    target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
-    if (HAVE_CUDA)
-      target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
-    endif()
  endif()
  add_dependencies(opencv_modules ${the_module})

--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
 set(the_description "The Core Functionality")
-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
+  ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+else()
+  ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+endif()
+ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/dynamicuda/include/" ${ZLIB_INCLUDE_DIR})
 if(HAVE_WINRT)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
+if(ENABLE_DYNAMIC_CUDA)
+  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+else()
+  if (HAVE_CUDA)
+    add_definitions(-DUSE_CUDA)
+  endif()
+endif()
 if(HAVE_CUDA)
  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
@@ -14,11 +28,26 @@ endif()
 file(GLOB lib_cuda_hdrs        "include/opencv2/${name}/cuda/*.hpp"        "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
+if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
+  file(GLOB lib_cuda           "../dynamicuda/src/cuda/*.cu*")
+  ocv_include_directories(${CUDA_INCLUDE_DIRS})
+  ocv_cuda_compile(cuda_objs ${lib_cuda})
+endif()
 source_group("Cuda Headers"         FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
-ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
-                        HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+  source_group("Src\\Cuda"      FILES ${lib_cuda} ${lib_cuda_hdrs})
+endif()
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+else()
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} ${cuda_objs}
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+endif()
 ocv_create_module()
 ocv_add_precompiled_headers(${the_module})

--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -44,7 +44,7 @@
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA)
    #include <cuda_runtime.h>
    #include <npp.h>
@@ -60,493 +60,232 @@
    #endif
 #endif
+#ifdef DYNAMIC_CUDA_SUPPORT
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#endif
+#ifdef ANDROID
+# include <android/log.h>
+# define LOG_TAG "OpenCV::CUDA"
+# define LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__))
+# define LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__))
+# define LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__))
+#endif
 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
-#ifndef HAVE_CUDA
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
-#else // HAVE_CUDA
+#include "opencv2/dynamicuda/dynamicuda.hpp"
-namespace
-{
-#define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
-#define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
-    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
-    {
-        if (cudaSuccess != err)
-            cv::gpu::error(cudaGetErrorString(err), file, line, func);
-    }
-    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
-    {
-        if (err < 0)
-        {
-            std::ostringstream msg;
-            msg << "NPP API Call Error: " << err;
-            cv::gpu::error(msg.str().c_str(), file, line, func);
-        }
-    }
-}
-#endif // HAVE_CUDA
-//////////////////////////////// Initialization & Info ////////////////////////
-#ifndef HAVE_CUDA
-int cv::gpu::getCudaEnabledDeviceCount() { return 0; }
+#ifdef DYNAMIC_CUDA_SUPPORT
-void cv::gpu::setDevice(int) { throw_nogpu; }
+typedef GpuFuncTable* (*GpuFactoryType)();
-int cv::gpu::getDevice() { throw_nogpu; return 0; }
+typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)();
-void cv::gpu::resetDevice() { throw_nogpu; }
+static GpuFactoryType gpuFactory = NULL;
+static DeviceInfoFactoryType deviceInfoFactory = NULL;
-bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; }
+# if defined(__linux__) || defined(__APPLE__) || defined (ANDROID)
+#  ifdef ANDROID
-bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; }
+static const std::string getCudaSupportLibName()
-bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; }
-bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; }
-size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; }
-void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; }
-size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; }
-size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; }
-bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return false; }
-bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; }
-void cv::gpu::DeviceInfo::query() { throw_nogpu; }
-void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; }
-void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; }
-#else // HAVE_CUDA
-int cv::gpu::getCudaEnabledDeviceCount()
 {
-    int count;
+    Dl_info dl_info;
-    cudaError_t error = cudaGetDeviceCount( &count );
+    if(0 != dladdr((void *)getCudaSupportLibName, &dl_info))
-    if (error == cudaErrorInsufficientDriver)
-        return -1;
-    if (error == cudaErrorNoDevice)
-        return 0;
-    cudaSafeCall( error );
-    return count;
-}
-void cv::gpu::setDevice(int device)
-{
-    cudaSafeCall( cudaSetDevice( device ) );
-}
-int cv::gpu::getDevice()
-{
-    int device;
-    cudaSafeCall( cudaGetDevice( &device ) );
-    return device;
-}
-void cv::gpu::resetDevice()
-{
-    cudaSafeCall( cudaDeviceReset() );
-}
-namespace
-{
-    class CudaArch
    {
-    public:
+        LOGD("Library name: %s", dl_info.dli_fname);
-        CudaArch();
+        LOGD("Library base address: %p", dl_info.dli_fbase);
-        bool builtWith(FeatureSet feature_set) const;
-        bool hasPtx(int major, int minor) const;
-        bool hasBin(int major, int minor) const;
-        bool hasEqualOrLessPtx(int major, int minor) const;
-        bool hasEqualOrGreaterPtx(int major, int minor) const;
-        bool hasEqualOrGreaterBin(int major, int minor) const;
-    private:
+        const char* libName=dl_info.dli_fname;
-        static void fromStr(const string& set_as_str, vector<int>& arr);
+        while( ((*libName)=='/') || ((*libName)=='.') )
+        libName++;
-        vector<int> bin;
+        char lineBuf[2048];
-        vector<int> ptx;
+        FILE* file = fopen("/proc/self/smaps", "rt");
-        vector<int> features;
-    };
-    const CudaArch cudaArch;
+        if(file)
+        {
-    CudaArch::CudaArch()
+            while (fgets(lineBuf, sizeof lineBuf, file) != NULL)
-    {
+            {
-        fromStr(CUDA_ARCH_BIN, bin);
+                //verify that line ends with library name
-        fromStr(CUDA_ARCH_PTX, ptx);
+                int lineLength = strlen(lineBuf);
-        fromStr(CUDA_ARCH_FEATURES, features);
+                int libNameLength = strlen(libName);
-    }
-    bool CudaArch::builtWith(FeatureSet feature_set) const
-    {
-        return !features.empty() && (features.back() >= feature_set);
-    }
-    bool CudaArch::hasPtx(int major, int minor) const
-    {
-        return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
-    }
-    bool CudaArch::hasBin(int major, int minor) const
-    {
-        return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
-    }
-    bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
+                //trim end
-    {
+                for(int i = lineLength - 1; i >= 0 && isspace(lineBuf[i]); --i)
-        return !ptx.empty() && (ptx.front() <= major * 10 + minor);
+                {
-    }
+                    lineBuf[i] = 0;
+                    --lineLength;
+                }
-    bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
+                if (0 != strncmp(lineBuf + lineLength - libNameLength, libName, libNameLength))
-    {
+                {
-        return !ptx.empty() && (ptx.back() >= major * 10 + minor);
+                //the line does not contain the library name
-    }
+                    continue;
+                }
-    bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
+                //extract path from smaps line
-    {
+                char* pathBegin = strchr(lineBuf, '/');
-        return !bin.empty() && (bin.back() >= major * 10 + minor);
+                if (0 == pathBegin)
-    }
+                {
+                    LOGE("Strange error: could not find path beginning in lin \"%s\"", lineBuf);
+                    continue;
+                }
-    void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
+                char* pathEnd = strrchr(pathBegin, '/');
-    {
+                pathEnd[1] = 0;
-        if (set_as_str.find_first_not_of(" ") == string::npos)
-            return;
-        istringstream stream(set_as_str);
+                LOGD("Libraries folder found: %s", pathBegin);
-        int cur_value;
-        while (!stream.eof())
+                fclose(file);
+                return std::string(pathBegin) + "/libopencv_core_cuda.so";
+            }
+            fclose(file);
+            LOGE("Could not find library path");
+        }
+        else
        {
-            stream >> cur_value;
+            LOGE("Could not read /proc/self/smaps");
-            arr.push_back(cur_value);
        }
-        sort(arr.begin(), arr.end());
    }
-}
-bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
-{
-    return cudaArch.builtWith(feature_set);
-}
-bool cv::gpu::TargetArchs::has(int major, int minor)
-{
-    return hasPtx(major, minor) || hasBin(major, minor);
-}
-bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
-{
-    return cudaArch.hasPtx(major, minor);
-}
-bool cv::gpu::TargetArchs::hasBin(int major, int minor)
-{
-    return cudaArch.hasBin(major, minor);
-}
-bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
-{
-    return cudaArch.hasEqualOrLessPtx(major, minor);
-}
-bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
-{
-    return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
-}
-bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
-{
-    return cudaArch.hasEqualOrGreaterPtx(major, minor);
-}
-bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
-{
-    return cudaArch.hasEqualOrGreaterBin(major, minor);
-}
-bool cv::gpu::deviceSupports(FeatureSet feature_set)
-{
-    static int versions[] =
-    {
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-    };
-    static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
-    const int devId = getDevice();
-    int version;
-    if (devId < cache_size && versions[devId] >= 0)
-        version = versions[devId];
    else
    {
-        DeviceInfo dev(devId);
+        LOGE("Could not get library name and base address");
-        version = dev.majorVersion() * 10 + dev.minorVersion();
-        if (devId < cache_size)
-            versions[devId] = version;
    }
-    return TargetArchs::builtWith(feature_set) && (version >= feature_set);
+    return string();
 }
-namespace
+#  else
+static const std::string getCudaSupportLibName()
 {
-    class DeviceProps
+    return "libopencv_core_cuda.so";
-    {
+}
-    public:
+#  endif
-        DeviceProps();
-        ~DeviceProps();
-        cudaDeviceProp* get(int devID);
-    private:
-        std::vector<cudaDeviceProp*> props_;
-    };
-    DeviceProps::DeviceProps()
+static bool loadCudaSupportLib()
-    {
+{
-        props_.resize(10, 0);
+    void* handle;
-    }
+    const std::string name = getCudaSupportLibName();
+    handle = dlopen(name.c_str(), RTLD_LAZY);
+    if (!handle)
+        return false;
-    DeviceProps::~DeviceProps()
+    deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory");
+    if (!deviceInfoFactory)
    {
-        for (size_t i = 0; i < props_.size(); ++i)
+        dlclose(handle);
-        {
+        return false;
-            if (props_[i])
-                delete props_[i];
-        }
-        props_.clear();
    }
-    cudaDeviceProp* DeviceProps::get(int devID)
+    gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory");
+    if (!gpuFactory)
    {
-        if (devID >= (int) props_.size())
+        dlclose(handle);
-            props_.resize(devID + 5, 0);
+        return false;
-        if (!props_[devID])
-        {
-            props_[devID] = new cudaDeviceProp;
-            cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) );
-        }
-        return props_[devID];
    }
-    DeviceProps deviceProps;
+    return true;
 }
-size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const
+# else
-{
+#  error "Dynamic CUDA support is not implemented for this platform!"
-    return deviceProps.get(device_id_)->sharedMemPerBlock;
+# endif
-}
-void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
+#endif
-{
-    int prevDeviceID = getDevice();
-    if (prevDeviceID != device_id_)
-        setDevice(device_id_);
-    cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
-    if (prevDeviceID != device_id_)
+static GpuFuncTable* gpuFuncTable()
-        setDevice(prevDeviceID);
+{
+#ifdef DYNAMIC_CUDA_SUPPORT
+   static EmptyFuncTable stub;
+   static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub;
+   static GpuFuncTable *funcTable = libFuncTable ? libFuncTable : (GpuFuncTable*)&stub;
+#else
+# ifdef USE_CUDA
+   static CudaFuncTable impl;
+   static GpuFuncTable* funcTable = &impl;
+#else
+   static EmptyFuncTable stub;
+   static GpuFuncTable* funcTable = &stub;
+#endif
+#endif
+   return funcTable;
+}
+static DeviceInfoFuncTable* deviceInfoFuncTable()
+{
+#ifdef DYNAMIC_CUDA_SUPPORT
+   static EmptyDeviceInfoFuncTable stub;
+   static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub;
+   static DeviceInfoFuncTable* funcTable = libFuncTable ? libFuncTable : (DeviceInfoFuncTable*)&stub;
+#else
+# ifdef USE_CUDA
+   static CudaDeviceInfoFuncTable impl;
+   static DeviceInfoFuncTable* funcTable = &impl;
+#else
+   static EmptyDeviceInfoFuncTable stub;
+   static DeviceInfoFuncTable* funcTable = &stub;
+#endif
+#endif
+   return funcTable;
 }
-size_t cv::gpu::DeviceInfo::freeMemory() const
-{
-    size_t _totalMemory, _freeMemory;
-    queryMemory(_totalMemory, _freeMemory);
-    return _freeMemory;
-}
-size_t cv::gpu::DeviceInfo::totalMemory() const
+//////////////////////////////// Initialization & Info ////////////////////////
-{
-    size_t _totalMemory, _freeMemory;
-    queryMemory(_totalMemory, _freeMemory);
-    return _totalMemory;
-}
-bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const
+int cv::gpu::getCudaEnabledDeviceCount() { return deviceInfoFuncTable()->getCudaEnabledDeviceCount(); }
-{
-    int version = majorVersion() * 10 + minorVersion();
-    return version >= feature_set;
-}
-bool cv::gpu::DeviceInfo::isCompatible() const
+void cv::gpu::setDevice(int device) { deviceInfoFuncTable()->setDevice(device); }
-{
+int cv::gpu::getDevice() { return deviceInfoFuncTable()->getDevice(); }
-    // Check PTX compatibility
-    if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
-        return true;
-    // Check BIN compatibility
+void cv::gpu::resetDevice() { deviceInfoFuncTable()->resetDevice(); }
-    for (int i = minorVersion(); i >= 0; --i)
-        if (TargetArchs::hasBin(majorVersion(), i))
-            return true;
-    return false;
+bool cv::gpu::deviceSupports(FeatureSet feature_set) { return deviceInfoFuncTable()->deviceSupports(feature_set); }
-}
-void cv::gpu::DeviceInfo::query()
+bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return deviceInfoFuncTable()->builtWith(feature_set); }
-{
+bool cv::gpu::TargetArchs::has(int major, int minor) { return deviceInfoFuncTable()->has(major, minor); }
-    const cudaDeviceProp* prop = deviceProps.get(device_id_);
+bool cv::gpu::TargetArchs::hasPtx(int major, int minor) {  return deviceInfoFuncTable()->hasPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return deviceInfoFuncTable()->hasBin(major, minor);  }
+bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrLessPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreater(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterBin(major, minor); }
-    name_ = prop->name;
+size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); }
-    multi_processor_count_ = prop->multiProcessorCount;
+void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); }
-    majorVersion_ = prop->major;
+size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); }
-    minorVersion_ = prop->minor;
+size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); }
-}
+bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); }
+bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); }
-namespace
+void cv::gpu::DeviceInfo::query()
 {
-    int convertSMVer2Cores(int major, int minor)
+    deviceInfoFuncTable()->query();
-    {
+    name_ = deviceInfoFuncTable()->name();
-        // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+    multi_processor_count_ = deviceInfoFuncTable()->multiProcessorCount();
-        typedef struct {
+    majorVersion_ = deviceInfoFuncTable()->majorVersion();
-            int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
+    minorVersion_ = deviceInfoFuncTable()->minorVersion();
-            int Cores;
-        } SMtoCores;
-        SMtoCores gpuArchCoresPerSM[] =  { { 0x10,  8 }, { 0x11,  8 }, { 0x12,  8 }, { 0x13,  8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 }  };
-        int index = 0;
-        while (gpuArchCoresPerSM[index].SM != -1)
-        {
-            if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
-                return gpuArchCoresPerSM[index].Cores;
-            index++;
-        }
-        return -1;
-    }
 }
-void cv::gpu::printCudaDeviceInfo(int device)
+void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); }
-{
+void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); }
-    int count = getCudaEnabledDeviceCount();
-    bool valid = (device >= 0) && (device < count);
-    int beg = valid ? device   : 0;
-    int end = valid ? device+1 : count;
-    printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
-    printf("Device count: %d\n", count);
-    int driverVersion = 0, runtimeVersion = 0;
-    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
-    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
-    const char *computeMode[] = {
+namespace cv { namespace gpu
-        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
-        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
-        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
-        "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
-        "Unknown",
-        NULL
-    };
-    for(int dev = beg; dev < end; ++dev)
-    {
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
-        printf("\nDevice %d: \"%s\"\n", dev, prop.name);
-        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
-        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
-        printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
-        int cores = convertSMVer2Cores(prop.major, prop.minor);
-        if (cores > 0)
-            printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);
-        printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);
-        printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
-            prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
-            prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
-        printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
-            prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
-            prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
-        printf("  Total amount of constant memory:               %u bytes\n", (int)prop.totalConstMem);
-        printf("  Total amount of shared memory per block:       %u bytes\n", (int)prop.sharedMemPerBlock);
-        printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
-        printf("  Warp size:                                     %d\n", prop.warpSize);
-        printf("  Maximum number of threads per block:           %d\n", prop.maxThreadsPerBlock);
-        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
-        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1],  prop.maxGridSize[2]);
-        printf("  Maximum memory pitch:                          %u bytes\n", (int)prop.memPitch);
-        printf("  Texture alignment:                             %u bytes\n", (int)prop.textureAlignment);
-        printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
-        printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
-        printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
-        printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");
-        printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
-        printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
-        printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
-        printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
-        printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
-        printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID );
-        printf("  Compute Mode:\n");
-        printf("      %s \n", computeMode[prop.computeMode]);
-    }
-    printf("\n");
-    printf("deviceQuery, CUDA Driver = CUDART");
-    printf(", CUDA Driver Version  = %d.%d", driverVersion / 1000, driverVersion % 100);
-    printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100);
-    printf(", NumDevs = %d\n\n", count);
-    fflush(stdout);
-}
-void cv::gpu::printShortCudaDeviceInfo(int device)
 {
-    int count = getCudaEnabledDeviceCount();
+    CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t);
-    bool valid = (device >= 0) && (device < count);
+    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&);
+    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, cudaStream_t = 0);
-    int beg = valid ? device   : 0;
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, cudaStream_t);
-    int end = valid ? device+1 : count;
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t);
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar);
-    int driverVersion = 0, runtimeVersion = 0;
+    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&);
-    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
+}}
-    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
-    for(int dev = beg; dev < end; ++dev)
-    {
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
-        const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
-        printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
-        printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);
-        int cores = convertSMVer2Cores(prop.major, prop.minor);
-        if (cores > 0)
-            printf(", %d cores", cores * prop.multiProcessorCount);
-        printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
-    }
-    fflush(stdout);
-}
-#endif // HAVE_CUDA
 //////////////////////////////// GpuMat ///////////////////////////////
@@ -830,601 +569,6 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat)
    return mat = GpuMat(rows, cols, type);
 }
-namespace
-{
-    class GpuFuncTable
-    {
-    public:
-        virtual ~GpuFuncTable() {}
-        virtual void copy(const Mat& src, GpuMat& dst) const = 0;
-        virtual void copy(const GpuMat& src, Mat& dst) const = 0;
-        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
-        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
-        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
-        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
-        virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;
-        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
-        virtual void free(void* devPtr) const = 0;
-    };
-}
-#ifndef HAVE_CUDA
-namespace
-{
-    class EmptyFuncTable : public GpuFuncTable
-    {
-    public:
-        void copy(const Mat&, GpuMat&) const { throw_nogpu; }
-        void copy(const GpuMat&, Mat&) const { throw_nogpu; }
-        void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
-        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
-        void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
-        void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; }
-        void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; }
-        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
-        void free(void*) const {}
-    };
-    const GpuFuncTable* gpuFuncTable()
-    {
-        static EmptyFuncTable empty;
-        return &empty;
-    }
-}
-#else // HAVE_CUDA
-namespace cv { namespace gpu { namespace device
-{
-    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
-    template <typename T>
-    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);
-    template <typename T>
-    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
-}}}
-namespace
-{
-    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
-    {
-        Scalar_<T> sf = s;
-        cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
-    }
-    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
-    {
-        Scalar_<T> sf = s;
-        cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
-    }
-}
-namespace cv { namespace gpu
-{
-    CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*);
-    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&);
-    CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*);
-    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*);
-    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*);
-    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar);
-    CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&);
-}}
-namespace cv { namespace gpu
-{
-    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
-    {
-        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
-        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
-        cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
-    }
-    void convertTo(const GpuMat& src, GpuMat& dst)
-    {
-        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
-    }
-    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
-    {
-        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
-    }
-    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
-        static const caller_t callers[] =
-        {
-            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
-            kernelSetCaller<float>, kernelSetCaller<double>
-        };
-        callers[src.depth()](src, s, stream);
-    }
-    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
-        static const caller_t callers[] =
-        {
-            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
-            kernelSetCaller<float>, kernelSetCaller<double>
-        };
-        callers[src.depth()](src, s, mask, stream);
-    }
-    void setTo(GpuMat& src, Scalar s)
-    {
-        setTo(src, s, 0);
-    }
-    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
-    {
-        setTo(src, s, mask, 0);
-    }
-}}
-namespace
-{
-    template<int n> struct NPPTypeTraits;
-    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };
-    template<> struct NPPTypeTraits<CV_8S>  { typedef Npp8s npp_type; };
-    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
-    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
-    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
-    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
-    template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
-    //////////////////////////////////////////////////////////////////////////
-    // Convert
-    template<int SDEPTH, int DDEPTH> struct NppConvertFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
-    };
-    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
-    {
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
-    };
-    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-        static void call(const GpuMat& src, GpuMat& dst)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
-    {
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-        static void call(const GpuMat& src, GpuMat& dst)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    //////////////////////////////////////////////////////////////////////////
-    // Set
-    template<int SDEPTH, int SCN> struct NppSetFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<int SCN> struct NppSetFunc<CV_8S, SCN>
-    {
-        typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<> struct NppSetFunc<CV_8S, 1>
-    {
-        typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        static void call(GpuMat& src, Scalar s)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            Scalar_<src_t> nppS = s;
-            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        static void call(GpuMat& src, Scalar s)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            Scalar_<src_t> nppS = s;
-            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int SDEPTH, int SCN> struct NppSetMaskFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
-    };
-    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
-    };
-    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        static void call(GpuMat& src, Scalar s, const GpuMat& mask)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            Scalar_<src_t> nppS = s;
-            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        static void call(GpuMat& src, Scalar s, const GpuMat& mask)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            Scalar_<src_t> nppS = s;
-            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    //////////////////////////////////////////////////////////////////////////
-    // CopyMasked
-    template<int SDEPTH> struct NppCopyMaskedFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
-    };
-    template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template <typename T> static inline bool isAligned(const T* ptr, size_t size)
-    {
-        return reinterpret_cast<size_t>(ptr) % size == 0;
-    }
-    //////////////////////////////////////////////////////////////////////////
-    // CudaFuncTable
-    class CudaFuncTable : public GpuFuncTable
-    {
-    public:
-        void copy(const Mat& src, GpuMat& dst) const
-        {
-            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
-        }
-        void copy(const GpuMat& src, Mat& dst) const
-        {
-            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
-        }
-        void copy(const GpuMat& src, GpuMat& dst) const
-        {
-            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
-        }
-        void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
-        {
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(src.size() == dst.size() && src.type() == dst.type());
-            CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
-            if (src.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-            typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
-            static const func_t funcs[7][4] =
-            {
-                /*  8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
-                /*  8S */ {cv::gpu::copyWithMask                         , cv::gpu::copyWithMask, cv::gpu::copyWithMask                         , cv::gpu::copyWithMask                         },
-                /* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
-                /* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
-                /* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
-                /* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
-                /* 64F */ {cv::gpu::copyWithMask                         , cv::gpu::copyWithMask, cv::gpu::copyWithMask                         , cv::gpu::copyWithMask                         }
-            };
-            const func_t func =  mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask;
-            func(src, dst, mask, 0);
-        }
-        void convert(const GpuMat& src, GpuMat& dst) const
-        {
-            typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
-            static const func_t funcs[7][7][4] =
-            {
-                {
-                    /*  8U ->  8U */ {0, 0, 0, 0},
-                    /*  8U ->  8S */ {cv::gpu::convertTo                                , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /*  8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::call},
-                    /*  8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::call},
-                    /*  8U -> 32S */ {cv::gpu::convertTo                                , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /*  8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /*  8U -> 64F */ {cv::gpu::convertTo                                , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                }
-                },
-                {
-                    /*  8S ->  8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /*  8S ->  8S */ {0,0,0,0},
-                    /*  8S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /*  8S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /*  8S -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /*  8S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /*  8S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}
-                },
-                {
-                    /* 16U ->  8U */ {NppCvt<CV_16U, CV_8U , nppiConvert_16u8u_C1R >::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::call},
-                    /* 16U ->  8S */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /* 16U -> 16U */ {0,0,0,0},
-                    /* 16U -> 16S */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /* 16U -> 64F */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                }
-                },
-                {
-                    /* 16S ->  8U */ {NppCvt<CV_16S, CV_8U , nppiConvert_16s8u_C1R >::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::call},
-                    /* 16S ->  8S */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /* 16S -> 16U */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /* 16S -> 16S */ {0,0,0,0},
-                    /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                },
-                    /* 16S -> 64F */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo                                }
-                },
-                {
-                    /* 32S ->  8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32S ->  8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32S -> 32S */ {0,0,0,0},
-                    /* 32S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}
-                },
-                {
-                    /* 32F ->  8U */ {NppCvt<CV_32F, CV_8U , nppiConvert_32f8u_C1R >::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32F ->  8S */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32F -> 32S */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 32F -> 32F */ {0,0,0,0},
-                    /* 32F -> 64F */ {cv::gpu::convertTo                                  , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}
-                },
-                {
-                    /* 64F ->  8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 64F ->  8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 64F -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 64F -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 64F -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 64F -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
-                    /* 64F -> 64F */ {0,0,0,0}
-                }
-            };
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(dst.depth() <= CV_64F);
-            CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
-            if (src.depth() == CV_64F || dst.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-            bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-            if (!aligned)
-            {
-                cv::gpu::convertTo(src, dst);
-                return;
-            }
-            const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
-            CV_DbgAssert(func != 0);
-            func(src, dst);
-        }
-        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
-        {
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(dst.depth() <= CV_64F);
-            if (src.depth() == CV_64F || dst.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-            cv::gpu::convertTo(src, dst, alpha, beta);
-        }
-        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
-        {
-            if (mask.empty())
-            {
-                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
-                {
-                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
-                    return;
-                }
-                if (m.depth() == CV_8U)
-                {
-                    int cn = m.channels();
-                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
-                    {
-                        int val = saturate_cast<uchar>(s[0]);
-                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
-                        return;
-                    }
-                }
-                typedef void (*func_t)(GpuMat& src, Scalar s);
-                static const func_t funcs[7][4] =
-                {
-                    {NppSet<CV_8U , 1, nppiSet_8u_C1R >::call, cv::gpu::setTo                          , cv::gpu::setTo                        , NppSet<CV_8U , 4, nppiSet_8u_C4R >::call},
-                    {cv::gpu::setTo                          , cv::gpu::setTo                          , cv::gpu::setTo                        , cv::gpu::setTo                          },
-                    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::setTo                        , NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
-                    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::setTo                        , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
-                    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::setTo                          , cv::gpu::setTo                        , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
-                    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::setTo                          , cv::gpu::setTo                        , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
-                    {cv::gpu::setTo                          , cv::gpu::setTo                          , cv::gpu::setTo                        , cv::gpu::setTo                          }
-                };
-                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
-                if (m.depth() == CV_64F)
-                {
-                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-                }
-                funcs[m.depth()][m.channels() - 1](m, s);
-            }
-            else
-            {
-                typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
-                static const func_t funcs[7][4] =
-                {
-                    {NppSetMask<CV_8U , 1, nppiSet_8u_C1MR >::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_8U , 4, nppiSet_8u_C4MR >::call},
-                    {cv::gpu::setTo                               , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo                               },
-                    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
-                    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
-                    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
-                    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
-                    {cv::gpu::setTo                               , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo                               }
-                };
-                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
-                if (m.depth() == CV_64F)
-                {
-                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-                }
-                funcs[m.depth()][m.channels() - 1](m, s, mask);
-            }
-        }
-        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
-        {
-            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
-        }
-        void free(void* devPtr) const
-        {
-            cudaFree(devPtr);
-        }
-    };
-    const GpuFuncTable* gpuFuncTable()
-    {
-        static CudaFuncTable funcTable;
-        return &funcTable;
-    }
-}
-#endif // HAVE_CUDA
 void cv::gpu::GpuMat::upload(const Mat& m)
 {
    CV_DbgAssert(!m.empty());
@@ -1492,9 +636,9 @@ void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double bet
    dst.create(size(), rtype);
    if (noScale)
-        gpuFuncTable()->convert(*psrc, dst);
+        cv::gpu::convertTo(*psrc, dst);
    else
-        gpuFuncTable()->convert(*psrc, dst, alpha, beta);
+        cv::gpu::convertTo(*psrc, dst, alpha, beta);
 }
 GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask)
@@ -1502,7 +646,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask)
    CV_Assert(mask.empty() || mask.type() == CV_8UC1);
    CV_DbgAssert(!empty());
-    gpuFuncTable()->setTo(*this, s, mask);
+    gpu::setTo(*this, s, mask);
    return *this;
 }
@@ -1562,6 +706,39 @@ void cv::gpu::GpuMat::release()
    refcount = 0;
 }
+namespace cv { namespace gpu
+{
+    // Free-function front end: each function below hands its arguments to the
+    // function table returned by gpuFuncTable().
+    // Two-argument conversion (no alpha/beta).
+    void convertTo(const GpuMat& src, GpuMat& dst)
+    {
+        gpuFuncTable()->convert(src, dst);
+    }
+    // Conversion with alpha/beta and the stream forwarded unchanged.
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
+    {
+        gpuFuncTable()->convert(src, dst, alpha, beta, stream);
+    }
+    // Unmasked fill: a default-constructed GpuMat is passed in the mask slot.
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
+    {
+        const GpuMat noMask;
+        gpuFuncTable()->setTo(src, s, noMask, stream);
+    }
+    // Masked fill on the given stream.
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+    {
+        gpuFuncTable()->setTo(src, s, mask, stream);
+    }
+    // Unmasked fill on the null stream.
+    void setTo(GpuMat& src, Scalar s)
+    {
+        // A typed null stream selects the (src, s, stream) overload explicitly.
+        cudaStream_t nullStream = 0;
+        setTo(src, s, nullStream);
+    }
+    // Masked fill on the null stream.
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        cudaStream_t nullStream = 0;
+        setTo(src, s, mask, nullStream);
+    }
+}}
 ////////////////////////////////////////////////////////////////////////
 // Error handling

--- a/modules/dynamicuda/CMakeLists.txt
+++ b/modules/dynamicuda/CMakeLists.txt
+# Module that is built as a shared library and privately requires the CUDA and
+# NPP libraries.
+# NOTE(review): the top-level option is named ENABLE_DYNAMIC_CUDA; confirm that
+# DYNAMIC_CUDA_SUPPORT is set as a CMake variable somewhere before this file runs.
+if(NOT DYNAMIC_CUDA_SUPPORT)
+  ocv_module_disable(dynamicuda)
+endif()
+set(the_description "Dynamic CUDA linkage")
+add_definitions(-DUSE_CUDA)
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
+set(OPENCV_MODULE_TYPE SHARED)
+# With BUILD_FAT_JAVA_LIB the module depends on opencv_java instead of opencv_core.
+if(BUILD_FAT_JAVA_LIB)
+  set(dynamicuda_base_module opencv_java)
+else()
+  set(dynamicuda_base_module opencv_core)
+endif()
+ocv_define_module(dynamicuda ${dynamicuda_base_module} PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
--- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
+++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
+#ifndef __GPUMAT_CUDA_HPP__
+#define __GPUMAT_CUDA_HPP__
+#ifndef HAVE_CUDA
+typedef void* cudaStream_t;
+#endif
+// Abstract interface for device queries and build-target queries. All members
+// are pure virtual; EmptyDeviceInfoFuncTable and CudaDeviceInfoFuncTable in
+// this header implement it. The declaration order is intentionally unchanged.
+class DeviceInfoFuncTable
+{
+public:
+    // cv::DeviceInfo
+    virtual size_t sharedMemPerBlock() const = 0;
+    virtual void queryMemory(size_t& total, size_t& avail) const = 0;
+    virtual size_t freeMemory() const = 0;
+    virtual size_t totalMemory() const = 0;
+    virtual bool supports(FeatureSet feature_set) const = 0;
+    virtual bool isCompatible() const = 0;
+    virtual void query() = 0;
+    virtual int deviceID() const = 0;
+    virtual std::string name() const = 0;
+    virtual int majorVersion() const = 0;
+    virtual int minorVersion() const = 0;
+    virtual int multiProcessorCount() const = 0;
+    // Process-wide device management.
+    virtual int getCudaEnabledDeviceCount() const = 0;
+    virtual void setDevice(int device) const = 0;
+    virtual int getDevice() const = 0;
+    virtual void resetDevice() const = 0;
+    virtual bool deviceSupports(FeatureSet feature_set) const = 0;
+    // cv::TargetArchs
+    virtual bool builtWith(FeatureSet feature_set) const = 0;
+    virtual bool has(int major, int minor) const = 0;
+    virtual bool hasPtx(int major, int minor) const = 0;
+    virtual bool hasBin(int major, int minor) const = 0;
+    virtual bool hasEqualOrLessPtx(int major, int minor) const = 0;
+    virtual bool hasEqualOrGreater(int major, int minor) const = 0;
+    virtual bool hasEqualOrGreaterPtx(int major, int minor) const = 0;
+    virtual bool hasEqualOrGreaterBin(int major, int minor) const = 0;
+    // Diagnostics printed for one device index.
+    virtual void printCudaDeviceInfo(int device) const = 0;
+    virtual void printShortCudaDeviceInfo(int device) const = 0;
+    virtual ~DeviceInfoFuncTable() {}
+};
+// Abstract interface for the GpuMat primitives (copy, convert, fill, device
+// memory). EmptyFuncTable in this header is the implementation that does not
+// touch CUDA. The declaration order is intentionally unchanged.
+class GpuFuncTable
+{
+public:
+    virtual ~GpuFuncTable() {}
+    // GpuMat routines: host-to-GpuMat, GpuMat-to-host, GpuMat-to-GpuMat, masked copy
+    virtual void copy(const Mat& src, GpuMat& dst) const = 0;
+    virtual void copy(const GpuMat& src, Mat& dst) const = 0;
+    virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
+    virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
+    // gpu::device::convertTo funcs
+    virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
+    virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
+    // for gpu::device::setTo funcs
+    virtual void setTo(cv::gpu::GpuMat& mat, cv::Scalar value, const cv::gpu::GpuMat& mask, cudaStream_t stream) const = 0;
+    // Device memory management
+    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
+    virtual void free(void* devPtr) const = 0;
+};
+// DeviceInfoFuncTable implementation that does not call CUDA. Most members
+// invoke the throw_nogpu macro (defined elsewhere; presumably it raises a
+// "no GPU support" error - confirm); the return statements after it only give
+// each function a value on every path.
+class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
+{
+public:
+    // cv::DeviceInfo
+    size_t sharedMemPerBlock() const                  { throw_nogpu; return 0; }
+    void queryMemory(size_t&, size_t&) const          { throw_nogpu; }
+    size_t freeMemory() const                         { throw_nogpu; return 0; }
+    size_t totalMemory() const                        { throw_nogpu; return 0; }
+    bool supports(FeatureSet) const                   { throw_nogpu; return false; }
+    bool isCompatible() const                         { throw_nogpu; return false; }
+    void query()                                      { throw_nogpu; }
+    int deviceID() const                              { throw_nogpu; return -1; }
+    std::string name() const                          { throw_nogpu; return std::string(); }
+    int majorVersion() const                          { throw_nogpu; return -1; }
+    int minorVersion() const                          { throw_nogpu; return -1; }
+    int multiProcessorCount() const                   { throw_nogpu; return -1; }
+    // The only query that answers without throw_nogpu: reports zero devices.
+    int getCudaEnabledDeviceCount() const             { return 0; }
+    void setDevice(int) const                         { throw_nogpu; }
+    int getDevice() const                             { throw_nogpu; return 0; }
+    void resetDevice() const                          { throw_nogpu; }
+    bool deviceSupports(FeatureSet) const             { throw_nogpu; return false; }
+    // cv::TargetArchs
+    bool builtWith(FeatureSet) const                  { throw_nogpu; return false; }
+    bool has(int, int) const                          { throw_nogpu; return false; }
+    bool hasPtx(int, int) const                       { throw_nogpu; return false; }
+    bool hasBin(int, int) const                       { throw_nogpu; return false; }
+    bool hasEqualOrLessPtx(int, int) const            { throw_nogpu; return false; }
+    bool hasEqualOrGreater(int, int) const            { throw_nogpu; return false; }
+    bool hasEqualOrGreaterPtx(int, int) const         { throw_nogpu; return false; }
+    bool hasEqualOrGreaterBin(int, int) const         { throw_nogpu; return false; }
+    // Both printers emit the same fixed notice and never throw.
+    void printCudaDeviceInfo(int) const               { announceNoCuda(); }
+    void printShortCudaDeviceInfo(int) const          { announceNoCuda(); }
+private:
+    static void announceNoCuda()
+    {
+        printf("The library is compiled without CUDA support\n");
+    }
+};
+// GpuFuncTable implementation that does not call CUDA: every operation invokes
+// the throw_nogpu macro (defined elsewhere) except free(), which does nothing.
+class EmptyFuncTable : public GpuFuncTable
+{
+public:
+    // Copies
+    void copy(const Mat&, GpuMat&) const                                          { throw_nogpu; }
+    void copy(const GpuMat&, Mat&) const                                          { throw_nogpu; }
+    void copy(const GpuMat&, GpuMat&) const                                       { throw_nogpu; }
+    void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const                { throw_nogpu; }
+    // Conversions
+    void convert(const GpuMat&, GpuMat&) const                                    { throw_nogpu; }
+    void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t = 0) const  { throw_nogpu; }
+    // Fill
+    void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const { throw_nogpu; }
+    // Memory
+    void mallocPitch(void**, size_t*, size_t, size_t) const                       { throw_nogpu; }
+    void free(void*) const                                                        {}
+};
+#if defined(USE_CUDA)
+#define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
+#define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
+// Backend of the cudaSafeCall macro: does nothing for cudaSuccess, otherwise
+// passes the CUDA error string and the call site to cv::gpu::error.
+inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+{
+    if (err == cudaSuccess)
+        return;
+    cv::gpu::error(cudaGetErrorString(err), file, line, func);
+}
+// Backend of the nppSafeCall macro: only negative status codes are treated as
+// failures; they are reported to cv::gpu::error with the numeric code appended.
+inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+{
+    if (err >= 0)
+        return;
+    std::ostringstream text;
+    text << "NPP API Call Error: " << err;
+    const std::string message = text.str();
+    cv::gpu::error(message.c_str(), file, line, func);
+}
+namespace cv { namespace gpu { namespace device
+{
+    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
+    template <typename T>
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);
+    template <typename T>
+    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
+}}}
+// Converts the scalar to element type T and forwards to the unmasked
+// device::set_to_gpu overload together with the channel count of src.
+template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
+{
+    Scalar_<T> typedValue = s;
+    const int channelCount = src.channels();
+    cv::gpu::device::set_to_gpu(src, typedValue.val, channelCount, stream);
+}
+// Masked counterpart: converts the scalar to element type T and forwards to
+// the device::set_to_gpu overload that takes a mask.
+template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+{
+    Scalar_<T> typedValue = s;
+    const int channelCount = src.channels();
+    cv::gpu::device::set_to_gpu(src, typedValue.val, mask, channelCount, stream);
+}
+// Maps an OpenCV depth constant to the NPP element type of the same depth.
+// The primary template is left undefined, so an unmapped depth fails to compile.
+template <int n> struct NPPTypeTraits;
+template <> struct NPPTypeTraits<CV_8U>
+{
+    typedef Npp8u npp_type;
+};
+template <> struct NPPTypeTraits<CV_8S>
+{
+    typedef Npp8s npp_type;
+};
+template <> struct NPPTypeTraits<CV_16U>
+{
+    typedef Npp16u npp_type;
+};
+template <> struct NPPTypeTraits<CV_16S>
+{
+    typedef Npp16s npp_type;
+};
+template <> struct NPPTypeTraits<CV_32S>
+{
+    typedef Npp32s npp_type;
+};
+template <> struct NPPTypeTraits<CV_32F>
+{
+    typedef Npp32f npp_type;
+};
+template <> struct NPPTypeTraits<CV_64F>
+{
+    typedef Npp64f npp_type;
+};
+//////////////////////////////////////////////////////////////////////////
+// Convert
+// Function-pointer type of an NPP conversion primitive from SDEPTH to DDEPTH.
+template<int SDEPTH, int DDEPTH> struct NppConvertFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
+};
+// CV_32F sources: the primitive takes one extra rounding-mode argument.
+template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
+{
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+    typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
+};
+// Adapter with a uniform call(src, dst) entry point around one NPP conversion
+// primitive; it waits on cudaDeviceSynchronize() before returning.
+template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+    static void call(const GpuMat& src, GpuMat& dst)
+    {
+        NppiSize roi;
+        roi.height = src.rows;
+        roi.width = src.cols;
+        const int srcPitch = static_cast<int>(src.step);
+        const int dstPitch = static_cast<int>(dst.step);
+        nppSafeCall( func(src.ptr<src_t>(), srcPitch, dst.ptr<dst_t>(), dstPitch, roi) );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+// CV_32F sources: same adapter, always passing NPP_RND_NEAR as the rounding mode.
+template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
+{
+    typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+    static void call(const GpuMat& src, GpuMat& dst)
+    {
+        NppiSize roi;
+        roi.height = src.rows;
+        roi.width = src.cols;
+        const int srcPitch = static_cast<int>(src.step);
+        const int dstPitch = static_cast<int>(dst.step);
+        nppSafeCall( func(src.ptr<Npp32f>(), srcPitch, dst.ptr<dst_t>(), dstPitch, roi, NPP_RND_NEAR) );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+//////////////////////////////////////////////////////////////////////////
+// Set
+// Function-pointer type of an NPP unmasked fill primitive; multi-channel
+// variants take the values as an array.
+template<int SDEPTH, int SCN> struct NppSetFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+// Single-channel variants take the value directly.
+template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+// CV_8S variants: the value array parameter is not const-qualified.
+template<int SCN> struct NppSetFunc<CV_8S, SCN>
+{
+    typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+template<> struct NppSetFunc<CV_8S, 1>
+{
+    typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
+};
+// Adapter with a uniform call(src, s) entry point around one NPP fill
+// primitive; it waits on cudaDeviceSynchronize() before returning.
+template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    static void call(GpuMat& src, Scalar s)
+    {
+        // Kept non-const: the CV_8S primitives take a non-const value array.
+        Scalar_<src_t> fillValue = s;
+        NppiSize roi;
+        roi.height = src.rows;
+        roi.width = src.cols;
+        const int pitch = static_cast<int>(src.step);
+        nppSafeCall( func(fillValue.val, src.ptr<src_t>(), pitch, roi) );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+// Single-channel adapter: only the first scalar component is passed.
+template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    static void call(GpuMat& src, Scalar s)
+    {
+        Scalar_<src_t> fillValue = s;
+        NppiSize roi;
+        roi.height = src.rows;
+        roi.width = src.cols;
+        const int pitch = static_cast<int>(src.step);
+        nppSafeCall( func(fillValue[0], src.ptr<src_t>(), pitch, roi) );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+// Function-pointer type of an NPP masked fill primitive; multi-channel
+// variants take the values as an array.
+template<int SDEPTH, int SCN> struct NppSetMaskFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+// Single-channel variants take the value directly.
+template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+// Adapter with a uniform call(src, s, mask) entry point around one NPP masked
+// fill primitive; it waits on cudaDeviceSynchronize() before returning.
+template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    static void call(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        Scalar_<src_t> fillValue = s;
+        NppiSize roi;
+        roi.height = src.rows;
+        roi.width = src.cols;
+        const int pitch = static_cast<int>(src.step);
+        const int maskPitch = static_cast<int>(mask.step);
+        nppSafeCall( func(fillValue.val, src.ptr<src_t>(), pitch, roi, mask.ptr<Npp8u>(), maskPitch) );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+// Single-channel adapter: only the first scalar component is passed.
+template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    static void call(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        Scalar_<src_t> fillValue = s;
+        NppiSize roi;
+        roi.height = src.rows;
+        roi.width = src.cols;
+        const int pitch = static_cast<int>(src.step);
+        const int maskPitch = static_cast<int>(mask.step);
+        nppSafeCall( func(fillValue[0], src.ptr<src_t>(), pitch, roi, mask.ptr<Npp8u>(), maskPitch) );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+//////////////////////////////////////////////////////////////////////////
+// CopyMasked
+// Function-pointer type of an NPP masked copy primitive for depth SDEPTH.
+template<int SDEPTH> struct NppCopyMaskedFunc
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
+};
+// Adapter with a uniform call(src, dst, mask, stream) entry point around one
+// NPP masked copy primitive. The stream argument is accepted but unused; the
+// call waits on cudaDeviceSynchronize() before returning.
+template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
+{
+    typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+    static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
+    {
+        NppiSize roi;
+        roi.height = src.rows;
+        roi.width = src.cols;
+        const int srcPitch = static_cast<int>(src.step);
+        const int dstPitch = static_cast<int>(dst.step);
+        const int maskPitch = static_cast<int>(mask.step);
+        nppSafeCall( func(src.ptr<src_t>(), srcPitch, dst.ptr<src_t>(), dstPitch, roi, mask.ptr<Npp8u>(), maskPitch) );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+};
+// True when the numeric value of ptr is an exact multiple of size.
+template <typename T> static inline bool isAligned(const T* ptr, size_t size)
+{
+    const size_t address = reinterpret_cast<size_t>(ptr);
+    return (address % size) == 0;
+}
+namespace cv { namespace gpu { namespace device
+{
+    // Prototypes first, so the default stream arguments are declared before the
+    // definitions below.
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0);
+    void convertTo(const GpuMat& src, GpuMat& dst);
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0);
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
+    void setTo(GpuMat& src, Scalar s);
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask);
+    // Masked copy: checks that src, dst and mask agree, then forwards the
+    // single-channel views to copyToWithMask_gpu.
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    {
+        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
+        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
+        const size_t elemBytes = src.elemSize1();
+        const int channelCount = src.channels();
+        // Any mask that is not single-channel is passed on as a "color" mask.
+        const bool colorMask = mask.channels() != 1;
+        cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), elemBytes, channelCount, mask.reshape(1), colorMask, stream);
+    }
+    // Conversion with alpha = 1, beta = 0 on the null stream.
+    void convertTo(const GpuMat& src, GpuMat& dst)
+    {
+        const double alpha = 1.0;
+        const double beta = 0.0;
+        cudaStream_t nullStream = 0;
+        cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, nullStream);
+    }
+    // Conversion with caller-supplied alpha, beta and stream.
+    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
+    {
+        const int srcDepth = src.depth();
+        const int dstDepth = dst.depth();
+        cv::gpu::device::convert_gpu(src.reshape(1), srcDepth, dst.reshape(1), dstDepth, alpha, beta, stream);
+    }
+    // Unmasked fill: picks the kernelSetCaller instantiation by src.depth().
+    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
+        // One entry per depth, indexed by the depth value itself.
+        static const caller_t callers[] =
+        {
+            kernelSetCaller<uchar>,
+            kernelSetCaller<schar>,
+            kernelSetCaller<ushort>,
+            kernelSetCaller<short>,
+            kernelSetCaller<int>,
+            kernelSetCaller<float>,
+            kernelSetCaller<double>
+        };
+        const caller_t fill = callers[src.depth()];
+        fill(src, s, stream);
+    }
+    // Masked fill: same dispatch, using the masked kernelSetCaller overload.
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
+        static const caller_t callers[] =
+        {
+            kernelSetCaller<uchar>,
+            kernelSetCaller<schar>,
+            kernelSetCaller<ushort>,
+            kernelSetCaller<short>,
+            kernelSetCaller<int>,
+            kernelSetCaller<float>,
+            kernelSetCaller<double>
+        };
+        const caller_t fill = callers[src.depth()];
+        fill(src, s, mask, stream);
+    }
+    // Unmasked fill on the null stream.
+    void setTo(GpuMat& src, Scalar s)
+    {
+        // A typed null stream selects the (src, s, stream) overload explicitly.
+        cudaStream_t nullStream = 0;
+        setTo(src, s, nullStream);
+    }
+    // Masked fill on the null stream.
+    void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        cudaStream_t nullStream = 0;
+        setTo(src, s, mask, nullStream);
+    }
+}}}
+// Parses the compile-time CUDA_ARCH_BIN, CUDA_ARCH_PTX and CUDA_ARCH_FEATURES
+// strings into sorted integer lists and answers build-target queries on them.
+// A (major, minor) pair is matched against the lists as major * 10 + minor.
+class CudaArch
+{
+public:
+    CudaArch()
+    {
+        fromStr(CUDA_ARCH_BIN, bin);
+        fromStr(CUDA_ARCH_PTX, ptx);
+        fromStr(CUDA_ARCH_FEATURES, features);
+    }
+    // True when the largest CUDA_ARCH_FEATURES entry is at least feature_set.
+    bool builtWith(FeatureSet feature_set) const
+    {
+        return !features.empty() && (features.back() >= feature_set);
+    }
+    // True when exactly major.minor is listed in CUDA_ARCH_PTX.
+    bool hasPtx(int major, int minor) const
+    {
+        return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
+    }
+    // True when exactly major.minor is listed in CUDA_ARCH_BIN.
+    bool hasBin(int major, int minor) const
+    {
+        return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
+    }
+    // True when the smallest PTX entry is <= major.minor.
+    bool hasEqualOrLessPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.front() <= major * 10 + minor);
+    }
+    // True when the largest PTX entry is >= major.minor.
+    bool hasEqualOrGreaterPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.back() >= major * 10 + minor);
+    }
+    // True when the largest BIN entry is >= major.minor.
+    bool hasEqualOrGreaterBin(int major, int minor) const
+    {
+        return !bin.empty() && (bin.back() >= major * 10 + minor);
+    }
+private:
+    // Appends every whitespace-separated integer found in set_as_str to arr and
+    // sorts arr ascending. A blank string leaves arr untouched.
+    void fromStr(const string& set_as_str, vector<int>& arr)
+    {
+        if (set_as_str.find_first_not_of(" ") == string::npos)
+            return;
+        istringstream stream(set_as_str);
+        int cur_value;
+        // The extraction result drives the loop. Testing eof() instead would
+        // append one bogus element after trailing whitespace (the failed read
+        // still reached push_back) and would never terminate on a non-numeric
+        // token, because a failed read consumes nothing and never sets eof.
+        while (stream >> cur_value)
+            arr.push_back(cur_value);
+        sort(arr.begin(), arr.end());
+    }
+    vector<int> bin;       // sorted CUDA_ARCH_BIN entries
+    vector<int> ptx;       // sorted CUDA_ARCH_PTX entries
+    vector<int> features;  // sorted CUDA_ARCH_FEATURES entries
+};
+// Lazily filled cache of cudaDeviceProp records, indexed by device id. The
+// cache owns the records and releases them in its destructor.
+class DeviceProps
+{
+public:
+    DeviceProps()
+    {
+        props_.resize(10, 0);
+    }
+    ~DeviceProps()
+    {
+        // delete on a null pointer is a no-op, so empty slots need no test.
+        for (size_t i = 0; i < props_.size(); ++i)
+            delete props_[i];
+        props_.clear();
+    }
+    // Returns the cached properties of device devID, querying CUDA on first use.
+    // devID must be non-negative. If the CUDA query fails, the error raised by
+    // cudaSafeCall propagates and nothing is cached for that device.
+    cudaDeviceProp* get(int devID)
+    {
+        // A negative id would index in front of the vector.
+        CV_Assert(devID >= 0);
+        if (devID >= (int) props_.size())
+            props_.resize(devID + 5, 0);
+        if (!props_[devID])
+        {
+            // Fill a detached record first: publishing it before the query
+            // succeeds would leave an unfilled record in the cache, which later
+            // calls would return as if it were valid.
+            cudaDeviceProp* fresh = new cudaDeviceProp;
+            try
+            {
+                cudaSafeCall( cudaGetDeviceProperties(fresh, devID) );
+            }
+            catch (...)
+            {
+                delete fresh;
+                throw;
+            }
+            props_[devID] = fresh;
+        }
+        return props_[devID];
+    }
+private:
+    // Not copyable: a copy would share the owned pointers and delete them twice.
+    DeviceProps(const DeviceProps&);
+    DeviceProps& operator=(const DeviceProps&);
+    std::vector<cudaDeviceProp*> props_;
+};
+DeviceProps deviceProps;
+// DeviceInfoFuncTable implementation backed by the CUDA runtime API.
+// Per-device answers come from the cached fields filled by query(); build-target
+// answers are delegated to the CudaArch member.
+class CudaDeviceInfoFuncTable : public DeviceInfoFuncTable
+{
+public:
+    // Gives every scalar member a defined value. No other code in this class
+    // assigns device_id_, so without this an instance that is not placed in
+    // static storage would read indeterminate values.
+    CudaDeviceInfoFuncTable()
+        : device_id_(0), multi_processor_count_(0), majorVersion_(0), minorVersion_(0)
+    {
+    }
+    size_t sharedMemPerBlock() const
+    {
+        return deviceProps.get(device_id_)->sharedMemPerBlock;
+    }
+    // Reads total and free memory of device_id_, temporarily switching the
+    // current device when it differs.
+    void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
+    {
+        int prevDeviceID = getDevice();
+        if (prevDeviceID != device_id_)
+            setDevice(device_id_);
+        // Keep the status and report it only after the previous device has been
+        // restored, so a failed query does not leave the wrong device selected.
+        const cudaError_t status = cudaMemGetInfo(&_freeMemory, &_totalMemory);
+        if (prevDeviceID != device_id_)
+            setDevice(prevDeviceID);
+        cudaSafeCall( status );
+    }
+    size_t freeMemory() const
+    {
+        size_t _totalMemory, _freeMemory;
+        queryMemory(_totalMemory, _freeMemory);
+        return _freeMemory;
+    }
+    size_t totalMemory() const
+    {
+        size_t _totalMemory, _freeMemory;
+        queryMemory(_totalMemory, _freeMemory);
+        return _totalMemory;
+    }
+    // Compares the cached compute capability (major * 10 + minor) with feature_set.
+    bool supports(FeatureSet feature_set) const
+    {
+        int version = majorVersion_ * 10 + minorVersion_;
+        return version >= feature_set;
+    }
+    bool isCompatible() const
+    {
+        // Check PTX compatibility
+        if (hasEqualOrLessPtx(majorVersion_, minorVersion_))
+            return true;
+        // Check BIN compatibility: same major version, any minor <= the cached one
+        for (int i = minorVersion_; i >= 0; --i)
+        {
+            if (hasBin(majorVersion_, i))
+                return true;
+        }
+        return false;
+    }
+    // Refreshes the cached name, multiprocessor count and compute capability
+    // from the properties of device_id_.
+    void query()
+    {
+        const cudaDeviceProp* prop = deviceProps.get(device_id_);
+        name_ = prop->name;
+        multi_processor_count_ = prop->multiProcessorCount;
+        majorVersion_ = prop->major;
+        minorVersion_ = prop->minor;
+    }
+    int deviceID() const
+    {
+        return device_id_;
+    }
+    std::string name() const
+    {
+        return name_;
+    }
+    int majorVersion() const
+    {
+        return majorVersion_;
+    }
+    int minorVersion() const
+    {
+        return minorVersion_;
+    }
+    int multiProcessorCount() const
+    {
+        return multi_processor_count_;
+    }
+    // Returns -1 for an insufficient driver, 0 when there is no device, and the
+    // device count otherwise; any other CUDA error goes through cudaSafeCall.
+    int getCudaEnabledDeviceCount() const
+    {
+        int count;
+        cudaError_t error = cudaGetDeviceCount( &count );
+        if (error == cudaErrorInsufficientDriver)
+            return -1;
+        if (error == cudaErrorNoDevice)
+            return 0;
+        cudaSafeCall( error );
+        return count;
+    }
+    void setDevice(int device) const
+    {
+        cudaSafeCall( cudaSetDevice( device ) );
+    }
+    int getDevice() const
+    {
+        int device;
+        cudaSafeCall( cudaGetDevice( &device ) );
+        return device;
+    }
+    void resetDevice() const
+    {
+        cudaSafeCall( cudaDeviceReset() );
+    }
+    bool builtWith(FeatureSet feature_set) const
+    {
+        return cudaArch.builtWith(feature_set);
+    }
+    bool has(int major, int minor) const
+    {
+        return hasPtx(major, minor) || hasBin(major, minor);
+    }
+    bool hasPtx(int major, int minor) const
+    {
+        return cudaArch.hasPtx(major, minor);
+    }
+    bool hasBin(int major, int minor) const
+    {
+        return cudaArch.hasBin(major, minor);
+    }
+    bool hasEqualOrLessPtx(int major, int minor) const
+    {
+        return cudaArch.hasEqualOrLessPtx(major, minor);
+    }
+    bool hasEqualOrGreater(int major, int minor) const
+    {
+        return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
+    }
+    bool hasEqualOrGreaterPtx(int major, int minor) const
+    {
+        return cudaArch.hasEqualOrGreaterPtx(major, minor);
+    }
+    bool hasEqualOrGreaterBin(int major, int minor) const
+    {
+        return cudaArch.hasEqualOrGreaterBin(major, minor);
+    }
+    // Answers for the *current* device (getDevice()), not device_id_. The
+    // compute capability of the first few device ids is remembered in a
+    // function-local static table.
+    bool deviceSupports(FeatureSet feature_set) const
+    {
+        static int versions[] =
+        {
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+        };
+        static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
+        const int devId = getDevice();
+        int version;
+        if (devId < cache_size && versions[devId] >= 0)
+            version = versions[devId];
+        else
+        {
+            DeviceInfo dev(devId);
+            version = dev.majorVersion() * 10 + dev.minorVersion();
+            if (devId < cache_size)
+                versions[devId] = version;
+        }
+        return TargetArchs::builtWith(feature_set) && (version >= feature_set);
+    }
+    // Prints a detailed report for `device`, or for every device when `device`
+    // is not a valid index.
+    void printCudaDeviceInfo(int device) const
+    {
+        int count = getCudaEnabledDeviceCount();
+        bool valid = (device >= 0) && (device < count);
+        int beg = valid ? device   : 0;
+        int end = valid ? device+1 : count;
+        printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
+        printf("Device count: %d\n", count);
+        int driverVersion = 0, runtimeVersion = 0;
+        cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
+        cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
+        const int driverMajor = cudaVersionMajor(driverVersion);
+        const int driverMinor = cudaVersionMinor(driverVersion);
+        const int runtimeMajor = cudaVersionMajor(runtimeVersion);
+        const int runtimeMinor = cudaVersionMinor(runtimeVersion);
+        const char *computeMode[] = {
+            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
+            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
+            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
+            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
+            "Unknown",
+            NULL
+        };
+        // Index of the "Unknown" entry; any mode outside the named ones maps here.
+        const int unknownMode = 4;
+        for(int dev = beg; dev < end; ++dev)
+        {
+            cudaDeviceProp prop;
+            cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
+            printf("\nDevice %d: \"%s\"\n", dev, prop.name);
+            printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverMajor, driverMinor, runtimeMajor, runtimeMinor);
+            printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
+            printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
+            int cores = convertSMVer2Cores(prop.major, prop.minor);
+            if (cores > 0)
+                printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);
+            printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);
+            printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
+                   prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
+                   prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
+            printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
+                   prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
+                   prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
+            // %u expects an unsigned int argument, so cast to exactly that type.
+            printf("  Total amount of constant memory:               %u bytes\n", (unsigned int)prop.totalConstMem);
+            printf("  Total amount of shared memory per block:       %u bytes\n", (unsigned int)prop.sharedMemPerBlock);
+            printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
+            printf("  Warp size:                                     %d\n", prop.warpSize);
+            printf("  Maximum number of threads per block:           %d\n", prop.maxThreadsPerBlock);
+            printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
+            printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1],  prop.maxGridSize[2]);
+            printf("  Maximum memory pitch:                          %u bytes\n", (unsigned int)prop.memPitch);
+            printf("  Texture alignment:                             %u bytes\n", (unsigned int)prop.textureAlignment);
+            printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
+            printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
+            printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
+            printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");
+            printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
+            printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
+            printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
+            printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
+            printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
+            printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID );
+            printf("  Compute Mode:\n");
+            // Clamp the index so an unexpected mode value cannot read past the
+            // table or hand the NULL terminator to %s.
+            const int mode = (prop.computeMode >= 0 && prop.computeMode < unknownMode) ? prop.computeMode : unknownMode;
+            printf("      %s \n", computeMode[mode]);
+        }
+        printf("\n");
+        printf("deviceQuery, CUDA Driver = CUDART");
+        printf(", CUDA Driver Version  = %d.%d", driverMajor, driverMinor);
+        printf(", CUDA Runtime Version = %d.%d", runtimeMajor, runtimeMinor);
+        printf(", NumDevs = %d\n\n", count);
+        fflush(stdout);
+    }
+    // Prints a one-line summary for `device`, or for every device when `device`
+    // is not a valid index.
+    void printShortCudaDeviceInfo(int device) const
+    {
+        int count = getCudaEnabledDeviceCount();
+        bool valid = (device >= 0) && (device < count);
+        int beg = valid ? device   : 0;
+        int end = valid ? device+1 : count;
+        int driverVersion = 0, runtimeVersion = 0;
+        cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
+        cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );
+        for(int dev = beg; dev < end; ++dev)
+        {
+            cudaDeviceProp prop;
+            cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );
+            const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
+            printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
+            printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);
+            int cores = convertSMVer2Cores(prop.major, prop.minor);
+            if (cores > 0)
+                printf(", %d cores", cores * prop.multiProcessorCount);
+            printf(", Driver/Runtime ver.%d.%d/%d.%d\n", cudaVersionMajor(driverVersion), cudaVersionMinor(driverVersion),
+                   cudaVersionMajor(runtimeVersion), cudaVersionMinor(runtimeVersion));
+        }
+        fflush(stdout);
+    }
+private:
+    int device_id_;
+    std::string name_;
+    int multi_processor_count_;
+    int majorVersion_;
+    int minorVersion_;
+    const CudaArch cudaArch;
+    // CUDA reports versions as 1000 * major + 10 * minor (5050 means 5.5), so
+    // the minor digit is (version % 100) / 10; a bare version % 100 would print
+    // 5.5 as "5.50".
+    static int cudaVersionMajor(int version)
+    {
+        return version / 1000;
+    }
+    static int cudaVersionMinor(int version)
+    {
+        return (version % 100) / 10;
+    }
+    // Looks up the number of CUDA cores per multiprocessor for the given
+    // compute capability; returns -1 when the capability is not in the table.
+    int convertSMVer2Cores(int major, int minor) const
+    {
+        // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+        typedef struct {
+            int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
+            int Cores;
+        } SMtoCores;
+        SMtoCores gpuArchCoresPerSM[] =  { { 0x10,  8 }, { 0x11,  8 }, { 0x12,  8 }, { 0x13,  8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 }  };
+        const int wanted = (major << 4) + minor;
+        // The table ends with an SM == -1 sentinel.
+        for (int index = 0; gpuArchCoresPerSM[index].SM != -1; ++index)
+        {
+            if (gpuArchCoresPerSM[index].SM == wanted)
+                return gpuArchCoresPerSM[index].Cores;
+        }
+        return -1;
+    }
+};
+// GpuFuncTable implementation that forwards GpuMat operations to the CUDA runtime,
+// to NPP wrappers (NppCopyMasked / NppCvt / NppSet / NppSetMask) and to the
+// cv::gpu::device kernels. Dispatch tables are indexed by matrix depth and by
+// (channels - 1).
+class CudaFuncTable : public GpuFuncTable
+{
+public:
+    // Host -> device copy of src.rows rows of src.cols * src.elemSize() bytes each.
+    // dst is not (re)allocated here; it is used as given.
+    void copy(const Mat& src, GpuMat& dst) const
+    {
+        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
+    }
+    // Device -> host copy; same geometry rules as the host -> device overload.
+    void copy(const GpuMat& src, Mat& dst) const
+    {
+        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
+    }
+    // Device -> device copy; same geometry rules as the other overloads.
+    void copy(const GpuMat& src, GpuMat& dst) const
+    {
+        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
+    }
+    // Copies src into dst under mask.
+    // Requirements (asserted): depth <= CV_64F, at most 4 channels, src/dst agree in
+    // size and type, mask is CV_8U of the same size with 1 or src.channels() channels.
+    // Raises CV_StsUnsupportedFormat for CV_64F data when the build or the device
+    // lacks NATIVE_DOUBLE. The work is issued on stream 0.
+    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
+    {
+        CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
+        CV_Assert(src.size() == dst.size() && src.type() == dst.type());
+        CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
+        if (src.depth() == CV_64F)
+        {
+            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+        }
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
+        // funcs[depth][channels - 1]
+        static const func_t funcs[7][4] =
+        {
+            /*  8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
+            /*  8S */ {cv::gpu::device::copyWithMask                ,  cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask                 , cv::gpu::device::copyWithMask                         },
+            /* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
+            /* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
+            /* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
+            /* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
+            /* 64F */ {cv::gpu::device::copyWithMask                ,  cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask                 , cv::gpu::device::copyWithMask                         }
+        };
+        // The table is only consulted when the mask has as many channels as the
+        // source; a single-channel mask over a multi-channel source always goes to
+        // the generic device kernel.
+        const func_t func =  mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;
+        func(src, dst, mask, 0);
+    }
+    // Converts src to the depth of dst without scaling.
+    // Requirements (asserted): depths <= CV_64F, at most 4 channels, equal size and
+    // channel count. src and dst must differ in depth: the same-depth table entries
+    // are null and are rejected with CV_Assert.
+    // Raises CV_StsUnsupportedFormat when CV_64F is involved and the build or the
+    // device lacks NATIVE_DOUBLE.
+    void convert(const GpuMat& src, GpuMat& dst) const
+    {
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
+        // funcs[source depth][destination depth][channels - 1]
+        static const func_t funcs[7][7][4] =
+        {
+            {
+                /*  8U ->  8U */ {0, 0, 0, 0},
+                /*  8U ->  8S */ {cv::gpu::device::convertTo                        , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /*  8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::call},
+                /*  8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::call},
+                /*  8U -> 32S */ {cv::gpu::device::convertTo                        , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /*  8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /*  8U -> 64F */ {cv::gpu::device::convertTo                        , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                }
+            },
+            {
+                /*  8S ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /*  8S ->  8S */ {0,0,0,0},
+                /*  8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /*  8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /*  8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /*  8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /*  8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 16U ->  8U */ {NppCvt<CV_16U, CV_8U , nppiConvert_16u8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::call},
+                /* 16U ->  8S */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /* 16U -> 16U */ {0,0,0,0},
+                /* 16U -> 16S */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /* 16U -> 64F */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                }
+            },
+            {
+                /* 16S ->  8U */ {NppCvt<CV_16S, CV_8U , nppiConvert_16s8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::call},
+                /* 16S ->  8S */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /* 16S -> 16U */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /* 16S -> 16S */ {0,0,0,0},
+                /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                },
+                /* 16S -> 64F */ {cv::gpu::device::convertTo                                  , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo                                }
+            },
+            {
+                /* 32S ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 32S */ {0,0,0,0},
+                /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 32F ->  8U */ {NppCvt<CV_32F, CV_8U , nppiConvert_32f8u_C1R >::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F ->  8S */ {cv::gpu::device::convertTo                          , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 32S */ {cv::gpu::device::convertTo                          , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 32F */ {0,0,0,0},
+                /* 32F -> 64F */ {cv::gpu::device::convertTo                          , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 64F ->  8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F ->  8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 64F */ {0,0,0,0}
+            }
+        };
+        CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
+        CV_Assert(dst.depth() <= CV_64F);
+        CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
+        if (src.depth() == CV_64F || dst.depth() == CV_64F)
+        {
+            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+        }
+        // The table is bypassed unless both data pointers pass isAligned(ptr, 16);
+        // otherwise the generic device kernel handles the conversion.
+        bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
+        if (!aligned)
+        {
+            cv::gpu::device::convertTo(src, dst);
+            return;
+        }
+        const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
+        // Same-depth entries are null. Check in every build type so that a release
+        // build reports an error instead of calling through a null pointer.
+        CV_Assert(func != 0);
+        func(src, dst);
+    }
+    // Converts src to the depth of dst with scaling (alpha, beta are forwarded to
+    // cv::gpu::device::convertTo) on the given stream.
+    // Raises CV_StsUnsupportedFormat when CV_64F is involved and the build or the
+    // device lacks NATIVE_DOUBLE.
+    void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
+    {
+        CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
+        CV_Assert(dst.depth() <= CV_64F);
+        if (src.depth() == CV_64F || dst.depth() == CV_64F)
+        {
+            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+        }
+        cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
+    }
+    // Fills m with scalar s, optionally only where mask is set.
+    // Without a mask, an all-zero scalar (any type) and a uniform CV_8U scalar are
+    // handled by a plain 2D memset. Every path honours `stream`: a non-null stream
+    // gets asynchronous work on that stream, a null stream uses the synchronous
+    // table-dispatched implementations.
+    void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
+    {
+        if (mask.empty())
+        {
+            if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
+            {
+                memset2DOnStream(m, 0, stream);
+                return;
+            }
+            if (m.depth() == CV_8U)
+            {
+                int cn = m.channels();
+                // A byte-wise memset is only valid when every channel gets the same value.
+                if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
+                {
+                    int val = saturate_cast<uchar>(s[0]);
+                    memset2DOnStream(m, val, stream);
+                    return;
+                }
+            }
+            typedef void (*func_t)(GpuMat& src, Scalar s);
+            // funcs[depth][channels - 1]
+            static const func_t funcs[7][4] =
+            {
+                {NppSet<CV_8U , 1, nppiSet_8u_C1R >::call, cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , NppSet<CV_8U , 4, nppiSet_8u_C4R >::call},
+                {cv::gpu::device::setTo                  , cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , cv::gpu::device::setTo                          },
+                {NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::device::setTo                        , NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
+                {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo                        , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
+                {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
+                {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
+                {cv::gpu::device::setTo                  , cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , cv::gpu::device::setTo                          }
+            };
+            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+            if (m.depth() == CV_64F)
+            {
+                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+            }
+            // The table entries take no stream argument, so a stream request always
+            // goes through the device kernel.
+            if (stream)
+                cv::gpu::device::setTo(m, s, stream);
+            else
+                funcs[m.depth()][m.channels() - 1](m, s);
+        }
+        else
+        {
+            typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
+            // funcs[depth][channels - 1]
+            static const func_t funcs[7][4] =
+            {
+                {NppSetMask<CV_8U , 1, nppiSet_8u_C1MR >::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_8U , 4, nppiSet_8u_C4MR >::call},
+                {cv::gpu::device::setTo                       , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo                               },
+                {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
+                {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
+                {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
+                {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
+                {cv::gpu::device::setTo                       , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo                               }
+            };
+            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+            if (m.depth() == CV_64F)
+            {
+                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+            }
+            if (stream)
+                cv::gpu::device::setTo(m, s, mask, stream);
+            else
+                funcs[m.depth()][m.channels() - 1](m, s, mask);
+        }
+    }
+    // Allocates pitched device memory; *step receives the pitch chosen by CUDA.
+    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
+    {
+        cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
+    }
+    // Releases device memory. The cudaFree result is not passed through
+    // cudaSafeCall, so a failure here is not reported to the caller.
+    void free(void* devPtr) const
+    {
+        cudaFree(devPtr);
+    }
+private:
+    // Byte-wise 2D memset over the whole matrix. Uses the asynchronous variant on
+    // the caller's stream when one is given, so the fill stays ordered with the
+    // other work queued on that stream; otherwise uses the plain cudaMemset2D.
+    static void memset2DOnStream(GpuMat& m, int value, cudaStream_t stream)
+    {
+        if (stream)
+            cudaSafeCall( cudaMemset2DAsync(m.data, m.step, value, m.cols * m.elemSize(), m.rows, stream) );
+        else
+            cudaSafeCall( cudaMemset2D(m.data, m.step, value, m.cols * m.elemSize(), m.rows) );
+    }
+};
+#endif
+#endif
--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
--- a/modules/dynamicuda/src/main.cpp
+++ b/modules/dynamicuda/src/main.cpp
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+#include <stdio.h>
+#include <iostream>
+#ifdef HAVE_CUDA // CUDA-enabled build: pull in the toolkit headers and reject toolkits that are too old
+#include <cuda_runtime.h>
+#include <npp.h>
+#define CUDART_MINIMUM_REQUIRED_VERSION 4020 // lower bound compared against CUDART_VERSION below
+#define NPP_MINIMUM_REQUIRED_VERSION 4200 // lower bound compared against major*1000 + minor*100 + build below
+#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) // stop the build on an older CUDA runtime
+#error "Insufficient Cuda Runtime library version, please update it."
+#endif
+#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION) // stop the build on an older NPP
+#error "Insufficient NPP version, please update it."
+#endif
+#endif // HAVE_CUDA
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") // defined ahead of the dynamicuda.hpp include; presumably that header expands it - TODO confirm
+#include "opencv2/dynamicuda/dynamicuda.hpp"
+#ifdef HAVE_CUDA // pick the table implementations the factory functions in this file hand out
+static CudaDeviceInfoFuncTable deviceInfoTable; // CUDA-backed device info table
+static CudaFuncTable gpuTable; // CUDA-backed GpuMat operation table
+#else // build without CUDA: use the Empty* table types instead
+static EmptyDeviceInfoFuncTable deviceInfoTable;
+static EmptyFuncTable gpuTable;
+#endif
+// Factory entry points of this module. They are declared with C linkage, so the
+// symbol names are not C++-mangled.
+// NOTE(review): presumably these are looked up by name at run time by the code
+// that loads this library - confirm against the loader.
+extern "C" DeviceInfoFuncTable* deviceInfoFactory();
+extern "C" GpuFuncTable* gpuFactory();
+
+// Returns the address of the file-level deviceInfoTable object.
+extern "C" DeviceInfoFuncTable* deviceInfoFactory()
+{
+    DeviceInfoFuncTable* const infoTable = (DeviceInfoFuncTable*)&deviceInfoTable;
+    return infoTable;
+}
+
+// Returns the address of the file-level gpuTable object.
+extern "C" GpuFuncTable* gpuFactory()
+{
+    GpuFuncTable* const funcTable = (GpuFuncTable*)&gpuTable;
+    return funcTable;
+}
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -3,7 +3,8 @@ if(IOS)
 endif()
 set(the_description "GPU-accelerated Computer Vision")
-ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy)
+ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy
+               OPTIONAL ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")

--- a/modules/gpu/perf4au/CMakeLists.txt
+++ b/modules/gpu/perf4au/CMakeLists.txt
@@ -2,26 +2,28 @@ set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video
 ocv_check_dependencies(${PERF4AU_REQUIRED_DEPS})
-set(the_target gpu_perf4au)
+if (OCV_DEPENDENCIES_FOUND)
-project(${the_target})
+  set(the_target gpu_perf4au)
+  project(${the_target})
-ocv_include_modules(${PERF4AU_REQUIRED_DEPS})
+  ocv_include_modules(${PERF4AU_REQUIRED_DEPS})
-if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS)
+  if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function")
-endif()
+  endif()
-file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp)
+  file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp)
-add_executable(${the_target} ${srcs})
+  add_executable(${the_target} ${srcs})
-target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS})
+  target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS})
-if(ENABLE_SOLUTION_FOLDERS)
+  if(ENABLE_SOLUTION_FOLDERS)
-  set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
+    set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
-endif()
+  endif()
-if(WIN32)
+  if(WIN32)
    if(MSVC AND NOT BUILD_SHARED_LIBS)
-        set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
+      set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
    endif()
-endif()
+  endif()
+endif()
\ No newline at end of file
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -297,6 +297,12 @@ if(BUILD_FAT_JAVA_LIB)
      list(REMOVE_ITEM __deps ${m})
    endif()
  endforeach()
+  if (ENABLE_DYNAMIC_CUDA)
+    list(REMOVE_ITEM __deps "opencv_dynamicuda")
+  endif()
+  if (ANDROID AND HAVE_opencv_gpu)
+    list(REMOVE_ITEM __deps "opencv_gpu")
+  endif()
  ocv_list_unique(__deps)
  set(__extradeps ${__deps})
  ocv_list_filterout(__extradeps "^opencv_")

--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
 set(the_description "Images stitching")
-ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree)
+# The required dependencies are the same in both configurations; only the
+# optional list differs. opencv_gpu is an optional dependency only when
+# ENABLE_DYNAMIC_CUDA is off. Keeping a single ocv_define_module() call
+# prevents the two dependency lists from drifting apart.
+if(ENABLE_DYNAMIC_CUDA)
+  set(stitching_optional_deps opencv_nonfree)
+else()
+  set(stitching_optional_deps opencv_gpu opencv_nonfree)
+endif()
+ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL ${stitching_optional_deps})
\ No newline at end of file
--- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp
@@ -227,7 +227,7 @@ private:
 };
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class CV_EXPORTS GraphCutSeamFinderGpu : public GraphCutSeamFinderBase, public PairwiseSeamFinder
 {
 public:

--- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
@@ -46,7 +46,7 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/opencv_modules.hpp"
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 # include "opencv2/gpu/gpu.hpp"
 #endif
@@ -331,7 +331,7 @@ public:
 };
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class CV_EXPORTS PlaneWarperGpu : public PlaneWarper
 {
 public:

--- a/modules/stitching/include/opencv2/stitching/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/warpers.hpp
@@ -145,7 +145,7 @@ public:
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class PlaneWarperGpu: public WarperCreator
 {
 public:

--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@@ -189,7 +189,7 @@ Rect FeatherBlender::createWeightMaps(const vector<Mat> &masks, const vector<Poi
 MultiBandBlender::MultiBandBlender(int try_gpu, int num_bands, int weight_type)
 {
    setNumBands(num_bands);
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    can_use_gpu_ = try_gpu && gpu::getCudaEnabledDeviceCount();
 #else
    (void)try_gpu;
@@ -491,7 +491,7 @@ void createLaplacePyr(const Mat &img, int num_levels, vector<Mat> &pyr)
 void createLaplacePyrGpu(const Mat &img, int num_levels, vector<Mat> &pyr)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    pyr.resize(num_levels + 1);
    vector<gpu::GpuMat> gpu_pyr(num_levels + 1);
@@ -531,7 +531,7 @@ void restoreImageFromLaplacePyr(vector<Mat> &pyr)
 void restoreImageFromLaplacePyrGpu(vector<Mat> &pyr)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    if (pyr.empty())
        return;

--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -46,7 +46,7 @@ using namespace std;
 using namespace cv;
 using namespace cv::detail;
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 using namespace cv::gpu;
 #endif
@@ -129,7 +129,7 @@ private:
    float match_conf_;
 };
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class GpuMatcher : public FeaturesMatcher
 {
 public:
@@ -204,7 +204,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
    LOG("1->2 & 2->1 matches: " << matches_info.matches.size() << endl);
 }
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo& matches_info)
 {
    matches_info.matches.clear();
@@ -432,7 +432,7 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features)
    }
 }
-#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU)
+#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 SurfFeaturesFinderGpu::SurfFeaturesFinderGpu(double hess_thresh, int num_octaves, int num_layers,
                                             int num_octaves_descr, int num_layers_descr)
 {
@@ -533,7 +533,7 @@ void FeaturesMatcher::operator ()(const vector<ImageFeatures> &features, vector<
 BestOf2NearestMatcher::BestOf2NearestMatcher(bool try_use_gpu, float match_conf, int num_matches_thresh1, int num_matches_thresh2)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    if (try_use_gpu && getCudaEnabledDeviceCount() > 0)
        impl_ = new GpuMatcher(match_conf);
    else

--- a/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@ -68,7 +68,7 @@
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/features2d/features2d.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    #include "opencv2/gpu/gpu.hpp"
    #ifdef HAVE_OPENCV_NONFREE

--- a/modules/stitching/src/seam_finders.cpp
+++ b/modules/stitching/src/seam_finders.cpp
@@ -1318,7 +1318,7 @@ void GraphCutSeamFinder::find(const vector<Mat> &src, const vector<Point> &corne
 }
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 void GraphCutSeamFinderGpu::find(const vector<Mat> &src, const vector<Point> &corners,
                                 vector<Mat> &masks)
 {

--- a/modules/stitching/src/stitcher.cpp
+++ b/modules/stitching/src/stitcher.cpp
@@ -58,7 +58,7 @@ Stitcher Stitcher::createDefault(bool try_use_gpu)
    stitcher.setFeaturesMatcher(new detail::BestOf2NearestMatcher(try_use_gpu));
    stitcher.setBundleAdjuster(new detail::BundleAdjusterRay());
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    if (try_use_gpu && gpu::getCudaEnabledDeviceCount() > 0)
    {
 #if defined(HAVE_OPENCV_NONFREE)

--- a/modules/stitching/src/warpers.cpp
+++ b/modules/stitching/src/warpers.cpp
@@ -212,7 +212,7 @@ void SphericalWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_b
 }
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, gpu::GpuMat &xmap, gpu::GpuMat &ymap)
 {
    return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap);

--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -4,4 +4,4 @@ endif()
 set(the_description "Super Resolution")
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef)
-ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_ocl)
+ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_ocl ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
--- a/modules/videostab/CMakeLists.txt
+++ b/modules/videostab/CMakeLists.txt
 set(the_description "Video stabilization")
-ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu)
+# The required dependencies are the same in both configurations. opencv_gpu is
+# added as an OPTIONAL dependency only when ENABLE_DYNAMIC_CUDA is off; with it
+# on, the variable stays empty and expands to no arguments at all.
+set(videostab_optional_deps "")
+if(NOT ENABLE_DYNAMIC_CUDA)
+  set(videostab_optional_deps OPTIONAL opencv_gpu)
+endif()
+ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui ${videostab_optional_deps})
--- a/modules/videostab/include/opencv2/videostab/optical_flow.hpp
+++ b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
@@ -46,7 +46,7 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/opencv_modules.hpp"
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 #  include "opencv2/gpu/gpu.hpp"
 #endif
@@ -98,7 +98,7 @@ public:
            OutputArray status, OutputArray errors);
 };
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class CV_EXPORTS DensePyrLkOptFlowEstimatorGpu
        : public PyrLkOptFlowEstimatorBase, public IDenseOptFlowEstimator
 {

--- a/modules/videostab/src/inpainting.cpp
+++ b/modules/videostab/src/inpainting.cpp
@@ -323,7 +323,7 @@ public:
 MotionInpainter::MotionInpainter()
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    setOptFlowEstimator(new DensePyrLkOptFlowEstimatorGpu());
 #else
    CV_Error(CV_StsNotImplemented, "Current implementation of MotionInpainter requires GPU");

--- a/modules/videostab/src/optical_flow.cpp
+++ b/modules/videostab/src/optical_flow.cpp
@@ -59,7 +59,7 @@ void SparsePyrLkOptFlowEstimator::run(
 }
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 DensePyrLkOptFlowEstimatorGpu::DensePyrLkOptFlowEstimatorGpu()
 {
    CV_Assert(gpu::getCudaEnabledDeviceCount() > 0);

--- a/samples/cpp/stitching_detailed.cpp
+++ b/samples/cpp/stitching_detailed.cpp
@@ -355,7 +355,7 @@ int main(int argc, char* argv[])
    Ptr<FeaturesFinder> finder;
    if (features_type == "surf")
    {
-#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU)
+#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
        if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0)
            finder = new SurfFeaturesFinderGpu();
        else
@@ -543,7 +543,7 @@ int main(int argc, char* argv[])
    // Warp images and their masks
    Ptr<WarperCreator> warper_creator;
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0)
    {
        if (warp_type == "plane") warper_creator = new cv::PlaneWarperGpu();
@@ -608,7 +608,7 @@ int main(int argc, char* argv[])
        seam_finder = new detail::VoronoiSeamFinder();
    else if (seam_find_type == "gc_color")
    {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
        if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0)
            seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR);
        else
@@ -617,7 +617,7 @@ int main(int argc, char* argv[])
    }
    else if (seam_find_type == "gc_colorgrad")
    {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
        if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0)
            seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR_GRAD);
        else