Merge pull request #328 from jet47:new-gpu-fixes

11dfceb2 · cuda-geek · OpenCV Buildbot · 2b4ffd11 · 395f0201 · 11dfceb2
Commit 11dfceb2 authored Jan 29, 2013 by cuda-geek Committed by OpenCV Buildbot Jan 29, 2013
169 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,14 +110,15 @@ endif()
 # Optional 3rd party components
 # ===================================================
-OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (UNIX AND NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (UNIX AND NOT ANDROID AND NOT IOS AND NOT CARMA) )
 OCV_OPTION(WITH_AVFOUNDATION   "Use AVFoundation for Video I/O"              ON   IF IOS)
 OCV_OPTION(WITH_CARBON         "Use Carbon for UI instead of Cocoa"          OFF  IF APPLE )
-OCV_OPTION(WITH_CUBLAS         "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_CUDA           "Include NVidia Cuda Runtime support"         ON   IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_CUFFT          "Include NVidia Cuda Fast Fourier Transform (FFT) library support"            ON  IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_CUBLAS         "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_NVCUVID        "Include NVidia Video Decoding library support"                               OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) )
 OCV_OPTION(WITH_EIGEN          "Include Eigen2/Eigen3 support"               ON)
-OCV_OPTION(WITH_FFMPEG         "Include FFMPEG support"                      ON   IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_FFMPEG         "Include FFMPEG support"                      ON   IF (NOT ANDROID AND NOT IOS))
 OCV_OPTION(WITH_GSTREAMER      "Include Gstreamer support"                   ON   IF (UNIX AND NOT APPLE AND NOT ANDROID) )
 OCV_OPTION(WITH_GTK            "Include GTK support"                         ON   IF (UNIX AND NOT APPLE AND NOT ANDROID) )
 OCV_OPTION(WITH_IMAGEIO        "ImageIO support for OS X"                    OFF  IF APPLE)
@@ -140,9 +141,9 @@ OCV_OPTION(WITH_V4L            "Include Video 4 Linux support"               ON
 OCV_OPTION(WITH_VIDEOINPUT     "Build HighGUI with DirectShow support"       ON   IF WIN32 )
 OCV_OPTION(WITH_XIMEA          "Include XIMEA cameras support"               OFF  IF (NOT ANDROID AND NOT APPLE) )
 OCV_OPTION(WITH_XINE           "Include Xine support (GPL)"                  OFF  IF (UNIX AND NOT APPLE AND NOT ANDROID) )
-OCV_OPTION(WITH_OPENCL         "Include OpenCL Runtime support"              OFF  IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_OPENCL         "Include OpenCL Runtime support"              OFF  IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
-OCV_OPTION(WITH_OPENCLAMDFFT   "Include AMD OpenCL FFT library support"      OFF  IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_OPENCLAMDFFT   "Include AMD OpenCL FFT library support"      OFF  IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
-OCV_OPTION(WITH_OPENCLAMDBLAS  "Include AMD OpenCL BLAS library support"     OFF  IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_OPENCLAMDBLAS  "Include AMD OpenCL BLAS library support"     OFF  IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
 # OpenCV build components
@@ -161,12 +162,12 @@ OCV_OPTION(BUILD_ANDROID_SERVICE    "Build OpenCV Manager for Google Play" OFF I
 OCV_OPTION(BUILD_ANDROID_PACKAGE    "Build platform-specific package for Google Play" OFF IF ANDROID )
 # 3rd party libs
-OCV_OPTION(BUILD_ZLIB               "Build zlib from source"         WIN32 OR APPLE )
+OCV_OPTION(BUILD_ZLIB               "Build zlib from source"         WIN32 OR APPLE OR CARMA )
-OCV_OPTION(BUILD_TIFF               "Build libtiff from source"      WIN32 OR ANDROID OR APPLE )
+OCV_OPTION(BUILD_TIFF               "Build libtiff from source"      WIN32 OR ANDROID OR APPLE OR CARMA )
-OCV_OPTION(BUILD_JASPER             "Build libjasper from source"    WIN32 OR ANDROID OR APPLE )
+OCV_OPTION(BUILD_JASPER             "Build libjasper from source"    WIN32 OR ANDROID OR APPLE OR CARMA )
-OCV_OPTION(BUILD_JPEG               "Build libjpeg from source"      WIN32 OR ANDROID OR APPLE )
+OCV_OPTION(BUILD_JPEG               "Build libjpeg from source"      WIN32 OR ANDROID OR APPLE OR CARMA )
-OCV_OPTION(BUILD_PNG                "Build libpng from source"       WIN32 OR ANDROID OR APPLE )
+OCV_OPTION(BUILD_PNG                "Build libpng from source"       WIN32 OR ANDROID OR APPLE OR CARMA )
-OCV_OPTION(BUILD_OPENEXR            "Build openexr from source"      WIN32 OR ANDROID OR APPLE )
+OCV_OPTION(BUILD_OPENEXR            "Build openexr from source"      WIN32 OR ANDROID OR APPLE OR CARMA )
 # OpenCV installation options
@@ -778,6 +779,7 @@ if(HAVE_CUDA)
  status("    Use CUFFT:"            HAVE_CUFFT   THEN YES ELSE NO)
  status("    Use CUBLAS:"           HAVE_CUBLAS  THEN YES ELSE NO)
+  status("    USE NVCUVID:"          HAVE_NVCUVID THEN YES ELSE NO)
  status("    NVIDIA GPU arch:"      ${OPENCV_CUDA_ARCH_BIN})
  status("    NVIDIA PTX archs:"     ${OPENCV_CUDA_ARCH_PTX})
  status("    Use fast math:"        CUDA_FAST_MATH THEN YES ELSE NO)

--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -3,17 +3,17 @@ if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
  return()
 endif()
-if (WIN32 AND NOT MSVC)
+if(WIN32 AND NOT MSVC)
-  message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler suppoted on your platform).")
+  message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler supported on your platform).")
  return()
 endif()
-if (CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-  message(STATUS "CUDA compilation is disabled (due to Clang unsuppoted on your platform).")
+  message(STATUS "CUDA compilation is disabled (due to Clang unsupported on your platform).")
  return()
 endif()
-find_package(CUDA 4.1)
+find_package(CUDA 4.2 QUIET)
 if(CUDA_FOUND)
  set(HAVE_CUDA 1)
@@ -26,16 +26,21 @@ if(CUDA_FOUND)
    set(HAVE_CUBLAS 1)
  endif()
+  if(WITH_NVCUVID)
+    # NVCUVID support was requested: look the library up, then advertise
+    # HAVE_NVCUVID only if the lookup filled in CUDA_nvcuvid_LIBRARY. A
+    # "-NOTFOUND" result evaluates to false in if(), so a missing library
+    # leaves the feature disabled instead of flagging support that has
+    # nothing to link against.
+    find_cuda_helper_libs(nvcuvid)
+    if(CUDA_nvcuvid_LIBRARY)
+      set(HAVE_NVCUVID 1)
+    endif()
+  endif()
  message(STATUS "CUDA detected: " ${CUDA_VERSION})
-  if(${CUDA_VERSION_STRING} VERSION_GREATER "4.1")
+  if (CARMA)
-    set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_BIN "2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
  else()
-    set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
  endif()
-  set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
  string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
  string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")
@@ -78,6 +83,15 @@ if(CUDA_FOUND)
    set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
  endforeach()
+  if(CARMA)
+    # Extra nvcc options for CARMA builds. CUDA_NVCC_FLAGS is handled as a
+    # CMake list elsewhere in this file (one nvcc argument per element), so
+    # the options are appended as list items rather than glued onto a single
+    # space-separated string.
+    list(APPEND CUDA_NVCC_FLAGS --target-cpu-architecture=ARM)
+    if(CMAKE_VERSION VERSION_LESS 2.8.10)
+      # Tell nvcc which host compiler to use.
+      # NOTE(review): presumably FindCUDA from CMake >= 2.8.10 passes the host
+      # compiler on its own - confirm before changing the version bound.
+      list(APPEND CUDA_NVCC_FLAGS "-ccbin=${CMAKE_CXX_COMPILER}")
+    endif()
+  endif()
  # These vars will be processed in other scripts
  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
  set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}")
@@ -92,7 +106,6 @@ if(CUDA_FOUND)
  mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
-  unset(CUDA_npp_LIBRARY CACHE)
  find_cuda_helper_libs(npp)
  macro(ocv_cuda_compile VAR)
@@ -106,15 +119,15 @@ if(CUDA_FOUND)
      string(REPLACE "-ggdb3" "" ${var} "${${var}}")
    endforeach()
-    if (BUILD_SHARED_LIBS)
+    if(BUILD_SHARED_LIBS)
      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -DCVAPI_EXPORTS)
    endif()
    if(UNIX OR APPLE)
-      set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
+      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
    endif()
    if(APPLE)
-      set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
+      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
    endif()
    # disabled because of multiple warnings during building nvcc auto generated files

--- a/cmake/templates/OpenCVConfig.cmake.in
+++ b/cmake/templates/OpenCVConfig.cmake.in
@@ -44,6 +44,7 @@ set(OpenCV_COMPUTE_CAPABILITIES @OpenCV_CUDA_CC_CONFIGCMAKE@)
 set(OpenCV_CUDA_VERSION @OpenCV_CUDA_VERSION@)
 set(OpenCV_USE_CUBLAS   @HAVE_CUBLAS@)
 set(OpenCV_USE_CUFFT    @HAVE_CUFFT@)
+set(OpenCV_USE_NVCUVID  @HAVE_NVCUVID@)
 # Android API level from which OpenCV has been compiled is remembered
 set(OpenCV_ANDROID_NATIVE_API_LEVEL @OpenCV_ANDROID_NATIVE_API_LEVEL_CONFIGCMAKE@)
@@ -218,17 +219,22 @@ foreach(__opttype OPT DBG)
  else()
    #TODO: duplicates are annoying but they should not be the problem
  endif()
-  # fix hard coded paths for CUDA libraries under Windows
-  if(WIN32 AND OpenCV_CUDA_VERSION AND NOT OpenCV_SHARED)
+  # CUDA
+  if(OpenCV_CUDA_VERSION AND (CARMA OR (WIN32 AND NOT OpenCV_SHARED)))
    if(NOT CUDA_FOUND)
      find_package(CUDA ${OpenCV_CUDA_VERSION} EXACT REQUIRED)
    else()
      if(NOT CUDA_VERSION_STRING VERSION_EQUAL OpenCV_CUDA_VERSION)
-        message(FATAL_ERROR "OpenCV static library compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
+        if(WIN32)
+          message(FATAL_ERROR "OpenCV static library was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
+        else()
+          message(FATAL_ERROR "OpenCV library for CARMA was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
+        endif()
      endif()
    endif()
-    list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_nvcuvid_LIBRARY} ${CUDA_nvcuvenc_LIBRARY})
+    list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
    if(OpenCV_USE_CUBLAS)
      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUBLAS_LIBRARIES})
@@ -238,6 +244,13 @@ foreach(__opttype OPT DBG)
      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUFFT_LIBRARIES})
    endif()
+    # Optional NVIDIA video libraries. The helper-library lookups used by this
+    # project store their result in CUDA_<name>_LIBRARY (singular), so that is
+    # the variable to append; the *_LIBRARIES spelling is never set and would
+    # expand to nothing.
+    if(OpenCV_USE_NVCUVID)
+      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvid_LIBRARY})
+    endif()
+    # The video encoder library is only linked on Windows.
+    if(WIN32)
+      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvenc_LIBRARY})
+    endif()
  endif()
 endforeach()

--- a/cmake/templates/cvconfig.h.cmake
+++ b/cmake/templates/cvconfig.h.cmake
@@ -175,21 +175,15 @@
 /* NVidia Cuda Runtime API*/
 #cmakedefine HAVE_CUDA
-/* OpenCL Support */
-#cmakedefine HAVE_OPENCL
-/* AMD's OpenCL Fast Fourier Transform Library*/
-#cmakedefine HAVE_CLAMDFFT
-/* AMD's Basic Linear Algebra Subprograms Library*/
-#cmakedefine HAVE_CLAMDBLAS
 /* NVidia Cuda Fast Fourier Transform (FFT) API*/
 #cmakedefine HAVE_CUFFT
 /* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
 #cmakedefine HAVE_CUBLAS
+/* NVidia Video Decoding API*/
+#cmakedefine HAVE_NVCUVID
 /* Compile for 'real' NVIDIA GPU architectures */
 #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"
@@ -202,6 +196,15 @@
 /* Create PTX or BIN for 1.0 compute capability */
 #cmakedefine CUDA_ARCH_BIN_OR_PTX_10
+/* OpenCL Support */
+#cmakedefine HAVE_OPENCL
+/* AMD's OpenCL Fast Fourier Transform Library*/
+#cmakedefine HAVE_CLAMDFFT
+/* AMD's Basic Linear Algebra Subprograms Library*/
+#cmakedefine HAVE_CLAMDBLAS
 /* VideoInput library */
 #cmakedefine HAVE_VIDEOINPUT

--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -10,7 +10,6 @@ if(HAVE_CUDA)
  file(GLOB lib_cuda "src/cuda/*.cu")
  ocv_cuda_compile(cuda_objs ${lib_cuda})
  set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 else()
  set(lib_cuda "")

--- a/modules/core/include/opencv2/core/cuda_devptrs.hpp
+++ b/modules/core/include/opencv2/core/cuda_devptrs.hpp
@@ -177,6 +177,20 @@ namespace cv
 //#undef __CV_GPU_DEPR_BEFORE__
 //#undef __CV_GPU_DEPR_AFTER__
+        namespace device
+        {
+            using cv::gpu::PtrSz;
+            using cv::gpu::PtrStep;
+            using cv::gpu::PtrStepSz;
+            using cv::gpu::PtrStepSzb;
+            using cv::gpu::PtrStepSzf;
+            using cv::gpu::PtrStepSzi;
+            using cv::gpu::PtrStepb;
+            using cv::gpu::PtrStepf;
+            using cv::gpu::PtrStepi;
+        }
    }
 }

--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -79,6 +79,8 @@ namespace cv { namespace gpu
        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
    };
+    CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
    // Gives information about what GPU archs this OpenCV GPU module was
    // compiled for
    class CV_EXPORTS TargetArchs

--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@@ -44,6 +44,7 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/type_traits.hpp"
 namespace cv { namespace gpu { namespace device
 {
@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
    void writeScalar(const int*);
    void writeScalar(const float*);
    void writeScalar(const double*);
+    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
    void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
 }}}
@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
    //////////////////////////////// ConvertTo ////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////
-    template <typename T, typename D> struct Convertor : unary_function<T, D>
+    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
    {
-        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
+        Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
-        __device__ __forceinline__ D operator()(const T& src) const
+        __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
        {
            return saturate_cast<D>(alpha * src + beta);
        }
-        double alpha, beta;
+        S alpha, beta;
    };
    namespace detail
@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
        };
    }
-    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+    template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
    {
    };
-    template<typename T, typename D>
+    template<typename T, typename D, typename S>
    void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
    {
        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-        Convertor<T, D> op(alpha, beta);
+        Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
        cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
    }
@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
    {
        typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
-        static const caller_t tab[8][8] =
+        static const caller_t tab[7][7] =
        {
-            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
+            {
-            cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
+                cvt_<uchar, uchar, float>,
+                cvt_<uchar, schar, float>,
-            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
+                cvt_<uchar, ushort, float>,
-            cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
+                cvt_<uchar, short, float>,
+                cvt_<uchar, int, float>,
-            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
+                cvt_<uchar, float, float>,
-            cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
+                cvt_<uchar, double, double>
+            },
-            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
+            {
-            cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
+                cvt_<schar, uchar, float>,
+                cvt_<schar, schar, float>,
-            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
+                cvt_<schar, ushort, float>,
-            cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
+                cvt_<schar, short, float>,
+                cvt_<schar, int, float>,
-            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
+                cvt_<schar, float, float>,
-            cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
+                cvt_<schar, double, double>
+            },
-            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
+            {
-            cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
+                cvt_<ushort, uchar, float>,
+                cvt_<ushort, schar, float>,
-            {0,0,0,0,0,0,0,0}
+                cvt_<ushort, ushort, float>,
+                cvt_<ushort, short, float>,
+                cvt_<ushort, int, float>,
+                cvt_<ushort, float, float>,
+                cvt_<ushort, double, double>
+            },
+            {
+                cvt_<short, uchar, float>,
+                cvt_<short, schar, float>,
+                cvt_<short, ushort, float>,
+                cvt_<short, short, float>,
+                cvt_<short, int, float>,
+                cvt_<short, float, float>,
+                cvt_<short, double, double>
+            },
+            {
+                cvt_<int, uchar, float>,
+                cvt_<int, schar, float>,
+                cvt_<int, ushort, float>,
+                cvt_<int, short, float>,
+                cvt_<int, int, double>,
+                cvt_<int, float, double>,
+                cvt_<int, double, double>
+            },
+            {
+                cvt_<float, uchar, float>,
+                cvt_<float, schar, float>,
+                cvt_<float, ushort, float>,
+                cvt_<float, short, float>,
+                cvt_<float, int, float>,
+                cvt_<float, float, float>,
+                cvt_<float, double, double>
+            },
+            {
+                cvt_<double, uchar, double>,
+                cvt_<double, schar, double>,
+                cvt_<double, ushort, double>,
+                cvt_<double, short, double>,
+                cvt_<double, int, double>,
+                cvt_<double, float, double>,
+                cvt_<double, double, double>
+            }
        };
        caller_t func = tab[sdepth][ddepth];
-        if (!func)
-            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
        func(src, dst, alpha, beta, stream);
    }

--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 set(the_description "GPU-accelerated Computer Vision")
 ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_photo opencv_legacy)
-ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda" "${CMAKE_CURRENT_SOURCE_DIR}/../highgui/src")
+ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
 file(GLOB lib_hdrs               "include/opencv2/${name}/*.hpp"               "include/opencv2/${name}/*.h")
 file(GLOB lib_device_hdrs        "include/opencv2/${name}/device/*.hpp"        "include/opencv2/${name}/device/*.h")
@@ -22,17 +22,14 @@ source_group("Device" FILES ${lib_device_hdrs})
 source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
 if (HAVE_CUDA)
-  file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
+  file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*")
  file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
-  file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
+  set(ncv_files ${ncv_srcs} ${ncv_cuda})
-  set(ncv_files ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
  source_group("Src\\NVidia" FILES ${ncv_files})
  ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS})
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations /wd4211 /wd4201 /wd4100 /wd4505 /wd4408)
  string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
  #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
  if(MSVC)
@@ -47,23 +44,18 @@ if (HAVE_CUDA)
  ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})
-  #CUDA_BUILD_CLEAN_TARGET()
  set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
-  if(NOT APPLE)
+  # Gate on the detection result (HAVE_NVCUVID) rather than the request
+  # option, so CUDA_nvcuvid_LIBRARY is only linked when NVCUVID support was
+  # actually enabled during CUDA detection.
+  if(HAVE_NVCUVID)
-    unset(CUDA_nvcuvid_LIBRARY CACHE)
-    find_cuda_helper_libs(nvcuvid)
    set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
  endif()
  if(WIN32)
-    unset(CUDA_nvcuvenc_LIBRARY CACHE)
    find_cuda_helper_libs(nvcuvenc)
    set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
  endif()
-  if(NOT APPLE AND WITH_FFMPEG)
+  if(WITH_FFMPEG)
    set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES})
  endif()
 else()

--- a/modules/gpu/doc/matrix_reductions.rst
+++ b/modules/gpu/doc/matrix_reductions.rst
--- a/modules/gpu/src/opencv2/gpu/device/block.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/block.hpp
--- a/modules/gpu/include/opencv2/gpu/device/color.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/color.hpp
@@ -216,6 +216,86 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
    #undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)
+    #undef OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
+    OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)
+    #undef OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)
+    #undef OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
+    OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
+    #undef OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS
 }}} // namespace cv { namespace gpu { namespace device
 #endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
--- a/modules/gpu/include/opencv2/gpu/device/common.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/common.hpp
@@ -85,8 +85,6 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
        cv::gpu::error(cudaGetErrorString(err), file, line, func);
 }
-#ifdef __CUDACC__
 namespace cv { namespace gpu
 {
    __host__ __device__ __forceinline__ int divUp(int total, int grain)
@@ -96,19 +94,25 @@ namespace cv { namespace gpu
    namespace device
    {
+        using cv::gpu::divUp;
+#ifdef __CUDACC__
        typedef unsigned char uchar;
        typedef unsigned short ushort;
        typedef signed char schar;
+        #ifdef _WIN32
            typedef unsigned int uint;
+        #endif
        template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
        {
            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
        }
+#endif // __CUDACC__
    }
 }}
-#endif // __CUDACC__
 #endif // __OPENCV_GPU_COMMON_HPP__
--- a/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp
--- a/modules/gpu/include/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/emulation.hpp
@@ -44,7 +44,6 @@
 #define OPENCV_GPU_EMULATION_HPP_
 #include "warp_reduce.hpp"
-#include <stdio.h>
 namespace cv { namespace gpu { namespace device
 {

--- a/modules/gpu/include/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp
@@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
    template <> struct name<type> : binary_function<type, type, type> \
    { \
        __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
-        __device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\
+        __device__ __forceinline__ name() {}\
-        __device__ __forceinline__ name():binary_function<type, type, type>(){}\
+        __device__ __forceinline__ name(const name&) {}\
    };
    template <typename T> struct maximum : binary_function<T, T, T>
    {
        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
        {
-            return lhs < rhs ? rhs : lhs;
+            return max(lhs, rhs);
        }
-        __device__ __forceinline__ maximum(const maximum& other):binary_function<T, T, T>(){}
+        __device__ __forceinline__ maximum() {}
-        __device__ __forceinline__ maximum():binary_function<T, T, T>(){}
+        __device__ __forceinline__ maximum(const maximum&) {}
    };
    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
@@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device
    {
        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
        {
-            return lhs < rhs ? lhs : rhs;
+            return min(lhs, rhs);
        }
-        __device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){}
+        __device__ __forceinline__ minimum() {}
-        __device__ __forceinline__ minimum():binary_function<T, T, T>(){}
+        __device__ __forceinline__ minimum(const minimum&) {}
    };
    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
    // Math functions
 ///bound=========================================
+    // Generic absolute-value functor: forwards its argument to an unqualified abs() call.
+    template <typename T> struct abs_func : unary_function<T, T>
+    {
+        // Empty default and copy constructors; the struct declares no data members of its own.
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        // Returns abs(value) converted to T.
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType value) const
+        {
+            return abs(value);
+        }
+    };
+    // abs_func for unsigned char: the argument is returned unchanged.
+    template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ unsigned char operator ()(unsigned char value) const
+        {
+            return value;
+        }
+    };
+    // abs_func for signed char: the argument is widened to int before calling ::abs.
+    template <> struct abs_func<signed char> : unary_function<signed char, signed char>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ signed char operator ()(signed char value) const
+        {
+            const int widened = (int)value;
+            return ::abs(widened);
+        }
+    };
+    // abs_func for plain char: the argument is widened to int before calling ::abs.
+    template <> struct abs_func<char> : unary_function<char, char>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ char operator ()(char value) const
+        {
+            const int widened = (int)value;
+            return ::abs(widened);
+        }
+    };
+    // abs_func for unsigned short: the argument is returned unchanged.
+    template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ unsigned short operator ()(unsigned short value) const
+        {
+            return value;
+        }
+    };
+    // abs_func for short: the argument is widened to int before calling ::abs.
+    template <> struct abs_func<short> : unary_function<short, short>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ short operator ()(short value) const
+        {
+            const int widened = (int)value;
+            return ::abs(widened);
+        }
+    };
+    // abs_func for unsigned int: the argument is returned unchanged.
+    template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ unsigned int operator ()(unsigned int value) const
+        {
+            return value;
+        }
+    };
+    // abs_func for int: calls the global-namespace ::abs on the argument.
+    template <> struct abs_func<int> : unary_function<int, int>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ int operator ()(int value) const
+        {
+            return ::abs(value);
+        }
+    };
+    // abs_func for float: calls the single-precision ::fabsf on the argument.
+    template <> struct abs_func<float> : unary_function<float, float>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ float operator ()(float value) const
+        {
+            return ::fabsf(value);
+        }
+    };
+    // abs_func for double: calls the double-precision ::fabs on the argument.
+    template <> struct abs_func<double> : unary_function<double, double>
+    {
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __device__ __forceinline__ double operator ()(double value) const
+        {
+            return ::fabs(value);
+        }
+    };
 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
    template <typename T> struct name ## _func : unary_function<T, float> \
    { \
@@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
        { \
            return func ## f(v); \
        } \
+        __device__ __forceinline__ name ## _func() {} \
+        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    }; \
    template <> struct name ## _func<double> : unary_function<double, double> \
    { \
@@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
        { \
            return func(v); \
        } \
+        __device__ __forceinline__ name ## _func() {} \
+        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    };
 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
        } \
    };
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)

--- a/modules/gpu/include/opencv2/gpu/device/reduce.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/reduce.hpp
--- a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp
@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
    {
-        return (uchar) ::max((int)v, 0);
+        uint res = 0;
+        int vi = v;
+        asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
    {
-        return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
    {
-        return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
+        uint res = 0;
+        asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
    {
-        return (uchar) ::min(v, (uint)UCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
    }
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
-    {
-        return saturate_cast<uchar>((uint)v);
-    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
    {
-        int iv = __float2int_rn(v);
+        uint res = 0;
-        return saturate_cast<uchar>(iv);
+        asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
    {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+    #if __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
+        uint res = 0;
-        return saturate_cast<uchar>(iv);
+        asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
    #else
        return saturate_cast<uchar>((float)v);
    #endif
@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
    {
-        return (schar) ::min((int)v, SCHAR_MAX);
+        uint res = 0;
+        uint vi = v;
+        asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
    }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
    {
-        return (schar) ::min((uint)v, (uint)SCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
    }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
    {
-        return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
+        uint res = 0;
+        asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
    }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
    {
-        return saturate_cast<schar>((int)v);
+        uint res = 0;
+        asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
    {
-        return (schar) ::min(v, (uint)SCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
    {
-        int iv = __float2int_rn(v);
+        uint res = 0;
-        return saturate_cast<schar>(iv);
+        asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
    {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+    #if __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
+        uint res = 0;
-        return saturate_cast<schar>(iv);
+        asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
    #else
        return saturate_cast<schar>((float)v);
    #endif
@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
    {
-        return (ushort) ::max((int)v, 0);
+        ushort res = 0;
+        int vi = v;
+        asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
    {
-        return (ushort) ::max((int)v, 0);
+        ushort res = 0;
+        asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
    {
-        return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
+        ushort res = 0;
+        asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
    {
-        return (ushort) ::min(v, (uint)USHRT_MAX);
+        ushort res = 0;
+        asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
    {
-        int iv = __float2int_rn(v);
+        ushort res = 0;
-        return saturate_cast<ushort>(iv);
+        asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
    {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+    #if __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
+        ushort res = 0;
-        return saturate_cast<ushort>(iv);
+        asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
    #else
        return saturate_cast<ushort>((float)v);
    #endif
@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
    {
-        return (short) ::min((int)v, SHRT_MAX);
+        short res = 0;
+        asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(int v)
    {
-        return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
+        short res = 0;
+        asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
    {
-        return (short) ::min(v, (uint)SHRT_MAX);
+        short res = 0;
+        asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(float v)
    {
-        int iv = __float2int_rn(v);
+        short res = 0;
-        return saturate_cast<short>(iv);
+        asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(double v)
    {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+    #if __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
+        short res = 0;
-        return saturate_cast<short>(iv);
+        asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
    #else
        return saturate_cast<short>((float)v);
    #endif
    }
+    template<> __device__ __forceinline__ int saturate_cast<int>(uint v) // uint -> int via the PTX cvt instruction with the .sat modifier
+    {
+        int res = 0; // bound as the asm output operand %0
+        asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v)); // %1 = v; NOTE(review): .sat presumably clamps to the s32 range - confirm against the PTX ISA
+        return res;
+    }
    template<> __device__ __forceinline__ int saturate_cast<int>(float v)
    {
        return __float2int_rn(v);
@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
    #endif
    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v) // schar -> uint via the PTX cvt instruction with the .sat modifier
+    {
+        uint res = 0; // bound as the asm output operand %0
+        int vi = v; // copy into an int, which is what the "r" input operand below binds
+        asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi)); // %1 = vi; NOTE(review): .sat presumably clamps negative inputs to 0 - confirm against the PTX ISA
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(short v) // short -> uint via the PTX cvt instruction with the .sat modifier
+    {
+        uint res = 0; // bound as the asm output operand %0
+        asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v)); // %1 = v, bound with the "h" constraint; NOTE(review): .sat presumably clamps negative inputs to 0 - confirm against the PTX ISA
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(int v) // int -> uint via the PTX cvt instruction with the .sat modifier
+    {
+        uint res = 0; // bound as the asm output operand %0
+        asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v)); // %1 = v; NOTE(review): .sat presumably clamps negative inputs to 0 - confirm against the PTX ISA
+        return res;
+    }
    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
    {
        return __float2uint_rn(v);

--- a/modules/gpu/include/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp
@@ -45,7 +45,6 @@
 #include "saturate_cast.hpp"
 #include "datamov_utils.hpp"
-#include "detail/reduction_detail.hpp"
 namespace cv { namespace gpu { namespace device
 {
@@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
        }
    };
-    ///////////////////////////////////////////////////////////////////////////////
-    // Reduction
-    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
-    {
-        StaticAssert<n >= 8 && n <= 512>::check();
-        utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
-    }
-    template <int n, typename T, typename V, typename Pred>
-    __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
-    {
-        StaticAssert<n >= 8 && n <= 512>::check();
-        utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
-    }
-    template <int n, typename T, typename V1, typename V2, typename Pred>
-    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
-    {
-        StaticAssert<n >= 8 && n <= 512>::check();
-        utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
-    }
    ///////////////////////////////////////////////////////////////////////////////
    // Solve linear system

--- a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp
@@ -43,7 +43,7 @@
 #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
 #define __OPENCV_GPU_VEC_DISTANCE_HPP__
-#include "utility.hpp"
+#include "reduce.hpp"
 #include "functional.hpp"
 #include "detail/vec_distance_detail.hpp"
@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
        }
        __device__ __forceinline__ operator int() const
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
        }
        __device__ __forceinline__ operator float() const
@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
        }
        __device__ __forceinline__ operator float() const
@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
        }
        __device__ __forceinline__ operator int() const

--- a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
@@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \

--- a/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
+#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
+namespace cv { namespace gpu { namespace device
+{
+    // Thin wrappers around the __shfl / __shfl_down / __shfl_up intrinsics.
+    // Each wrapper calls the intrinsic only when __CUDA_ARCH__ >= 300; in every
+    // other compilation pass it returns a zero / value-initialized result.
+    // shfl: generic version, forwards val unchanged to __shfl.
+    template <typename T>
+    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const T shuffled = __shfl(val, srcLane, width);
+        return shuffled;
+    #else
+        return T();
+    #endif
+    }
+    // shfl: unsigned int version, shuffles the value cast to int and casts the result back.
+    __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const int shuffled = __shfl((int) val, srcLane, width);
+        return (unsigned int) shuffled;
+    #else
+        return 0;
+    #endif
+    }
+    // shfl: double version, shuffles the low and high int halves separately and recombines them.
+    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const int srcLo = __double2loint(val);
+        const int srcHi = __double2hiint(val);
+        const int dstLo = __shfl(srcLo, srcLane, width);
+        const int dstHi = __shfl(srcHi, srcLane, width);
+        return __hiloint2double(dstHi, dstLo);
+    #else
+        return 0.0;
+    #endif
+    }
+    // shfl_down: generic version, forwards val unchanged to __shfl_down.
+    template <typename T>
+    __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const T shuffled = __shfl_down(val, delta, width);
+        return shuffled;
+    #else
+        return T();
+    #endif
+    }
+    // shfl_down: unsigned int version, shuffles the value cast to int and casts the result back.
+    __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const int shuffled = __shfl_down((int) val, delta, width);
+        return (unsigned int) shuffled;
+    #else
+        return 0;
+    #endif
+    }
+    // shfl_down: double version, shuffles the low and high int halves separately and recombines them.
+    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const int srcLo = __double2loint(val);
+        const int srcHi = __double2hiint(val);
+        const int dstLo = __shfl_down(srcLo, delta, width);
+        const int dstHi = __shfl_down(srcHi, delta, width);
+        return __hiloint2double(dstHi, dstLo);
+    #else
+        return 0.0;
+    #endif
+    }
+    // shfl_up: generic version, forwards val unchanged to __shfl_up.
+    template <typename T>
+    __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const T shuffled = __shfl_up(val, delta, width);
+        return shuffled;
+    #else
+        return T();
+    #endif
+    }
+    // shfl_up: unsigned int version, shuffles the value cast to int and casts the result back.
+    __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const int shuffled = __shfl_up((int) val, delta, width);
+        return (unsigned int) shuffled;
+    #else
+        return 0;
+    #endif
+    }
+    // shfl_up: double version, shuffles the low and high int halves separately and recombines them.
+    __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const int srcLo = __double2loint(val);
+        const int srcHi = __double2hiint(val);
+        const int dstLo = __shfl_up(srcLo, delta, width);
+        const int dstHi = __shfl_up(srcHi, delta, width);
+        return __hiloint2double(dstHi, dstLo);
+    #else
+        return 0.0;
+    #endif
+    }
+}}}
+#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
--- a/modules/gpu/misc/carma.toolchain.cmake
+++ b/modules/gpu/misc/carma.toolchain.cmake
+# CMake toolchain file for cross-compiling to the CARMA target (ARM Linux)
+# with the arm-linux-gnueabi GCC 4.5 cross compilers.
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_VERSION 1)
+set(CMAKE_SYSTEM_PROCESSOR arm)
+set(CMAKE_C_COMPILER    arm-linux-gnueabi-gcc-4.5)
+set(CMAKE_CXX_COMPILER  arm-linux-gnueabi-g++-4.5)
+# Suppress the psABI compiler warning.
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" )
+set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -Wno-psabi" )
+# Root of the ARM cross environment; it can be located in other places.
+set(__arm_linux_eabi_root /usr/arm-linux-gnueabi)
+list(APPEND CMAKE_FIND_ROOT_PATH "${__arm_linux_eabi_root}")
+# Quote the expansion so that an empty CUDA_TOOLKIT_ROOT_DIR, or one containing
+# spaces or semicolons, is still passed to EXISTS as exactly one argument.
+if(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}")
+    list(APPEND CMAKE_FIND_ROOT_PATH "${CUDA_TOOLKIT_ROOT_DIR}")
+endif()
+# Restrict find_path/find_library/find_program lookups to CMAKE_FIND_ROOT_PATH.
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+# Expose the CARMA configuration both to CMake code (variable) and to the
+# compiled sources (preprocessor definition).
+set(CARMA 1)
+add_definitions(-DCARMA)
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
    SANITY_CHECK(found_locations);
 }
 ///////////////////////////////////////////////////////////////
 // HaarClassifier

--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -622,7 +622,7 @@ private:
        }
        // copy data structures on gpu
-        stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) ));
+        stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
        trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
        nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
        leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));

--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
--- a/modules/gpu/src/cuda/column_filter.0.cu
+++ b/modules/gpu/src/cuda/column_filter.0.cu
--- a/modules/gpu/src/cuda/column_filter.1.cu
+++ b/modules/gpu/src/cuda/column_filter.1.cu
--- a/modules/gpu/src/cuda/column_filter.10.cu
+++ b/modules/gpu/src/cuda/column_filter.10.cu
--- a/modules/gpu/src/cuda/column_filter.11.cu
+++ b/modules/gpu/src/cuda/column_filter.11.cu
--- a/modules/gpu/src/cuda/column_filter.12.cu
+++ b/modules/gpu/src/cuda/column_filter.12.cu
--- a/modules/gpu/src/cuda/column_filter.13.cu
+++ b/modules/gpu/src/cuda/column_filter.13.cu
--- a/modules/gpu/src/cuda/column_filter.14.cu
+++ b/modules/gpu/src/cuda/column_filter.14.cu
--- a/modules/gpu/src/cuda/column_filter.2.cu
+++ b/modules/gpu/src/cuda/column_filter.2.cu
--- a/modules/gpu/src/cuda/column_filter.3.cu
+++ b/modules/gpu/src/cuda/column_filter.3.cu
--- a/modules/gpu/src/cuda/column_filter.4.cu
+++ b/modules/gpu/src/cuda/column_filter.4.cu
--- a/modules/gpu/src/cuda/column_filter.5.cu
+++ b/modules/gpu/src/cuda/column_filter.5.cu
--- a/modules/gpu/src/cuda/column_filter.6.cu
+++ b/modules/gpu/src/cuda/column_filter.6.cu
--- a/modules/gpu/src/cuda/column_filter.7.cu
+++ b/modules/gpu/src/cuda/column_filter.7.cu
--- a/modules/gpu/src/cuda/column_filter.8.cu
+++ b/modules/gpu/src/cuda/column_filter.8.cu
--- a/modules/gpu/src/cuda/column_filter.9.cu
+++ b/modules/gpu/src/cuda/column_filter.9.cu
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
--- a/modules/gpu/src/cuda/column_filter.h
+++ b/modules/gpu/src/cuda/column_filter.h
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/fgd_bgfg.cu
+++ b/modules/gpu/src/cuda/fgd_bgfg.cu
--- a/modules/gpu/src/cuda/fgd_bgfg_common.hpp
+++ b/modules/gpu/src/cuda/fgd_bgfg_common.hpp
--- a/modules/gpu/src/cuda/gftt.cu
+++ b/modules/gpu/src/cuda/gftt.cu
--- a/modules/gpu/src/cuda/global_motion.cu
+++ b/modules/gpu/src/cuda/global_motion.cu
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
--- a/modules/gpu/src/cuda/nlm.cu
+++ b/modules/gpu/src/cuda/nlm.cu
--- a/modules/gpu/src/cuda/optical_flow.cu
+++ b/modules/gpu/src/cuda/optical_flow.cu
--- a/modules/gpu/src/cuda/optical_flow_farneback.cu
+++ b/modules/gpu/src/cuda/optical_flow_farneback.cu
--- a/modules/gpu/src/cuda/orb.cu
+++ b/modules/gpu/src/cuda/orb.cu
--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
--- a/modules/gpu/src/cuda/row_filter.0.cu
+++ b/modules/gpu/src/cuda/row_filter.0.cu
--- a/modules/gpu/src/cuda/row_filter.1.cu
+++ b/modules/gpu/src/cuda/row_filter.1.cu
--- a/modules/gpu/src/cuda/row_filter.10.cu
+++ b/modules/gpu/src/cuda/row_filter.10.cu
--- a/modules/gpu/src/cuda/row_filter.11.cu
+++ b/modules/gpu/src/cuda/row_filter.11.cu
--- a/modules/gpu/src/cuda/row_filter.12.cu
+++ b/modules/gpu/src/cuda/row_filter.12.cu
--- a/modules/gpu/src/cuda/row_filter.13.cu
+++ b/modules/gpu/src/cuda/row_filter.13.cu
--- a/modules/gpu/src/cuda/row_filter.14.cu
+++ b/modules/gpu/src/cuda/row_filter.14.cu
--- a/modules/gpu/src/cuda/row_filter.2.cu
+++ b/modules/gpu/src/cuda/row_filter.2.cu
--- a/modules/gpu/src/cuda/row_filter.3.cu
+++ b/modules/gpu/src/cuda/row_filter.3.cu
--- a/modules/gpu/src/cuda/row_filter.4.cu
+++ b/modules/gpu/src/cuda/row_filter.4.cu
--- a/modules/gpu/src/cuda/row_filter.5.cu
+++ b/modules/gpu/src/cuda/row_filter.5.cu
--- a/modules/gpu/src/cuda/row_filter.6.cu
+++ b/modules/gpu/src/cuda/row_filter.6.cu
--- a/modules/gpu/src/cuda/row_filter.7.cu
+++ b/modules/gpu/src/cuda/row_filter.7.cu
--- a/modules/gpu/src/cuda/row_filter.8.cu
+++ b/modules/gpu/src/cuda/row_filter.8.cu
--- a/modules/gpu/src/cuda/row_filter.9.cu
+++ b/modules/gpu/src/cuda/row_filter.9.cu
--- a/modules/gpu/src/cuda/row_filter.cu
+++ b/modules/gpu/src/cuda/row_filter.cu
--- a/modules/gpu/src/cuda/row_filter.h
+++ b/modules/gpu/src/cuda/row_filter.h
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
--- a/modules/gpu/src/cuda/texture_binder.hpp
+++ b/modules/gpu/src/cuda/texture_binder.hpp
--- a/modules/gpu/src/cuda/warp.cu
+++ b/modules/gpu/src/cuda/warp.cu
--- a/modules/gpu/src/cuvid_video_source.cpp
+++ b/modules/gpu/src/cuvid_video_source.cpp
--- a/modules/gpu/src/cuvid_video_source.h
+++ b/modules/gpu/src/cuvid_video_source.h
--- a/modules/gpu/src/cvt_colot_internal.h
+++ b/modules/gpu/src/cvt_colot_internal.h
--- a/modules/gpu/src/denoising.cpp
+++ b/modules/gpu/src/denoising.cpp
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
--- a/modules/gpu/src/fast.cpp
+++ b/modules/gpu/src/fast.cpp
--- a/modules/gpu/src/ffmpeg_video_source.cpp
+++ b/modules/gpu/src/ffmpeg_video_source.cpp
--- a/modules/gpu/src/ffmpeg_video_source.h
+++ b/modules/gpu/src/ffmpeg_video_source.h
--- a/modules/gpu/src/fgd_bgfg.cpp
+++ b/modules/gpu/src/fgd_bgfg.cpp
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
--- a/modules/gpu/src/frame_queue.cpp
+++ b/modules/gpu/src/frame_queue.cpp
--- a/modules/gpu/src/frame_queue.h
+++ b/modules/gpu/src/frame_queue.h
--- a/modules/gpu/src/gftt.cpp
+++ b/modules/gpu/src/gftt.cpp
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
--- a/modules/gpu/src/nvidia/core/NCV.cu
+++ b/modules/gpu/src/nvidia/core/NCV.cu
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
--- a/modules/gpu/src/optical_flow_farneback.cpp
+++ b/modules/gpu/src/optical_flow_farneback.cpp
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
--- a/modules/gpu/src/pyrlk.cpp
+++ b/modules/gpu/src/pyrlk.cpp
--- a/modules/gpu/src/remap.cpp
+++ b/modules/gpu/src/remap.cpp
--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
--- a/modules/gpu/src/thread_wrappers.cpp
+++ b/modules/gpu/src/thread_wrappers.cpp
--- a/modules/gpu/src/thread_wrappers.h
+++ b/modules/gpu/src/thread_wrappers.h
--- a/modules/gpu/src/video_decoder.cpp
+++ b/modules/gpu/src/video_decoder.cpp
--- a/modules/gpu/src/video_decoder.h
+++ b/modules/gpu/src/video_decoder.h
--- a/modules/gpu/src/video_parser.cpp
+++ b/modules/gpu/src/video_parser.cpp
--- a/modules/gpu/src/video_parser.h
+++ b/modules/gpu/src/video_parser.h
--- a/modules/gpu/src/video_reader.cpp
+++ b/modules/gpu/src/video_reader.cpp
--- a/modules/gpu/src/video_writer.cpp
+++ b/modules/gpu/src/video_writer.cpp
--- a/modules/gpu/src/warp.cpp
+++ b/modules/gpu/src/warp.cpp
--- a/modules/gpu/test/interpolation.hpp
+++ b/modules/gpu/test/interpolation.hpp
--- a/modules/gpu/test/nvidia/TestHaarCascadeApplication.cpp
+++ b/modules/gpu/test/nvidia/TestHaarCascadeApplication.cpp
--- a/modules/gpu/test/nvidia/main_nvidia.cpp
+++ b/modules/gpu/test/nvidia/main_nvidia.cpp
--- a/modules/gpu/test/test_bgfg.cpp
+++ b/modules/gpu/test/test_bgfg.cpp
--- a/modules/gpu/test/test_calib3d.cpp
+++ b/modules/gpu/test/test_calib3d.cpp
--- a/modules/gpu/test/test_color.cpp
+++ b/modules/gpu/test/test_color.cpp
--- a/modules/gpu/test/test_copy_make_border.cpp
+++ b/modules/gpu/test/test_copy_make_border.cpp
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
--- a/modules/gpu/test/test_denoising.cpp
+++ b/modules/gpu/test/test_denoising.cpp
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
--- a/modules/gpu/test/test_global_motion.cpp
+++ b/modules/gpu/test/test_global_motion.cpp
--- a/modules/gpu/test/test_gpumat.cpp
+++ b/modules/gpu/test/test_gpumat.cpp
--- a/modules/gpu/test/test_hough.cpp
+++ b/modules/gpu/test/test_hough.cpp
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
--- a/modules/gpu/test/test_labeling.cpp
+++ b/modules/gpu/test/test_labeling.cpp
--- a/modules/gpu/test/test_nvidia.cpp
+++ b/modules/gpu/test/test_nvidia.cpp
--- a/modules/gpu/test/test_objdetect.cpp
+++ b/modules/gpu/test/test_objdetect.cpp
--- a/modules/gpu/test/test_optflow.cpp
+++ b/modules/gpu/test/test_optflow.cpp
--- a/modules/gpu/test/test_precomp.hpp
+++ b/modules/gpu/test/test_precomp.hpp
--- a/modules/gpu/test/test_pyramids.cpp
+++ b/modules/gpu/test/test_pyramids.cpp
--- a/modules/gpu/test/test_remap.cpp
+++ b/modules/gpu/test/test_remap.cpp
--- a/modules/gpu/test/test_resize.cpp
+++ b/modules/gpu/test/test_resize.cpp
--- a/modules/gpu/test/test_threshold.cpp
+++ b/modules/gpu/test/test_threshold.cpp
--- a/modules/gpu/test/test_video.cpp
+++ b/modules/gpu/test/test_video.cpp
--- a/modules/gpu/test/test_warp_affine.cpp
+++ b/modules/gpu/test/test_warp_affine.cpp
--- a/modules/gpu/test/test_warp_perspective.cpp
+++ b/modules/gpu/test/test_warp_perspective.cpp
--- a/modules/gpu/test/utility.cpp
+++ b/modules/gpu/test/utility.cpp
--- a/modules/gpu/test/utility.hpp
+++ b/modules/gpu/test/utility.hpp
--- a/modules/ts/include/opencv2/ts/ts.hpp
+++ b/modules/ts/include/opencv2/ts/ts.hpp
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
--- a/samples/gpu/driver_api_stereo_multi.cpp
+++ b/samples/gpu/driver_api_stereo_multi.cpp
--- a/samples/gpu/opticalflow_nvidia_api.cpp
+++ b/samples/gpu/opticalflow_nvidia_api.cpp
--- a/samples/gpu/stereo_multi.cpp
+++ b/samples/gpu/stereo_multi.cpp