Commit 21233656 authored by Roman Donchenko, committed by OpenCV Buildbot

Merge pull request #1540 from jet47:gpuarithm-cudev

parents e290436a 8ed47c01
@@ -216,7 +216,7 @@ namespace
template <typename T>
void copyWithMask(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream)
{
gridTransform_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
gridTransformUnary_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
}
}
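The hunks in this file are a pure rename: the unary overloads of gridTransform/gridTransform_ become gridTransformUnary/gridTransformUnary_, freeing the shorter name for the binary variants introduced elsewhere in this PR. A minimal sketch of the renamed masked call, assuming opencv_cudev is available, the file is compiled by nvcc, and the default-policy overload mirrors the policy one shown above (helper name hypothetical):

```cpp
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Masked element-wise copy: dst(y, x) = src(y, x) wherever mask(y, x) != 0.
// identity<T> is the unary functor; the mask pointer gates the writes.
template <typename T>
void copyWithMaskSketch(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst,
                        const cv::cuda::GpuMat& mask, cv::cuda::Stream& stream)
{
    gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), identity<T>(),
                       globPtr<uchar>(mask), stream);
}
```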
@@ -268,14 +268,14 @@ namespace
void setToWithOutMask(const GpuMat& mat, Scalar _scalar, Stream& stream)
{
Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
}
template <typename T>
void setToWithMask(const GpuMat& mat, const GpuMat& mask, Scalar _scalar, Stream& stream)
{
Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
}
}
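setTo reuses the same primitive by pairing it with a virtual source: constantPtr(value, rows, cols) allocates nothing and simply yields value at every coordinate, so "fill" becomes "copy from a constant matrix". A sketch for a single-channel float fill, under the same cudev assumptions (helper name hypothetical):

```cpp
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Fill a CV_32FC1 GpuMat with 3.5f without allocating a source matrix:
// constantPtr acts as a read-only "matrix" of identical elements.
void fillSketch(cv::cuda::GpuMat& mat, cv::cuda::Stream& stream)
{
    gridTransformUnary(constantPtr(3.5f, mat.rows, mat.cols),
                       globPtr<float>(mat), identity<float>(), stream);
}
```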
@@ -382,7 +382,7 @@ namespace
typedef typename LargerType<src_elem_type, float>::type larger_elem_type;
typedef typename LargerType<float, dst_elem_type>::type scalar_type;
gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
}
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
@@ -408,7 +408,7 @@ namespace
op.alpha = cv::saturate_cast<scalar_type>(alpha);
op.beta = cv::saturate_cast<scalar_type>(beta);
gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
}
}
......
@@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Operations on Matrices")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev)
ocv_module_include_directories()
ocv_glob_module_sources()
......
@@ -248,60 +248,3 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
@@ -373,7 +373,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F);
CUDA_SANITY_CHECK(dst);
}
@@ -381,7 +381,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
{
cv::Mat dst;
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp);
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F);
CPU_SANITY_CHECK(dst);
}
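The explicit CV_32F output depth keeps the two timed paths comparable: with the default dtype, cv::reduce keeps the source depth, so a REDUCE_SUM over CV_8U data saturates at 255 and the sanity check would compare saturated against unsaturated results. A small CPU-side illustration with hypothetical values (function name hypothetical):

```cpp
#include <opencv2/core.hpp>

void reduceDepthExample()
{
    const cv::Mat src = (cv::Mat_<uchar>(1, 3) << 200, 200, 200);

    cv::Mat saturated, exact;
    cv::reduce(src, saturated, 1, cv::REDUCE_SUM);          // dst stays CV_8U: clamps at 255
    cv::reduce(src, exact,     1, cv::REDUCE_SUM, CV_32F);  // accumulates in float: 600
}
```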
@@ -465,3 +465,60 @@ PERF_TEST_P(Sz, MeanStdDev,
SANITY_CHECK(cpu_stddev);
}
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
@@ -292,95 +292,6 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void) flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, dst, StreamAccessor::getStream(stream));
#endif
}
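The dispatch itself is unchanged: conjB still selects between the plain and conjugate-multiply kernels through a two-entry function-pointer table. Typical usage, for example in frequency-domain correlation, looks like the sketch below (assumes a cuFFT-enabled build and CV_32FC2 spectra produced by cv::cuda::dft; helper name hypothetical):

```cpp
#include <opencv2/cudaarithm.hpp>

// Multiply spectrum A by conj(spectrum B), which is what correlation needs.
// flags is currently unused by the CUDA implementation (see the (void) flags above).
void correlateSpectra(const cv::cuda::GpuMat& specA,   // CV_32FC2
                      const cv::cuda::GpuMat& specB,   // CV_32FC2, same size
                      cv::cuda::GpuMat& specC,
                      cv::cuda::Stream& stream)
{
    cv::cuda::mulSpectrums(specA, specB, specC, 0 /* flags */, true /* conjB */, stream);
}
```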
//////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) scale;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void)flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// dft
......
This diff is collapsed.
@@ -40,43 +40,22 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
{
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
__host__ __device__ __forceinline__ VAbsDiff4() {}
__host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
};
#include "opencv2/cudev.hpp"
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
using namespace cv::cudev;
__host__ __device__ __forceinline__ VAbsDiff2() {}
__host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
};
void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
namespace
{
__device__ __forceinline__ int _abs(int a)
{
return ::abs(a);
@@ -90,58 +69,120 @@ namespace arithm
return ::fabs(a);
}
template <typename T> struct AbsDiffMat : binary_function<T, T, T>
template <typename T> struct AbsDiffOp1 : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(T a, T b) const
{
return saturate_cast<T>(_abs(a - b));
}
__host__ __device__ __forceinline__ AbsDiffMat() {}
__host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename T>
void absDiffMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<T>(dst), AbsDiffOp1<T>(), stream);
}
struct AbsDiffOp2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
};
}}}
namespace arithm
{
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
void absDiffMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
const int vcols = src1.cols >> 1;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, AbsDiffOp2(), stream);
}
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
struct AbsDiffOp4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
};
void absDiffMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
const int vcols = src1.cols >> 2;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, AbsDiffOp4(), stream);
}
}
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
absDiffMat_v1<uchar>,
absDiffMat_v1<schar>,
absDiffMat_v1<ushort>,
absDiffMat_v1<short>,
absDiffMat_v1<int>,
absDiffMat_v1<float>,
absDiffMat_v1<double>
};
const int depth = src1.depth();
CV_DbgAssert( depth <= CV_64F );
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
if (depth == CV_8U || depth == CV_16U)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
absDiffMat_v4(src1_, src2_, dst_, stream);
return;
}
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
absDiffMat_v2(src1_, src2_, dst_, stream);
return;
}
}
}
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
const func_t func = funcs[depth];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src1_, src2_, dst_, stream);
}
#endif // CUDA_DISABLER
#endif
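The dispatcher above is where the SIMD trick lives: when all three base pointers are 32-byte aligned and the single-channel width divides evenly, the matrices are re-viewed as packed uint rows. globPtr((uint*) data, step, rows, vcols) keeps the original byte step but a quarter (or half) of the logical width, so each thread applies the vabsdiff4/vabsdiff2 per-byte (per-halfword) intrinsics to one packed word. A sketch of the gating predicate for the 4-byte case, mirroring the checks above (helper name hypothetical; GpuMat rows come from cudaMallocPitch, so row starts inherit the base alignment):

```cpp
#include <opencv2/core/cuda.hpp>
#include <cstdint>

// True if a row of N uchars can be processed as N/4 uints: every base
// pointer 32-byte aligned and the single-channel width a multiple of 4.
static bool canVectorize4(const cv::cuda::GpuMat& a, const cv::cuda::GpuMat& b,
                          const cv::cuda::GpuMat& c)
{
    const intptr_t pa = reinterpret_cast<intptr_t>(a.data);
    const intptr_t pb = reinterpret_cast<intptr_t>(b.data);
    const intptr_t pc = reinterpret_cast<intptr_t>(c.data);
    return ((pa | pb | pc) & 31) == 0 && (a.cols & 3) == 0;
}
```

The scalar absDiffMat_v1 path stays fully general; the v2/v4 paths are opportunistic fast paths taken only when this predicate holds.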
@@ -40,59 +40,71 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
namespace
{
template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
template <typename T, typename S> struct AbsDiffScalarOp : unary_function<T, T>
{
S val;
__host__ explicit AbsDiffScalar(S val_) : val(val_) {}
__device__ __forceinline__ T operator ()(T a) const
{
abs_func<S> f;
return saturate_cast<T>(f(a - val));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
namespace arithm
template <typename SrcType, typename ScalarDepth>
void absDiffScalarImpl(const GpuMat& src, double value, GpuMat& dst, Stream& stream)
{
AbsDiffScalarOp<SrcType, ScalarDepth> op;
op.val = static_cast<ScalarDepth>(value);
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<SrcType>(dst), op, stream);
}
}
void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
{
template <typename T, typename S>
void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
typedef void (*func_t)(const GpuMat& src, double val, GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
AbsDiffScalar<T, S> op(static_cast<S>(val));
absDiffScalarImpl<uchar, float>,
absDiffScalarImpl<schar, float>,
absDiffScalarImpl<ushort, float>,
absDiffScalarImpl<short, float>,
absDiffScalarImpl<int, float>,
absDiffScalarImpl<float, float>,
absDiffScalarImpl<double, double>
};
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
}
const int depth = src.depth();
CV_DbgAssert( depth <= CV_64F );
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
funcs[depth](src, val[0], dst, stream);
}
#endif // CUDA_DISABLER
#endif
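At the API surface this lands behind cv::cuda::absdiff with a scalar second operand; the funcs[] table above routes integer depths through a float intermediate and keeps CV_64F in double before saturating back. A usage sketch, assuming the OpenCV 3.x cudaarithm entry point, which accepts a cv::Scalar for either operand (helper name hypothetical):

```cpp
#include <opencv2/cudaarithm.hpp>

// Per-pixel |src - 7| on the GPU; integer depths compute in float internally.
void absDiffScalarUsage(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst,
                        cv::cuda::Stream& stream)
{
    cv::cuda::absdiff(src, cv::Scalar::all(7), dst, stream);
}
```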
This diff is collapsed.
@@ -40,65 +40,132 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace cv { namespace cuda { namespace device
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv::cudev;
void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
namespace
{
template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <template <typename> class Op, typename T>
void bitScalarOp(const GpuMat& src, uint value, GpuMat& dst, Stream& stream)
{
};
gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), value), stream);
}
typedef void (*bit_scalar_func_t)(const GpuMat& src, uint value, GpuMat& dst, Stream& stream);
template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename T, bit_scalar_func_t func> struct BitScalar
{
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
func(src, cv::saturate_cast<T>(value[0]), dst, stream);
}
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <bit_scalar_func_t func> struct BitScalar4
{
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
uint packedVal = 0;
packedVal |= cv::saturate_cast<uchar>(value[0]);
packedVal |= cv::saturate_cast<uchar>(value[1]) << 8;
packedVal |= cv::saturate_cast<uchar>(value[2]) << 16;
packedVal |= cv::saturate_cast<uchar>(value[3]) << 24;
func(src, packedVal, dst, stream);
}
};
}}}
namespace arithm
{
template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
template <int DEPTH, int cn> struct NppBitwiseCFunc
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
}
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
};
template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
}
typedef typename NppBitwiseCFunc<DEPTH, cn>::npp_type npp_type;
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& _stream)
{
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize oSizeROI;
oSizeROI.width = src.cols;
oSizeROI.height = src.rows;
const npp_type pConstants[] =
{
cv::saturate_cast<npp_type>(value[0]),
cv::saturate_cast<npp_type>(value[1]),
cv::saturate_cast<npp_type>(value[2]),
cv::saturate_cast<npp_type>(value[3])
};
nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
};
}
void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
{
(void) mask;
template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
static const func_t funcs[3][6][4] =
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
}
{
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
}
};
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
const int depth = src.depth();
const int cn = src.channels();
template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( cn == 1 || cn == 3 || cn == 4 );
CV_DbgAssert( mask.empty() );
CV_DbgAssert( op >= 0 && op < 3 );
template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
funcs[op][depth][cn - 1](src, value, dst, stream);
}
#endif // CUDA_DISABLER
#endif
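BitScalar4 is the interesting case: for CV_8UC4 the four channel values are packed little-endian into one uint, so the per-pixel mask is applied with a single 32-bit bitwise operation instead of four byte operations. The packing itself is plain shift-and-or arithmetic (helper name hypothetical):

```cpp
#include <cstdint>

// Packing used by BitScalar4 above: four 8-bit channel values become one
// 32-bit word (little-endian byte order).
static uint32_t packScalar4(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3)
{
    return uint32_t(c0) | (uint32_t(c1) << 8) | (uint32_t(c2) << 16) | (uint32_t(c3) << 24);
}

// Example: an AND mask keeping only channels 0 and 2 of each pixel:
// packScalar4(0xFF, 0x00, 0xFF, 0x00) == 0x00FF00FF
```

The NPP entries in the table cover the multi-channel cases the packed path cannot (CV_16U and CV_32S with 3 or 4 channels); the null entries mark the unsupported 2-channel column.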
This diff is collapsed.
@@ -40,137 +40,57 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#ifndef HAVE_OPENCV_CUDEV
using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"
namespace countNonZero
{
__device__ unsigned int blocks_finished = 0;
template <int BLOCK_SIZE, typename T>
__global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
{
__shared__ unsigned int scount[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int mycount = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
const T srcVal = ptr[x];
mycount += (srcVal != 0);
}
}
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#if __CUDA_ARCH__ >= 200
if (tid == 0)
::atomicAdd(count, mycount);
#else
__shared__ bool is_last;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
if (tid == 0)
{
count[bid] = mycount;
__threadfence();
unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
#else
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
if (tid == 0)
{
count[0] = mycount;
using namespace cv::cudev;
blocks_finished = 0;
}
}
#endif
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
namespace
{
template <typename T>
int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
gridCountNonZero(src, buf);
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
return data;
}
}
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
{
typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf);
static const func_t funcs[] =
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
countNonZeroImpl<uchar>,
countNonZeroImpl<schar>,
countNonZeroImpl<ushort>,
countNonZeroImpl<short>,
countNonZeroImpl<int>,
countNonZeroImpl<float>,
countNonZeroImpl<double>
};
unsigned int* count_buf = buf.ptr(0);
GpuMat src = _src.getGpuMat();
cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
CV_Assert( src.channels() == 1 );
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count;
}
const func_t func = funcs[src.depth()];
template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
return func(src, buf);
}
#endif // CUDA_DISABLER
#endif
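The hand-rolled two-pass block reduction, including the pre-Fermi blocks_finished ticket scheme, collapses into a single gridCountNonZero call; the result lands in a 1x1 CV_32S device buffer and is copied back through a wrapping cv::Mat header, exactly as countNonZeroImpl does above. A condensed sketch of the new path for one depth (helper name hypothetical):

```cpp
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Count non-zero elements of a single-channel CV_8U matrix with the cudev
// grid primitive; buf is a 1x1 CV_32S device buffer holding the result.
int countNonZeroSketch(const GpuMat_<uchar>& src, GpuMat_<int>& buf)
{
    gridCountNonZero(src, buf);

    int result = 0;
    buf.download(cv::Mat(1, 1, buf.type(), &result));  // copy the scalar back to host
    return result;
}
```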
This diff is collapsed.
@@ -52,13 +52,6 @@
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy.hpp"
# include "opencv2/cudalegacy/private.hpp"
#endif
#ifdef HAVE_CUBLAS
# include <cublas.h>
#endif
......
This diff is collapsed.
@@ -4,7 +4,7 @@ endif()
set(the_description "CUDA device layer")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)
ocv_add_module(cudev)
......
@@ -73,7 +73,7 @@
#include "cudev/block/vec_distance.hpp"
#include "cudev/grid/copy.hpp"
#include "cudev/grid/glob_reduce.hpp"
#include "cudev/grid/reduce.hpp"
#include "cudev/grid/histogram.hpp"
#include "cudev/grid/integral.hpp"
#include "cudev/grid/pyramids.hpp"
......
This diff is collapsed.