Commit 26691e00 authored by Vladislav Vinogradov

fixed gpu core tests (added additional check for device's feature support)

added assertion on double types for old devices
parent 98d7b10c
@@ -69,16 +69,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 {
 #ifndef HAVE_CUBLAS
-    OPENCV_GPU_UNUSED(src1);
-    OPENCV_GPU_UNUSED(src2);
-    OPENCV_GPU_UNUSED(alpha);
-    OPENCV_GPU_UNUSED(src3);
-    OPENCV_GPU_UNUSED(beta);
-    OPENCV_GPU_UNUSED(dst);
-    OPENCV_GPU_UNUSED(flags);
-    OPENCV_GPU_UNUSED(stream);
-    throw_nogpu();
+    CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
 #else
@@ -87,6 +78,12 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
     CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));

+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
     bool tr1 = (flags & GEMM_1_T) != 0;
     bool tr2 = (flags & GEMM_2_T) != 0;
     bool tr3 = (flags & GEMM_3_T) != 0;
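[Editor's note] This guard is the recurring change of the commit, and both halves matter: TargetArchs::builtWith(NATIVE_DOUBLE) asks whether the library binary was compiled with code for double-capable architectures, while DeviceInfo().supports(NATIVE_DOUBLE) asks whether the device present at runtime can actually execute it (native double arithmetic requires compute capability 1.3 or newer). A minimal sketch of the pattern factored into a standalone helper; the helper name is hypothetical and not part of the commit:

    #include <opencv2/gpu/gpu.hpp>

    // Hypothetical helper (not in the commit): fail fast when a CV_64F matrix
    // reaches a build or a device without native double support, instead of
    // letting a kernel silently produce garbage on pre-1.3 hardware.
    static void requireNativeDouble(int depth)
    {
        using namespace cv::gpu;

        if (depth == CV_64F &&
            (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)))
        {
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
        }
    }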
@@ -230,6 +227,9 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
     }
     else // if (src.elemSize() == 8)
     {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+
         NppStStreamHandler h(stream);

         NcvSize32u sz;
@@ -290,7 +290,6 @@ namespace
 void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
-
     static const func_t funcs[6][4] =
     {
         {NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
@@ -403,12 +402,12 @@ namespace
 void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    ::npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
+    npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
 }

 void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    ::npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
+    npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
 }

 ////////////////////////////////////////////////////////////////////////
@@ -429,7 +428,7 @@ namespace
     {
         using namespace ::cv::gpu::device::mathfunc;

-        CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
+        CV_Assert(x.size() == y.size() && x.type() == y.type());
         CV_Assert(x.depth() == CV_32F);

         if (mag)
@@ -449,7 +448,7 @@ namespace
     {
         using namespace ::cv::gpu::device::mathfunc;

-        CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
+        CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
         CV_Assert(mag.depth() == CV_32F);

         x.create(mag.size(), mag.type());
...
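[Editor's note] Worth spelling out for the two hunks above: CV_DbgAssert compiles to a no-op outside debug builds, so these size/type consistency checks previously ran only in debug configurations. Promoting them to CV_Assert makes them unconditional, in line with the commit's theme of failing loudly on unsupported input. Simplified from OpenCV's core headers (an illustration, not the verbatim definition):

    #ifdef _DEBUG
        #define CV_DbgAssert(expr) CV_Assert(expr)   // active in debug builds only
    #else
        #define CV_DbgAssert(expr)                   // compiled out in release
    #endif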
@@ -1096,18 +1096,18 @@ namespace cv { namespace gpu { namespace device
         enum { smart_shift = 4 };
     };

-    template <typename T> void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)
+    template <typename T> void absdiff_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
     {
         cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, Absdiff<T>(), WithOutMask(), stream);
     }

-    template void absdiff_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    //template void absdiff_gpu<uchar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<schar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    //template void absdiff_gpu<ushort>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<short >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<int   >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    //template void absdiff_gpu<float >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<double>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);

     template <typename T> struct AbsdiffScalar : unary_function<T, T>
     {
@@ -1140,20 +1140,20 @@ namespace cv { namespace gpu { namespace device
         enum { smart_shift = 4 };
     };

-    template <typename T> void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream)
+    template <typename T> void absdiff_gpu(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream)
     {
         cudaSafeCall( cudaSetDoubleForDevice(&val) );
         AbsdiffScalar<T> op(val);
         cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, WithOutMask(), stream);
     }

-    //template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    //template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<int   >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    //template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<double>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
+    //template void absdiff_gpu<uchar >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<schar >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    //template void absdiff_gpu<ushort>(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<short >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<int   >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    //template void absdiff_gpu<float >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<double>(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);

     //////////////////////////////////////////////////////////////////////////////////////
     // Compare
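[Editor's note] A pattern to notice throughout this file: the exported kernels now take DevMem2Db by value rather than by const reference. DevMem2Db is a small plain-old-data view (pointer, step, size; it does not own the memory), so copying it is cheap, and with the element type erased to unsigned char every instantiation shares one signature. Roughly, as a sketch from memory of the OpenCV 2.x type (field order may differ from the real header):

    // Sketch of the view type the signatures pass around (not the exact header):
    template <typename T> struct DevMem2D_
    {
        int cols;      // width in elements
        int rows;      // height
        T* data;       // device pointer (not owned)
        size_t step;   // stride in bytes between rows
    };
    typedef DevMem2D_<unsigned char> DevMem2Db;  // byte-typed alias used for type erasure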
@@ -1587,60 +1587,60 @@ namespace cv { namespace gpu { namespace device
     };

     template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void min_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform(src1, src2, dst, minimum<T>(), WithOutMask(), stream);
+        cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minimum<T>(), WithOutMask(), stream);
     }

-    template void min_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
-    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void min_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
-    template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+    template void min_gpu<uchar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<schar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<ushort>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<short >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<int   >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<float >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<double>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);

     template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void max_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform(src1, src2, dst, maximum<T>(), WithOutMask(), stream);
+        cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, maximum<T>(), WithOutMask(), stream);
     }

-    template void max_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
-    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void max_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
-    template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+    template void max_gpu<uchar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<schar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<ushort>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<short >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<int   >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<float >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<double>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);

     template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void min_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform(src1, dst, device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
+        cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, device::bind2nd(minimum<T>(), val), WithOutMask(), stream);
     }

-    template void min_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
-    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void min_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
-    template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+    template void min_gpu<uchar >(const DevMem2Db src, uchar val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<schar >(const DevMem2Db src, schar val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<ushort>(const DevMem2Db src, ushort val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<short >(const DevMem2Db src, short val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<int   >(const DevMem2Db src, int val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<float >(const DevMem2Db src, float val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<double>(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream);

     template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void max_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform(src1, dst, device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
+        cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, device::bind2nd(maximum<T>(), val), WithOutMask(), stream);
     }

-    template void max_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
-    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void max_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
-    template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+    template void max_gpu<uchar >(const DevMem2Db src, uchar val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<schar >(const DevMem2Db src, schar val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<ushort>(const DevMem2Db src, ushort val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<short >(const DevMem2Db src, short val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<int   >(const DevMem2Db src, int val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<float >(const DevMem2Db src, float val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<double>(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream);

     //////////////////////////////////////////////////////////////////////////
     // threshold
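[Editor's note] The signature change above is what enables the host-side refactor later in this commit: once every instantiation takes untyped DevMem2Db views, all of them share one function-pointer type and can sit in a single depth-indexed dispatch table. A self-contained sketch of the idea in isolation (names are illustrative, not from the commit):

    #include <cstdio>

    // Type-erased signature: every instantiation has the same function type.
    typedef void (*func_t)(const void* src, void* dst);

    template <typename T> void copy_one(const void* src, void* dst)
    {
        *static_cast<T*>(dst) = *static_cast<const T*>(src);  // cast back inside
    }

    int main()
    {
        static const func_t funcs[] = { copy_one<int>, copy_one<float> };

        int si = 42, di = 0;
        funcs[0](&si, &di);       // a depth-style index selects the instantiation
        std::printf("%d\n", di);  // prints 42
        return 0;
    }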
@@ -1805,19 +1805,64 @@ namespace cv { namespace gpu { namespace device
     //////////////////////////////////////////////////////////////////////////
     // addWeighted

-    template <typename T1, typename T2, typename D> struct AddWeighted : binary_function<T1, T2, D>
-    {
-        __host__ __device__ __forceinline__ AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
-
-        __device__ __forceinline__ D operator ()(typename TypeTraits<T1>::ParameterType a, typename TypeTraits<T2>::ParameterType b) const
-        {
-            return saturate_cast<D>(alpha * a + beta * b + gamma);
-        }
-
-        const double alpha;
-        const double beta;
-        const double gamma;
-    };
+    namespace detail
+    {
+        template <typename T> struct UseDouble
+        {
+            enum {value = 0};
+        };
+        template <> struct UseDouble<int>
+        {
+            enum {value = 1};
+        };
+        template <> struct UseDouble<float>
+        {
+            enum {value = 1};
+        };
+        template <> struct UseDouble<double>
+        {
+            enum {value = 1};
+        };
+    }
+    template <typename T1, typename T2, typename D> struct UseDouble
+    {
+        enum {value = (detail::UseDouble<T1>::value || detail::UseDouble<T2>::value || detail::UseDouble<D>::value)};
+    };
+
+    namespace detail
+    {
+        template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted;
+        template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, false> : binary_function<T1, T2, D>
+        {
+            AddWeighted(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
+
+            __device__ __forceinline__ D operator ()(T1 a, T2 b) const
+            {
+                return saturate_cast<D>(a * alpha + b * beta + gamma);
+            }
+
+            const float alpha;
+            const float beta;
+            const float gamma;
+        };
+        template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, true> : binary_function<T1, T2, D>
+        {
+            AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
+
+            __device__ __forceinline__ D operator ()(T1 a, T2 b) const
+            {
+                return saturate_cast<D>(a * alpha + b * beta + gamma);
+            }
+
+            const double alpha;
+            const double beta;
+            const double gamma;
+        };
+    }
+    template <typename T1, typename T2, typename D> struct AddWeighted : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>
+    {
+        AddWeighted(double alpha_, double beta_, double gamma_) : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
+    };

     template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >
     {
@@ -1877,10 +1922,13 @@ namespace cv { namespace gpu { namespace device
     template <typename T1, typename T2, typename D>
     void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream)
     {
-        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
-        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-        cudaSafeCall( cudaSetDoubleForDevice(&gamma) );
+        if (UseDouble<T1, T2, D>::value)
+        {
+            cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
+            cudaSafeCall( cudaSetDoubleForDevice(&beta) );
+            cudaSafeCall( cudaSetDoubleForDevice(&gamma) );
+        }

         AddWeighted<T1, T2, D> op(alpha, beta, gamma);
...
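[Editor's note] Two details in the addWeighted hunks are easy to miss. First, the UseDouble trait decides at compile time whether the functor stores float or double weights: for 8- and 16-bit images the weights fit comfortably in float, so double arithmetic (and the native-double requirement) is avoided entirely. Second, cudaSetDoubleForDevice is the old CUDA runtime hook that demotes a host double to a value representable on hardware without native doubles; guarding it with UseDouble means it now runs only when the functor really keeps double weights. A self-contained compile-time check of how the trait logic resolves (C++11 static_assert and the UsesDouble name are purely illustrative; the commit itself targets C++03):

    // Minimal re-statement of the detail::UseDouble logic so the asserts compile standalone:
    template <typename T> struct UsesDouble         { enum { value = 0 }; };
    template <>           struct UsesDouble<int>    { enum { value = 1 }; };
    template <>           struct UsesDouble<float>  { enum { value = 1 }; };
    template <>           struct UsesDouble<double> { enum { value = 1 }; };

    static_assert(!UsesDouble<unsigned char>::value, "8-bit inputs keep float weights");
    static_assert( UsesDouble<int>::value,           "int operands promote the weights to double");

    int main() { return 0; }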
@@ -950,90 +950,62 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
 namespace cv { namespace gpu { namespace device
 {
     template <typename T>
-    void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    void absdiff_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);

     template <typename T>
-    void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
+    void absdiff_gpu(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream);
 }}}
-void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
-{
-    using namespace ::cv::gpu::device;
-
-    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-
-    static const func_t funcs[] =
-    {
-        absdiff_gpu<unsigned char>, absdiff_gpu<signed char>, absdiff_gpu<unsigned short>, absdiff_gpu<short>, absdiff_gpu<int>, absdiff_gpu<float>, absdiff_gpu<double>
-    };
-
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-
-    dst.create( src1.size(), src1.type() );
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppiSize sz;
-    sz.width  = src1.cols * src1.channels();
-    sz.height = src1.rows;
-
-    if (src1.depth() == CV_8U)
-    {
-        NppStreamHandler h(stream);
-
-        nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else if (src1.depth() == CV_16U)
-    {
-        NppStreamHandler h(stream);
-
-        nppSafeCall( nppiAbsDiff_16u_C1R(src1.ptr<Npp16u>(), static_cast<int>(src1.step), src2.ptr<Npp16u>(), static_cast<int>(src2.step),
-            dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else if (src1.depth() == CV_32F)
-    {
-        NppStreamHandler h(stream);
-
-        nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step),
-            dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else
-    {
-        const func_t func = funcs[src1.depth()];
-        CV_Assert(func != 0);
-
-        func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
-    }
-}
-
 namespace
 {
+    template <int DEPTH> struct NppAbsDiffFunc
+    {
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        typedef NppStatus (*func_t)(const npp_t* src1, int src1_step, const npp_t* src2, int src2_step, npp_t* dst, int dst_step, NppiSize sz);
+    };
+
+    template <int DEPTH, typename NppAbsDiffFunc<DEPTH>::func_t func> struct NppAbsDiff
+    {
+        typedef typename NppAbsDiffFunc<DEPTH>::npp_t npp_t;
+
+        static void call(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
+        {
+            NppStreamHandler h(stream);
+
+            NppiSize sz;
+            sz.width  = src1.cols;
+            sz.height = src1.rows;
+
+            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
+                (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+
     template <int DEPTH> struct NppAbsDiffCFunc
     {
         typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+        typedef npp_t scalar_t;

         typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, npp_t nConstant);
     };
     template <> struct NppAbsDiffCFunc<CV_16U>
     {
+        typedef NppTypeTraits<CV_16U>::npp_t npp_t;
+        typedef Npp32u scalar_t;
+
         typedef NppStatus (*func_t)(const Npp16u* pSrc1, int nSrc1Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, Npp32u nConstant);
     };

     template <int DEPTH, typename NppAbsDiffCFunc<DEPTH>::func_t func> struct NppAbsDiffC
     {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+        typedef typename NppAbsDiffCFunc<DEPTH>::npp_t npp_t;
+        typedef typename NppAbsDiffCFunc<DEPTH>::scalar_t scalar_t;

-        static void call(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream)
+        static void call(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream)
         {
             NppStreamHandler h(stream);
@@ -1041,8 +1013,8 @@ namespace
             sz.width  = src1.cols;
             sz.height = src1.rows;

-            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (npp_t*)dst.data, static_cast<int>(dst.step),
-                sz, static_cast<npp_t>(val)) );
+            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step),
+                (npp_t*)dst.data, static_cast<int>(dst.step), sz, static_cast<scalar_t>(val)) );

             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
@@ -1050,12 +1022,41 @@ namespace
     };
 }

-void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s)
+void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
     using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
+    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        NppAbsDiff<CV_8U, nppiAbsDiff_8u_C1R>::call,
+        absdiff_gpu<signed char>,
+        NppAbsDiff<CV_16U, nppiAbsDiff_16u_C1R>::call,
+        absdiff_gpu<short>,
+        absdiff_gpu<int>,
+        NppAbsDiff<CV_32F, nppiAbsDiff_32f_C1R>::call,
+        absdiff_gpu<double>
+    };
+
+    CV_Assert(src1.depth() <= CV_64F);
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src1.size(), src1.type());
+
+    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& stream)
+{
+    using namespace cv::gpu::device;
+
+    typedef void (*func_t)(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
         NppAbsDiffC<CV_8U, nppiAbsDiffC_8u_C1R>::call,
@@ -1067,13 +1068,18 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
         absdiff_gpu<double>
     };

+    CV_Assert(src1.depth() <= CV_64F);
     CV_Assert(src1.channels() == 1);

-    dst.create(src1.size(), src1.type());
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }

-    cudaStream_t stream = StreamAccessor::getStream(s);
+    dst.create(src1.size(), src1.type());

-    funcs[src1.depth()](src1, src2.val[0], dst, stream);
+    funcs[src1.depth()](src1, src2.val[0], dst, StreamAccessor::getStream(stream));
 }

 //////////////////////////////////////////////////////////////////////////////
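[Editor's note] The net effect of the absdiff rewrite: the long if/else chain over NPP calls becomes a flat depth-indexed table in which CV_8U, CV_16U and CV_32F route to NPP primitives through the NppAbsDiff wrapper, and the remaining depths fall through to the custom CUDA kernels. From the caller's side nothing changes; a hedged usage sketch against the OpenCV 2.x gpu API (requires a CUDA device at runtime):

    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat a(480, 640, CV_32FC1, cv::Scalar(1.0f));
        cv::gpu::GpuMat b(480, 640, CV_32FC1, cv::Scalar(3.0f));
        cv::gpu::GpuMat c;

        // Depth CV_32F indexes the table entry NppAbsDiff<CV_32F, nppiAbsDiff_32f_C1R>::call.
        cv::gpu::absdiff(a, b, c);  // c is filled with |1 - 3| = 2
        return 0;
    }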
@@ -1359,34 +1365,38 @@ namespace cv { namespace gpu { namespace device
 void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
 {
-    using namespace ::cv::gpu::device;
+    using namespace cv::gpu::device;

     typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
     static const func_t funcs[7][4] =
     {
-        {compare_eq<unsigned char>, compare_ne<unsigned char>, compare_lt<unsigned char>, compare_le<unsigned char>},
-        {compare_eq<signed char>, compare_ne<signed char>, compare_lt<signed char>, compare_le<signed char>},
-        {compare_eq<unsigned short>, compare_ne<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>},
-        {compare_eq<short>, compare_ne<short>, compare_lt<short>, compare_le<short>},
-        {compare_eq<int>, compare_ne<int>, compare_lt<int>, compare_le<int>},
-        {compare_eq<float>, compare_ne<float>, compare_lt<float>, compare_le<float>},
-        {compare_eq<double>, compare_ne<double>, compare_lt<double>, compare_le<double>}
+        {compare_eq<unsigned char> , compare_ne<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> },
+        {compare_eq<signed char>   , compare_ne<signed char>   , compare_lt<signed char>   , compare_le<signed char>   },
+        {compare_eq<unsigned short>, compare_ne<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>},
+        {compare_eq<short>         , compare_ne<short>         , compare_lt<short>         , compare_le<short>         },
+        {compare_eq<int>           , compare_ne<int>           , compare_lt<int>           , compare_le<int>           },
+        {compare_eq<float>         , compare_ne<float>         , compare_lt<float>         , compare_le<float>         },
+        {compare_eq<double>        , compare_ne<double>        , compare_lt<double>        , compare_le<double>        }
     };

+    CV_Assert(src1.depth() <= CV_64F);
     CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
     CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);

+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
     static const int codes[] =
     {
         0, 2, 3, 2, 3, 1
     };

     const GpuMat* psrc1[] =
     {
         &src1, &src2, &src2, &src1, &src1, &src1
     };

     const GpuMat* psrc2[] =
     {
         &src2, &src1, &src1, &src2, &src2, &src2
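[Editor's note] The codes/psrc tables above implement a small trick: only eq, ne, lt and le kernels exist, and CMP_GT/CMP_GE are folded onto CMP_LT/CMP_LE by swapping the operand pointers. A scalar sketch of the same identity (plain C++, nothing from the diff):

    #include <cassert>

    // Greater-than is less-than with swapped operands, so the GPU side only
    // needs four comparison kernels for six comparison codes.
    static bool lt(int a, int b) { return a < b; }

    int main()
    {
        int src1 = 7, src2 = 3;
        // codes[CMP_GT] selects the lt kernel, and psrc1/psrc2 swap the inputs:
        assert(lt(src2, src1) == (src1 > src2));
        return 0;
    }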
@@ -1415,17 +1425,15 @@ namespace
     {
         dst.create(src.size(), src.type());

-        ::cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
+        cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
     }

     void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
-        using namespace ::cv::gpu::device;
+        using namespace cv::gpu::device;

-        typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-        static Caller callers[] =
+        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+        static func_t funcs[] =
         {
             bitwiseMaskNotCaller<unsigned char>, bitwiseMaskNotCaller<unsigned char>,
             bitwiseMaskNotCaller<unsigned short>, bitwiseMaskNotCaller<unsigned short>,
@@ -1433,19 +1441,19 @@ namespace
             bitwiseMaskNotCaller<unsigned int>
         };

+        CV_Assert(src.depth() <= CV_64F);
         CV_Assert(mask.type() == CV_8U && mask.size() == src.size());

         dst.create(src.size(), src.type());

-        Caller caller = callers[src.depth()];
-        CV_Assert(caller);
+        const func_t func = funcs[src.depth()];

         int cn = src.depth() != CV_64F ? src.channels() : src.channels() * (sizeof(double) / sizeof(unsigned int));
-        caller(src.rows, src.cols, cn, src, mask, dst, stream);
+
+        func(src.rows, src.cols, cn, src, mask, dst, stream);
     }
 }

 void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
 {
     if (mask.empty())
@@ -1454,7 +1462,6 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
         bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));
 }

-
 //////////////////////////////////////////////////////////////////////////////
 // Binary bitwise logical operations
@@ -1481,18 +1488,18 @@ namespace
     void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());

         dst.create(src1.size(), src1.type());

-        ::cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+        cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
     }

     void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
-        using namespace ::cv::gpu::device;
+        using namespace cv::gpu::device;

-        typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-        static Caller callers[] =
+        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+        static func_t funcs[] =
         {
             bitwiseMaskOrCaller<unsigned char>, bitwiseMaskOrCaller<unsigned char>,
             bitwiseMaskOrCaller<unsigned short>, bitwiseMaskOrCaller<unsigned short>,
@@ -1500,33 +1507,35 @@ namespace
             bitwiseMaskOrCaller<unsigned int>
         };

+        CV_Assert(src1.depth() <= CV_64F);
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());

         dst.create(src1.size(), src1.type());

-        Caller caller = callers[src1.depth()];
-        CV_Assert(caller);
+        const func_t func = funcs[src1.depth()];

         int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+
+        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }

     void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());

         dst.create(src1.size(), src1.type());

-        ::cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+        cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
     }

     void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
-        using namespace ::cv::gpu::device;
+        using namespace cv::gpu::device;

-        typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-        static Caller callers[] =
+        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+        static func_t funcs[] =
         {
             bitwiseMaskAndCaller<unsigned char>, bitwiseMaskAndCaller<unsigned char>,
             bitwiseMaskAndCaller<unsigned short>, bitwiseMaskAndCaller<unsigned short>,
@@ -1534,33 +1543,35 @@ namespace
             bitwiseMaskAndCaller<unsigned int>
         };

+        CV_Assert(src1.depth() <= CV_64F);
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());

         dst.create(src1.size(), src1.type());

-        Caller caller = callers[src1.depth()];
-        CV_Assert(caller);
+        const func_t func = funcs[src1.depth()];

         int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+
+        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }

     void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());

         dst.create(src1.size(), src1.type());

-        ::cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+        cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
     }

     void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
-        using namespace ::cv::gpu::device;
+        using namespace cv::gpu::device;

-        typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-        static Caller callers[] =
+        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+        static func_t funcs[] =
         {
             bitwiseMaskXorCaller<unsigned char>, bitwiseMaskXorCaller<unsigned char>,
             bitwiseMaskXorCaller<unsigned short>, bitwiseMaskXorCaller<unsigned short>,
@@ -1568,14 +1579,17 @@ namespace
             bitwiseMaskXorCaller<unsigned int>
         };

+        CV_Assert(src1.depth() <= CV_64F);
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());

         dst.create(src1.size(), src1.type());

-        Caller caller = callers[src1.depth()];
-        CV_Assert(caller);
+        const func_t func = funcs[src1.depth()];

         int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+
+        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }
 }
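[Editor's note] One subtlety shared by all the masked bitwise callers: there are no 64-bit kernels, so the channel count is widened for CV_64F data and each double is processed as two 32-bit lanes. The arithmetic, spelled out (plain C++):

    #include <cstdio>

    int main()
    {
        // For a CV_64FC3 matrix the callers do not dispatch a double kernel;
        // they reinterpret each row as unsigned int lanes instead.
        int channels = 3;
        int cn = channels * (int)(sizeof(double) / sizeof(unsigned int));
        std::printf("cn = %d\n", cn);  // prints 6: three doubles = six 32-bit words
        return 0;
    }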
@@ -1661,10 +1675,9 @@ namespace
 void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-
     static const func_t funcs[5][4] =
     {
-        {NppBitwiseC<CV_8U, 1, nppiOrC_8u_C1R>::call, 0, NppBitwiseC<CV_8U, 3, nppiOrC_8u_C3R>::call, NppBitwiseC<CV_8U, 4, nppiOrC_8u_C4R>::call},
+        {NppBitwiseC<CV_8U , 1, nppiOrC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiOrC_8u_C4R >::call},
         {0,0,0,0},
         {NppBitwiseC<CV_16U, 1, nppiOrC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
         {0,0,0,0},
@@ -1682,10 +1695,9 @@ void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Strea
 void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-
     static const func_t funcs[5][4] =
     {
-        {NppBitwiseC<CV_8U, 1, nppiAndC_8u_C1R>::call, 0, NppBitwiseC<CV_8U, 3, nppiAndC_8u_C3R>::call, NppBitwiseC<CV_8U, 4, nppiAndC_8u_C4R>::call},
+        {NppBitwiseC<CV_8U , 1, nppiAndC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiAndC_8u_C4R >::call},
         {0,0,0,0},
         {NppBitwiseC<CV_16U, 1, nppiAndC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
         {0,0,0,0},
@@ -1703,10 +1715,9 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
 void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-
     static const func_t funcs[5][4] =
     {
-        {NppBitwiseC<CV_8U, 1, nppiXorC_8u_C1R>::call, 0, NppBitwiseC<CV_8U, 3, nppiXorC_8u_C3R>::call, NppBitwiseC<CV_8U, 4, nppiXorC_8u_C4R>::call},
+        {NppBitwiseC<CV_8U , 1, nppiXorC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiXorC_8u_C4R >::call},
         {0,0,0,0},
         {NppBitwiseC<CV_16U, 1, nppiXorC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
         {0,0,0,0},
@@ -1822,107 +1833,140 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
-    template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
-
-    template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
-    template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
+    template <typename T> void min_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void max_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+
+    template <typename T> void min_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void max_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream);
 }}}

-namespace
-{
-    template <typename T>
-    void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
-    {
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        dst.create(src1.size(), src1.type());
-        ::cv::gpu::device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
-    }
-
-    template <typename T>
-    void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
-    {
-        dst.create(src1.size(), src1.type());
-        ::cv::gpu::device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
-    }
-
-    template <typename T>
-    void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
-    {
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        dst.create(src1.size(), src1.type());
-        ::cv::gpu::device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
-    }
-
-    template <typename T>
-    void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
-    {
-        dst.create(src1.size(), src1.type());
-        ::cv::gpu::device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
-    }
-}
-
-void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
-{
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        min_caller<unsigned char>, min_caller<signed char>, min_caller<unsigned short>, min_caller<short>, min_caller<int>,
-        min_caller<float>, min_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
-{
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        min_caller<unsigned char>, min_caller<signed char>, min_caller<unsigned short>, min_caller<short>, min_caller<int>,
-        min_caller<float>, min_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
-{
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        max_caller<unsigned char>, max_caller<signed char>, max_caller<unsigned short>, max_caller<short>, max_caller<int>,
-        max_caller<float>, max_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
-{
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        max_caller<unsigned char>, max_caller<signed char>, max_caller<unsigned short>, max_caller<short>, max_caller<int>,
-        max_caller<float>, max_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
-}
+void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+{
+    using namespace cv::gpu::device;
+
+    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        min_gpu<unsigned char>,
+        min_gpu<signed char>,
+        min_gpu<unsigned short>,
+        min_gpu<short>,
+        min_gpu<int>,
+        min_gpu<float>,
+        min_gpu<double>
+    };
+
+    CV_Assert(src1.depth() <= CV_64F);
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src1.size(), src1.type());
+
+    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+{
+    using namespace cv::gpu::device;
+
+    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        max_gpu<unsigned char>,
+        max_gpu<signed char>,
+        max_gpu<unsigned short>,
+        max_gpu<short>,
+        max_gpu<int>,
+        max_gpu<float>,
+        max_gpu<double>
+    };
+
+    CV_Assert(src1.depth() <= CV_64F);
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src1.size(), src1.type());
+
+    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+}
+
+namespace
+{
+    template <typename T> void minScalar(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream)
+    {
+        cv::gpu::device::min_gpu(src, saturate_cast<T>(val), dst, stream);
+    }
+
+    template <typename T> void maxScalar(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream)
+    {
+        cv::gpu::device::max_gpu(src, saturate_cast<T>(val), dst, stream);
+    }
+}
+
+void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
+{
+    typedef void (*func_t)(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        minScalar<unsigned char>,
+        minScalar<signed char>,
+        minScalar<unsigned short>,
+        minScalar<short>,
+        minScalar<int>,
+        minScalar<float>,
+        minScalar<double>
+    };
+
+    CV_Assert(src.depth() <= CV_64F);
+    CV_Assert(src.channels() == 1);
+
+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src.size(), src.type());
+
+    funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
+{
+    typedef void (*func_t)(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        maxScalar<unsigned char>,
+        maxScalar<signed char>,
+        maxScalar<unsigned short>,
+        maxScalar<short>,
+        maxScalar<int>,
+        maxScalar<float>,
+        maxScalar<double>
+    };
+
+    CV_Assert(src.depth() <= CV_64F);
+    CV_Assert(src.channels() == 1);
+
+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src.size(), src.type());
+
+    funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream));
+}

 ////////////////////////////////////////////////////////////////////////
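[Editor's note] With the hunk above, the per-type min_caller/max_caller trampolines disappear: the public functions index the type-erased min_gpu/max_gpu instantiations directly, the NATIVE_DOUBLE guard replaces the old combined CV_Assert, and the scalar overloads gain an explicit single-channel check. Callers are unaffected; a hedged usage sketch against the OpenCV 2.x gpu API (requires a CUDA device at runtime):

    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat a(64, 64, CV_8UC1, cv::Scalar(10));
        cv::gpu::GpuMat b(64, 64, CV_8UC1, cv::Scalar(20));
        cv::gpu::GpuMat lo, hi, capped;

        cv::gpu::min(a, b, lo);        // element-wise minimum -> all 10
        cv::gpu::max(a, b, hi);        // element-wise maximum -> all 20
        cv::gpu::min(b, 15.0, capped); // scalar overload: val saturate_cast'ed to uchar 15
        return 0;
    }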
@@ -1947,6 +1991,12 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
     CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
     CV_Assert(type <= THRESH_TOZERO_INV);

+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
     dst.create(src.size(), src.type());

     cudaStream_t stream = StreamAccessor::getStream(s);
@@ -1967,9 +2017,8 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
     }
     else
     {
-        typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream);
-
-        static const caller_t callers[] =
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream);
+        static const func_t funcs[] =
         {
             threshold_caller<unsigned char>, threshold_caller<signed char>,
             threshold_caller<unsigned short>, threshold_caller<short>,
...@@ -1982,7 +2031,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double ...@@ -1982,7 +2031,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
maxVal = cvRound(maxVal); maxVal = cvRound(maxVal);
} }
callers[src.depth()](src, dst, thresh, maxVal, type, stream); funcs[src.depth()](src, dst, thresh, maxVal, type, stream);
} }
return thresh; return thresh;
@@ -1993,8 +2042,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
namespace cv { namespace gpu { namespace device
{
    template<typename T> void pow_caller(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
}}}

void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
@@ -2002,7 +2050,6 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
    using namespace cv::gpu::device;

    typedef void (*func_t)(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        pow_caller<unsigned char>, pow_caller<signed char>,
@@ -2010,6 +2057,14 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
        pow_caller<int>, pow_caller<float>, pow_caller<double>
    };

    CV_Assert(src.depth() <= CV_64F);

    if (src.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    dst.create(src.size(), src.type());

    funcs[src.depth()](src.reshape(1), power, dst.reshape(1), StreamAccessor::getStream(stream));
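With the new guard, a CV_64F call fails fast on a pre-1.3 device instead of dispatching to a kernel the hardware cannot run; a caller-side sketch of the resulting contract (the mats here are hypothetical):

    cv::gpu::GpuMat src64(32, 32, CV_64FC1), dst64;
    try
    {
        cv::gpu::pow(src64, 2.0, dst64); // throws on devices without NATIVE_DOUBLE
    }
    catch (const cv::Exception& e)
    {
        CV_Assert(e.code == CV_StsUnsupportedFormat);
    }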
@@ -2075,8 +2130,7 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int
        NppAlphaComp<CV_16U, nppiAlphaComp_16u_AC4R>::call,
        0,
        NppAlphaComp<CV_32S, nppiAlphaComp_32s_AC4R>::call,
        NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call
    };

    CV_Assert(img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4);
@@ -2085,7 +2139,6 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int
    dst.create(img1.size(), img1.type());

    const func_t func = funcs[img1.depth()];

    func(img1, img2, dst, npp_alpha_ops[alpha_op], StreamAccessor::getStream(stream));
}
@@ -2569,6 +2622,14 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
    dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type();

    CV_Assert(src1.depth() <= CV_64F && src2.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);

    if (src1.depth() == CV_64F || src2.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    dst.create(src1.size(), dtype);

    const GpuMat* psrc1 = &src1;
@@ -2581,7 +2642,9 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
    }

    const func_t func = funcs[psrc1->depth()][psrc2->depth()][dst.depth()];

    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");

    func(psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream));
}
...
@@ -148,6 +148,8 @@ double cv::gpu::norm(const GpuMat& src, int normType)
double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
{
    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);

    GpuMat src_single_channel = src.reshape(1);

    if (normType == NORM_L1)
@@ -156,22 +158,16 @@ double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
    if (normType == NORM_L2)
        return std::sqrt(sqrSum(src_single_channel, buf)[0]);

    // NORM_INF
    double min_val, max_val;
    minMax(src_single_channel, &min_val, &max_val, GpuMat(), buf);
    return std::max(std::abs(min_val), std::abs(max_val));
}
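The unconditional fall-through to the NORM_INF branch is safe because the assertion at the top already rejects every other norm type; the identity it relies on, sketched with the CPU API for a hypothetical single-channel mat:

    double minv, maxv;
    cv::minMaxLoc(mat.reshape(1), &minv, &maxv);
    double linf = std::max(std::abs(minv), std::abs(maxv)); // equals cv::norm(mat, cv::NORM_INF) for single-channel data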
double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
    CV_Assert(src1.type() == CV_8UC1);
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);

    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
@@ -239,23 +235,25 @@ Scalar cv::gpu::sum(const GpuMat& src)
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    {
        sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
        sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
        sumMultipassCaller<int>, sumMultipassCaller<float>
    };

    static Caller singlepass_callers[] = {
        sumCaller<unsigned char>, sumCaller<char>,
        sumCaller<unsigned short>, sumCaller<short>,
        sumCaller<int>, sumCaller<float>
    };

    CV_Assert(src.depth() <= CV_32F);

    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
@@ -265,7 +263,6 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
        callers = singlepass_callers;

    Caller caller = callers[src.depth()];

    double result[4];
    caller(src, buf, result, src.channels());
@@ -282,24 +279,26 @@ Scalar cv::gpu::absSum(const GpuMat& src)
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{
    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    {
        absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
        absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
        absSumMultipassCaller<int>, absSumMultipassCaller<float>
    };

    static Caller singlepass_callers[] =
    {
        absSumCaller<unsigned char>, absSumCaller<char>,
        absSumCaller<unsigned short>, absSumCaller<short>,
        absSumCaller<int>, absSumCaller<float>
    };

    CV_Assert(src.depth() <= CV_32F);

    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
@@ -309,7 +308,6 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
        callers = singlepass_callers;

    Caller caller = callers[src.depth()];

    double result[4];
    caller(src, buf, result, src.channels());
@@ -326,24 +324,26 @@ Scalar cv::gpu::sqrSum(const GpuMat& src)
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    {
        sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
        sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
        sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>
    };

    static Caller singlepass_callers[7] =
    {
        sqrSumCaller<unsigned char>, sqrSumCaller<char>,
        sqrSumCaller<unsigned short>, sqrSumCaller<short>,
        sqrSumCaller<int>, sqrSumCaller<float>
    };

    CV_Assert(src.depth() <= CV_32F);

    Caller* callers = multipass_callers;
    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
        callers = singlepass_callers;
@@ -353,7 +353,6 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    Caller caller = callers[src.depth()];

    double result[4];
    caller(src, buf, result, src.channels());
@@ -401,38 +400,44 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);
    typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);

    static Caller multipass_callers[] =
    {
        minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
        minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
        minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
    };

    static Caller singlepass_callers[] =
    {
        minMaxCaller<unsigned char>, minMaxCaller<char>,
        minMaxCaller<unsigned short>, minMaxCaller<short>,
        minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
    };

    static MaskedCaller masked_multipass_callers[] =
    {
        minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
        minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
        minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
    };

    static MaskedCaller masked_singlepass_callers[] =
    {
        minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
        minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
        minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
    };

    CV_Assert(src.depth() <= CV_64F);
    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));

    if (src.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
@@ -447,7 +452,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
            callers = singlepass_callers;

        Caller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, minVal, maxVal, buf);
    }
    else
@@ -457,7 +462,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
            callers = masked_singlepass_callers;

        MaskedCaller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, mask, minVal, maxVal, buf);
    }
}
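For reference, a caller-side sketch of the masked path served by the second branch (mats hypothetical):

    cv::gpu::GpuMat img(480, 640, CV_8UC1), mask(480, 640, CV_8UC1), buf;
    double mn, mx;
    cv::gpu::minMax(img, &mn, &mx, mask, buf); // reduction restricted to mask != 0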
@@ -508,38 +513,44 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
    typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
    typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);

    static Caller multipass_callers[] =
    {
        minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
        minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
        minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
    };

    static Caller singlepass_callers[] =
    {
        minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
        minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
        minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
    };

    static MaskedCaller masked_multipass_callers[] =
    {
        minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
        minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
        minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
    };

    static MaskedCaller masked_singlepass_callers[] =
    {
        minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
        minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
        minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
    };

    CV_Assert(src.depth() <= CV_64F);
    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));

    if (src.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    int minLoc_[2];
@@ -558,7 +569,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
            callers = singlepass_callers;

        Caller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }
    else
@@ -568,7 +579,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
            callers = masked_singlepass_callers;

        MaskedCaller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }
@@ -622,8 +633,15 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
        countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
        countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };

    CV_Assert(src.depth() <= CV_64F);
    CV_Assert(src.channels() == 1);

    if (src.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
@@ -633,7 +651,7 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
        callers = singlepass_callers;

    Caller caller = callers[src.type()];
    CV_Assert(caller != 0);

    return caller(src, buf);
}
@@ -719,6 +737,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
    };

    const caller_t func = callers[src.depth()][dst.depth()];

    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
@@ -781,6 +800,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
    };

    const caller_t func = callers[src.depth()][dst.depth()];

    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
...
@@ -106,7 +106,7 @@
    #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"
#endif

static inline void throw_nogpu() { CV_Error(CV_StsNotImplemented, "The called functionality is disabled for current build or platform"); }

#else /* defined(HAVE_CUDA) */
...
@@ -995,6 +995,20 @@ TEST_P(AbsDiff, Array)
    cv::Mat src1 = randomMat(size, depth);
    cv::Mat src2 = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::absdiff(loadMat(src1), loadMat(src2), dst);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
        cv::gpu::absdiff(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);
@@ -1002,6 +1016,7 @@ TEST_P(AbsDiff, Array)
        cv::absdiff(src1, src2, dst_gold);

        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
    }
}
TEST_P(AbsDiff, Scalar)
@@ -1009,6 +1024,20 @@ TEST_P(AbsDiff, Scalar)
    cv::Mat src = randomMat(size, depth);
    cv::Scalar val = randomScalar(0.0, 255.0);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::absdiff(loadMat(src), val, dst);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
        cv::gpu::absdiff(loadMat(src, useRoi), val, dst);
@@ -1016,6 +1045,7 @@ TEST_P(AbsDiff, Scalar)
        cv::absdiff(src, val, dst_gold);

        EXPECT_MAT_NEAR(dst_gold, dst, depth <= CV_32F ? 1.0 : 1e-5);
    }
}
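The guard-and-catch shape above repeats in every updated test; one way it could be shared, assuming a hypothetical helper in the test utilities (not part of this commit):

    template <typename Op>
    void expectUnsupportedDouble(Op op)
    {
        try
        {
            op(); // expected to throw: the device lacks NATIVE_DOUBLE
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }

As with the inline version, a call that fails to throw would still pass silently; adding FAIL() after op() would tighten that.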
INSTANTIATE_TEST_CASE_P(GPU_Core, AbsDiff, testing::Combine(
@@ -1243,6 +1273,40 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
////////////////////////////////////////////////////////////////////////////////
// Exp
// CPU reference for gpu::exp: for integer inputs the GPU path computes expf and
// saturates the rounded result back into the source type, so mimic that here.
template <typename T> void expImpl(const cv::Mat& src, cv::Mat& dst)
{
    dst.create(src.size(), src.type());

    for (int y = 0; y < src.rows; ++y)
    {
        for (int x = 0; x < src.cols; ++x)
            dst.at<T>(y, x) = cv::saturate_cast<T>(static_cast<int>(std::exp(static_cast<float>(src.at<T>(y, x)))));
    }
}

void expImpl_float(const cv::Mat& src, cv::Mat& dst)
{
    dst.create(src.size(), src.type());

    for (int y = 0; y < src.rows; ++y)
    {
        for (int x = 0; x < src.cols; ++x)
            dst.at<float>(y, x) = std::exp(static_cast<float>(src.at<float>(y, x)));
    }
}

// Dispatch the reference implementation on the source depth (CV_8U..CV_32F).
void expGold(const cv::Mat& src, cv::Mat& dst)
{
    typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst);

    const func_t funcs[] =
    {
        expImpl<uchar>, expImpl<schar>, expImpl<ushort>, expImpl<short>,
        expImpl<int>, expImpl_float
    };

    funcs[src.depth()](src, dst);
}
PARAM_TEST_CASE(Exp, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
{
    cv::gpu::DeviceInfo devInfo;
@@ -1269,7 +1333,7 @@ TEST_P(Exp, Accuracy)
    cv::gpu::exp(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    expGold(src, dst_gold);

    EXPECT_MAT_NEAR(dst_gold, dst, 1e-2);
}
@@ -1277,7 +1341,10 @@ TEST_P(Exp, Accuracy)
INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1),
                    MatType(CV_16UC1),
                    MatType(CV_16SC1),
                    MatType(CV_32FC1)),
    WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
@@ -1311,6 +1378,20 @@ TEST_P(Compare, Accuracy)
    cv::Mat src1 = randomMat(size, depth);
    cv::Mat src2 = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::compare(loadMat(src1), loadMat(src2), dst, cmp_code);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, CV_8UC1, useRoi);
        cv::gpu::compare(loadMat(src1, useRoi), loadMat(src2, useRoi), dst, cmp_code);
@@ -1318,6 +1399,7 @@ TEST_P(Compare, Accuracy)
        cv::compare(src1, src2, dst_gold, cmp_code);

        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Compare, testing::Combine(
@@ -1635,17 +1717,60 @@ PARAM_TEST_CASE(Min, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
    }
};
TEST_P(Min, Array)
{
    cv::Mat src1 = randomMat(size, depth);
    cv::Mat src2 = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::min(loadMat(src1), loadMat(src2), dst);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
        cv::gpu::min(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);

        cv::Mat dst_gold = cv::min(src1, src2);

        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
    }
}
TEST_P(Min, Scalar)
{
    cv::Mat src = randomMat(size, depth);
    double val = randomDouble(0.0, 255.0);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::min(loadMat(src), val, dst);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
        cv::gpu::min(loadMat(src, useRoi), val, dst);

        cv::Mat dst_gold = cv::min(src, val);

        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Min, testing::Combine(
@@ -1675,17 +1800,60 @@ PARAM_TEST_CASE(Max, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
    }
};
TEST_P(Max, Array)
{
    cv::Mat src1 = randomMat(size, depth);
    cv::Mat src2 = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::max(loadMat(src1), loadMat(src2), dst);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
        cv::gpu::max(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);

        cv::Mat dst_gold = cv::max(src1, src2);

        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
    }
}
TEST_P(Max, Scalar)
{
    cv::Mat src = randomMat(size, depth);
    double val = randomDouble(0.0, 255.0);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::max(loadMat(src), val, dst);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
        cv::gpu::max(loadMat(src, useRoi), val, dst);

        cv::Mat dst_gold = cv::max(src, val);

        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Max, testing::Combine(
@@ -1723,6 +1891,20 @@ TEST_P(Pow, Accuracy)
    if (src.depth() < CV_32F)
        power = static_cast<int>(power);
    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::pow(loadMat(src), power, dst);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
        cv::gpu::pow(loadMat(src, useRoi), power, dst);
@@ -1730,6 +1912,7 @@ TEST_P(Pow, Accuracy)
        cv::pow(src, power, dst_gold);

        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 0.0 : 1e-1);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Pow, testing::Combine(
@@ -1750,7 +1933,6 @@ PARAM_TEST_CASE(AddWeighted, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth,
    int dst_depth;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
@@ -1772,6 +1954,20 @@ TEST_P(AddWeighted, Accuracy)
    double beta = randomDouble(-10.0, 10.0);
    double gamma = randomDouble(-10.0, 10.0);
    if ((depth1 == CV_64F || depth2 == CV_64F || dst_depth == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::addWeighted(loadMat(src1), alpha, loadMat(src2), beta, gamma, dst, dst_depth);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, dst_depth, useRoi);
        cv::gpu::addWeighted(loadMat(src1, useRoi), alpha, loadMat(src2, useRoi), beta, gamma, dst, dst_depth);
@@ -1779,6 +1975,7 @@ TEST_P(AddWeighted, Accuracy)
        cv::addWeighted(src1, alpha, src2, beta, gamma, dst_gold, dst_depth);

        EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 1.0 : 1e-12);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, AddWeighted, testing::Combine(
@@ -1823,6 +2020,43 @@ TEST_P(GEMM, Accuracy)
    double alpha = randomDouble(-10.0, 10.0);
    double beta = randomDouble(-10.0, 10.0);
#ifndef HAVE_CUBLAS
    try
    {
        cv::gpu::GpuMat dst;
        cv::gpu::gemm(loadMat(src1), loadMat(src2), alpha, loadMat(src3), beta, dst, flags);
    }
    catch (const cv::Exception& e)
    {
        ASSERT_EQ(CV_StsNotImplemented, e.code);
    }
#else
    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::gemm(loadMat(src1), loadMat(src2), alpha, loadMat(src3), beta, dst, flags);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else if (type == CV_64FC2 && flags != 0)
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::gemm(loadMat(src1), loadMat(src2), alpha, loadMat(src3), beta, dst, flags);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsNotImplemented, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
        cv::gpu::gemm(loadMat(src1, useRoi), loadMat(src2, useRoi), alpha, loadMat(src3, useRoi), beta, dst, flags);
@@ -1830,6 +2064,8 @@ TEST_P(GEMM, Accuracy)
        cv::gemm(src1, src2, alpha, src3, beta, dst_gold, flags);

        EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1e-10);
    }
#endif
}
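For contrast with the error branches above, the shape of a fully supported call that the accuracy branch exercises (sizes and values hypothetical):

    cv::gpu::GpuMat a(64, 64, CV_32FC1), b(64, 64, CV_32FC1), c(64, 64, CV_32FC1), d;
    cv::gpu::gemm(a, b, 1.0, c, 1.0, d); // d = 1.0*a*b + 1.0*c via CUBLAS, flags = 0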
INSTANTIATE_TEST_CASE_P(GPU_Core, GEMM, testing::Combine(
@@ -1864,6 +2100,20 @@ TEST_P(Transpose, Accuracy)
{
    cv::Mat src = randomMat(size, type);

    if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::GpuMat dst;
            cv::gpu::transpose(loadMat(src), dst);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        cv::gpu::GpuMat dst = createMat(cv::Size(size.height, size.width), type, useRoi);
        cv::gpu::transpose(loadMat(src, useRoi), dst);
@@ -1871,6 +2121,7 @@ TEST_P(Transpose, Accuracy)
        cv::transpose(src, dst_gold);

        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Transpose, testing::Combine(
@@ -2498,6 +2749,20 @@ TEST_P(MinMax, WithoutMask)
{
    cv::Mat src = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            double minVal, maxVal;
            cv::gpu::minMax(loadMat(src), &minVal, &maxVal);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        double minVal, maxVal;
        cv::gpu::minMax(loadMat(src, useRoi), &minVal, &maxVal);
@@ -2506,6 +2771,7 @@ TEST_P(MinMax, WithoutMask)
        EXPECT_DOUBLE_EQ(minVal_gold, minVal);
        EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
    }
}
TEST_P(MinMax, WithMask)
@@ -2513,6 +2779,20 @@ TEST_P(MinMax, WithMask)
    cv::Mat src = randomMat(size, depth);
    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            double minVal, maxVal;
            cv::gpu::minMax(loadMat(src), &minVal, &maxVal, loadMat(mask));
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        double minVal, maxVal;
        cv::gpu::minMax(loadMat(src, useRoi), &minVal, &maxVal, loadMat(mask, useRoi));
@@ -2521,13 +2801,38 @@ TEST_P(MinMax, WithMask)
        EXPECT_DOUBLE_EQ(minVal_gold, minVal);
        EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
    }
}
TEST_P(MinMax, NullPtr)
{
    cv::Mat src = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            double minVal, maxVal;
            cv::gpu::minMax(loadMat(src), &minVal, 0);
            cv::gpu::minMax(loadMat(src), 0, &maxVal);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        double minVal, maxVal;
        cv::gpu::minMax(loadMat(src, useRoi), &minVal, 0);
        cv::gpu::minMax(loadMat(src, useRoi), 0, &maxVal);

        double minVal_gold, maxVal_gold;
        minMaxLocGold(src, &minVal_gold, &maxVal_gold, 0, 0);

        EXPECT_DOUBLE_EQ(minVal_gold, minVal);
        EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, MinMax, testing::Combine(
@@ -2585,6 +2890,21 @@ TEST_P(MinMaxLoc, WithoutMask)
{
    cv::Mat src = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            double minVal, maxVal;
            cv::Point minLoc, maxLoc;
            cv::gpu::minMaxLoc(loadMat(src), &minVal, &maxVal, &minLoc, &maxLoc);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, &maxVal, &minLoc, &maxLoc);
@@ -2598,6 +2918,7 @@ TEST_P(MinMaxLoc, WithoutMask)
        expectEqual(src, minLoc_gold, minLoc);
        expectEqual(src, maxLoc_gold, maxLoc);
    }
}
TEST_P(MinMaxLoc, WithMask)
@@ -2605,6 +2926,21 @@ TEST_P(MinMaxLoc, WithMask)
    cv::Mat src = randomMat(size, depth);
    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            double minVal, maxVal;
            cv::Point minLoc, maxLoc;
            cv::gpu::minMaxLoc(loadMat(src), &minVal, &maxVal, &minLoc, &maxLoc, loadMat(mask));
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, &maxVal, &minLoc, &maxLoc, loadMat(mask, useRoi));
@@ -2618,13 +2954,48 @@ TEST_P(MinMaxLoc, WithMask)
        expectEqual(src, minLoc_gold, minLoc);
        expectEqual(src, maxLoc_gold, maxLoc);
    }
}
TEST_P(MinMaxLoc, NullPtr)
{
    cv::Mat src = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            double minVal, maxVal;
            cv::Point minLoc, maxLoc;
            cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, 0, 0, 0);
            cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, &maxVal, 0, 0);
            cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, 0, &minLoc, 0);
            cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, 0, 0, &maxLoc);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, 0, 0, 0);
        cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, &maxVal, 0, 0);
        cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, 0, &minLoc, 0);
        cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, 0, 0, &maxLoc);

        double minVal_gold, maxVal_gold;
        cv::Point minLoc_gold, maxLoc_gold;
        minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold);

        EXPECT_DOUBLE_EQ(minVal_gold, minVal);
        EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
        expectEqual(src, minLoc_gold, minLoc);
        expectEqual(src, maxLoc_gold, maxLoc);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, MinMaxLoc, testing::Combine(
@@ -2661,12 +3032,25 @@ TEST_P(CountNonZero, Accuracy)
    cv::Mat src;
    srcBase.convertTo(src, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::countNonZero(loadMat(src));
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        int val = cv::gpu::countNonZero(loadMat(src, useRoi));

        int val_gold = cv::countNonZero(src);

        ASSERT_EQ(val_gold, val);
    }
}
INSTANTIATE_TEST_CASE_P(GPU_Core, CountNonZero, testing::Combine(
...