Commit 26691e00 authored by Vladislav Vinogradov

fixed gpu core tests (added additional check for device's feature support)

added assertion on double types for old devices
parent 98d7b10c
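
Both fixes revolve around one recurring guard: before any CV_64F code path runs, the host code now checks that the binary was built with double support and that the active device actually provides it. A minimal sketch of the pattern with the 2.4-era cv::gpu API (the helper name is hypothetical; the commit writes the check out inline each time):

    // Hypothetical helper collecting the guard this commit repeats inline.
    void requireNativeDouble(int depth)
    {
        if (depth == CV_64F)
        {
            if (!cv::gpu::TargetArchs::builtWith(cv::gpu::NATIVE_DOUBLE) ||
                !cv::gpu::DeviceInfo().supports(cv::gpu::NATIVE_DOUBLE))
                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
        }
    }

builtWith() answers "was any kernel compiled for an architecture with native double?", while DeviceInfo().supports() asks the device that will actually run the code; both must hold, hence the || of the negations.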
@@ -69,16 +69,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 {
 #ifndef HAVE_CUBLAS
-    OPENCV_GPU_UNUSED(src1);
-    OPENCV_GPU_UNUSED(src2);
-    OPENCV_GPU_UNUSED(alpha);
-    OPENCV_GPU_UNUSED(src3);
-    OPENCV_GPU_UNUSED(beta);
-    OPENCV_GPU_UNUSED(dst);
-    OPENCV_GPU_UNUSED(flags);
-    OPENCV_GPU_UNUSED(stream);
-    throw_nogpu();
+    CV_Error(CV_StsNotImplemented, "The library was built without CUBLAS");
 #else
@@ -87,6 +78,12 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
     CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
 
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
     bool tr1 = (flags & GEMM_1_T) != 0;
     bool tr2 = (flags & GEMM_2_T) != 0;
     bool tr3 = (flags & GEMM_3_T) != 0;
@@ -230,6 +227,9 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
     }
     else // if (src.elemSize() == 8)
     {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+
         NppStStreamHandler h(stream);
 
         NcvSize32u sz;
@@ -290,7 +290,6 @@ namespace
 void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
-
     static const func_t funcs[6][4] =
     {
         {NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
@@ -403,12 +402,12 @@ namespace
 void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    ::npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
+    npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
 }
 
 void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    ::npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
+    npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -429,7 +428,7 @@ namespace
     {
         using namespace ::cv::gpu::device::mathfunc;
 
-        CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
+        CV_Assert(x.size() == y.size() && x.type() == y.type());
         CV_Assert(x.depth() == CV_32F);
 
         if (mag)
@@ -449,7 +448,7 @@ namespace
     {
         using namespace ::cv::gpu::device::mathfunc;
 
-        CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
+        CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
         CV_Assert(mag.depth() == CV_32F);
 
         x.create(mag.size(), mag.type());
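
The CV_DbgAssert to CV_Assert promotions above are behavioral, not cosmetic: CV_DbgAssert compiles to nothing outside debug builds, so these size/type checks used to vanish from release binaries. Roughly how the 2.4-era macro is defined (simplified sketch):

    #ifdef _DEBUG
    #  define CV_DbgAssert(expr) CV_Assert(expr)  // debug build: real check
    #else
    #  define CV_DbgAssert(expr)                  // release build: compiled out
    #endif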
...
@@ -1096,18 +1096,18 @@ namespace cv { namespace gpu { namespace device
         enum { smart_shift = 4 };
     };
 
-    template <typename T> void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)
+    template <typename T> void absdiff_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
     {
         cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, Absdiff<T>(), WithOutMask(), stream);
     }
 
-    template void absdiff_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    //template void absdiff_gpu<uchar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<schar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    //template void absdiff_gpu<ushort>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<short >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<int   >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    //template void absdiff_gpu<float >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<double>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
 
     template <typename T> struct AbsdiffScalar : unary_function<T, T>
     {
@@ -1140,20 +1140,20 @@ namespace cv { namespace gpu { namespace device
         enum { smart_shift = 4 };
     };
 
-    template <typename T> void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream)
+    template <typename T> void absdiff_gpu(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream)
     {
         cudaSafeCall( cudaSetDoubleForDevice(&val) );
         AbsdiffScalar<T> op(val);
         cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, WithOutMask(), stream);
     }
 
-    //template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    //template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<int   >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    //template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void absdiff_gpu<double>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
+    //template void absdiff_gpu<uchar >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<schar >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    //template void absdiff_gpu<ushort>(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<short >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<int   >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    //template void absdiff_gpu<float >(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    template void absdiff_gpu<double>(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
 
     //////////////////////////////////////////////////////////////////////////////////////
     // Compare
@@ -1587,60 +1587,60 @@ namespace cv { namespace gpu { namespace device
     };
 
     template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void min_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform(src1, src2, dst, minimum<T>(), WithOutMask(), stream);
+        cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minimum<T>(), WithOutMask(), stream);
     }
 
-    template void min_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
-    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void min_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
-    template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+    template void min_gpu<uchar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<schar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<ushort>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<short >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<int   >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<float >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<double>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
 
     template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void max_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform(src1, src2, dst, maximum<T>(), WithOutMask(), stream);
+        cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, maximum<T>(), WithOutMask(), stream);
     }
 
-    template void max_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
-    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void max_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
-    template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+    template void max_gpu<uchar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<schar >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<ushort>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<short >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<int   >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<float >(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<double>(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
 
     template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void min_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform(src1, dst, device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
+        cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, device::bind2nd(minimum<T>(), val), WithOutMask(), stream);
     }
 
-    template void min_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
-    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void min_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
-    template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+    template void min_gpu<uchar >(const DevMem2Db src, uchar val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<schar >(const DevMem2Db src, schar val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<ushort>(const DevMem2Db src, ushort val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<short >(const DevMem2Db src, short val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<int   >(const DevMem2Db src, int val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<float >(const DevMem2Db src, float val, DevMem2Db dst, cudaStream_t stream);
+    template void min_gpu<double>(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream);
 
     template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void max_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform(src1, dst, device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
+        cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, device::bind2nd(maximum<T>(), val), WithOutMask(), stream);
     }
 
-    template void max_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
-    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void max_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
-    template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+    template void max_gpu<uchar >(const DevMem2Db src, uchar val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<schar >(const DevMem2Db src, schar val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<ushort>(const DevMem2Db src, ushort val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<short >(const DevMem2Db src, short val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<int   >(const DevMem2Db src, int val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<float >(const DevMem2Db src, float val, DevMem2Db dst, cudaStream_t stream);
+    template void max_gpu<double>(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream);
 
     //////////////////////////////////////////////////////////////////////////
     // threshold
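
Passing DevMem2Db by value instead of DevMem2D_<T>& gives all seven instantiations of min_gpu/max_gpu a single host-visible signature, which is what lets the host code later in this commit drop the per-type caller wrappers and index the instantiations directly. The one typedef that now fits every entry (as declared on the host side further down):

    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);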
@@ -1805,18 +1805,63 @@ namespace cv { namespace gpu { namespace device
     //////////////////////////////////////////////////////////////////////////
     // addWeighted
 
-    template <typename T1, typename T2, typename D> struct AddWeighted : binary_function<T1, T2, D>
-    {
-        __host__ __device__ __forceinline__ AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
-
-        __device__ __forceinline__ D operator ()(typename TypeTraits<T1>::ParameterType a, typename TypeTraits<T2>::ParameterType b) const
-        {
-            return saturate_cast<D>(alpha * a + beta * b + gamma);
-        }
-
-        const double alpha;
-        const double beta;
-        const double gamma;
-    };
+    namespace detail
+    {
+        template <typename T> struct UseDouble
+        {
+            enum {value = 0};
+        };
+        template <> struct UseDouble<int>
+        {
+            enum {value = 1};
+        };
+        template <> struct UseDouble<float>
+        {
+            enum {value = 1};
+        };
+        template <> struct UseDouble<double>
+        {
+            enum {value = 1};
+        };
+    }
+    template <typename T1, typename T2, typename D> struct UseDouble
+    {
+        enum {value = (detail::UseDouble<T1>::value || detail::UseDouble<T2>::value || detail::UseDouble<D>::value)};
+    };
+
+    namespace detail
+    {
+        template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted;
+        template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, false> : binary_function<T1, T2, D>
+        {
+            AddWeighted(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
+
+            __device__ __forceinline__ D operator ()(T1 a, T2 b) const
+            {
+                return saturate_cast<D>(a * alpha + b * beta + gamma);
+            }
+
+            const float alpha;
+            const float beta;
+            const float gamma;
+        };
+        template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, true> : binary_function<T1, T2, D>
+        {
+            AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
+
+            __device__ __forceinline__ D operator ()(T1 a, T2 b) const
+            {
+                return saturate_cast<D>(a * alpha + b * beta + gamma);
+            }
+
+            const double alpha;
+            const double beta;
+            const double gamma;
+        };
+    }
+    template <typename T1, typename T2, typename D> struct AddWeighted : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>
+    {
+        AddWeighted(double alpha_, double beta_, double gamma_) : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
+    };
 
     template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >
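
The UseDouble trait plus the bool-parameterized detail::AddWeighted is a compile-time dispatch: instantiations built only from small integer types store the coefficients as float (cheap on every device), while any int/float/double operand selects the double-precision specialization. A self-contained host-side sketch of the selection, trimmed to the trait itself (illustrative only):

    #include <cstdio>

    namespace detail
    {
        template <typename T> struct UseDouble         { enum { value = 0 }; };
        template <>           struct UseDouble<int>    { enum { value = 1 }; };
        template <>           struct UseDouble<float>  { enum { value = 1 }; };
        template <>           struct UseDouble<double> { enum { value = 1 }; };
    }

    int main()
    {
        // uchar stays on the float path; float promotes to double arithmetic
        std::printf("%d %d\n", detail::UseDouble<unsigned char>::value,
                               detail::UseDouble<float>::value);   // prints: 0 1
    }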
@@ -1878,9 +1923,12 @@ namespace cv { namespace gpu { namespace device
     template <typename T1, typename T2, typename D>
     void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream)
     {
-        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
-        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-        cudaSafeCall( cudaSetDoubleForDevice(&gamma) );
+        if (UseDouble<T1, T2, D>::value)
+        {
+            cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
+            cudaSafeCall( cudaSetDoubleForDevice(&beta) );
+            cudaSafeCall( cudaSetDoubleForDevice(&gamma) );
+        }
 
         AddWeighted<T1, T2, D> op(alpha, beta, gamma);
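
cudaSetDoubleForDevice() is a real (later deprecated) CUDA runtime call that rewrites a host double in place so that devices lacking native double arithmetic see a usable value. Guarding it with UseDouble is consistent with the new functor: when the coefficients are stored as float anyway there is nothing to convert. A hedged sketch of the guarded call:

    double alpha = 0.5;
    if (UseDouble<T1, T2, D>::value)                    // coefficients stay double on the device
        cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); // demote in place for pre-double GPUs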
...
@@ -950,90 +950,62 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
 namespace cv { namespace gpu { namespace device
 {
     template <typename T>
-    void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    void absdiff_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
 
     template <typename T>
-    void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
+    void absdiff_gpu(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream);
 }}}
 
-void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
+namespace
 {
-    using namespace ::cv::gpu::device;
-
-    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-
-    static const func_t funcs[] =
-    {
-        absdiff_gpu<unsigned char>, absdiff_gpu<signed char>, absdiff_gpu<unsigned short>, absdiff_gpu<short>, absdiff_gpu<int>, absdiff_gpu<float>, absdiff_gpu<double>
-    };
-
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-
-    dst.create( src1.size(), src1.type() );
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppiSize sz;
-    sz.width  = src1.cols * src1.channels();
-    sz.height = src1.rows;
-
-    if (src1.depth() == CV_8U)
+    template <int DEPTH> struct NppAbsDiffFunc
     {
-        NppStreamHandler h(stream);
-
-        nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else if (src1.depth() == CV_16U)
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        typedef NppStatus (*func_t)(const npp_t* src1, int src1_step, const npp_t* src2, int src2_step, npp_t* dst, int dst_step, NppiSize sz);
+    };
+
+    template <int DEPTH, typename NppAbsDiffFunc<DEPTH>::func_t func> struct NppAbsDiff
     {
-        NppStreamHandler h(stream);
-
-        nppSafeCall( nppiAbsDiff_16u_C1R(src1.ptr<Npp16u>(), static_cast<int>(src1.step), src2.ptr<Npp16u>(), static_cast<int>(src2.step),
-            dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else if (src1.depth() == CV_32F)
-    {
-        NppStreamHandler h(stream);
-
-        nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step),
-            dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else
-    {
-        const func_t func = funcs[src1.depth()];
-        CV_Assert(func != 0);
-
-        func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
-    }
-}
+        typedef typename NppAbsDiffFunc<DEPTH>::npp_t npp_t;
+
+        static void call(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
+        {
+            NppStreamHandler h(stream);
+
+            NppiSize sz;
+            sz.width  = src1.cols;
+            sz.height = src1.rows;
+
+            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
+                (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
 
-namespace
-{
     template <int DEPTH> struct NppAbsDiffCFunc
     {
         typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+        typedef npp_t scalar_t;
 
         typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, npp_t nConstant);
     };
     template <> struct NppAbsDiffCFunc<CV_16U>
     {
+        typedef NppTypeTraits<CV_16U>::npp_t npp_t;
+        typedef Npp32u scalar_t;
+
         typedef NppStatus (*func_t)(const Npp16u* pSrc1, int nSrc1Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, Npp32u nConstant);
     };
 
     template <int DEPTH, typename NppAbsDiffCFunc<DEPTH>::func_t func> struct NppAbsDiffC
     {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+        typedef typename NppAbsDiffCFunc<DEPTH>::npp_t npp_t;
+        typedef typename NppAbsDiffCFunc<DEPTH>::scalar_t scalar_t;
 
-        static void call(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream)
+        static void call(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream)
         {
             NppStreamHandler h(stream);
 
@@ -1041,8 +1013,8 @@ namespace
             sz.width  = src1.cols;
             sz.height = src1.rows;
 
-            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (npp_t*)dst.data, static_cast<int>(dst.step),
-                sz, static_cast<npp_t>(val)) );
+            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step),
+                (npp_t*)dst.data, static_cast<int>(dst.step), sz, static_cast<scalar_t>(val)) );
 
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
@@ -1050,12 +1022,41 @@ namespace
     };
 }
 
-void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s)
+void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
     using namespace cv::gpu::device;
 
-    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
+    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        NppAbsDiff<CV_8U, nppiAbsDiff_8u_C1R>::call,
+        absdiff_gpu<signed char>,
+        NppAbsDiff<CV_16U, nppiAbsDiff_16u_C1R>::call,
+        absdiff_gpu<short>,
+        absdiff_gpu<int>,
+        NppAbsDiff<CV_32F, nppiAbsDiff_32f_C1R>::call,
+        absdiff_gpu<double>
+    };
+
+    CV_Assert(src1.depth() <= CV_64F);
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src1.size(), src1.type());
+
+    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& stream)
+{
+    using namespace cv::gpu::device;
+
+    typedef void (*func_t)(const DevMem2Db src1, double val, DevMem2Db dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
         NppAbsDiffC<CV_8U, nppiAbsDiffC_8u_C1R>::call,
@@ -1067,13 +1068,18 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
         absdiff_gpu<double>
     };
 
+    CV_Assert(src1.depth() <= CV_64F);
     CV_Assert(src1.channels() == 1);
 
-    dst.create(src1.size(), src1.type());
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    dst.create(src1.size(), src1.type());
 
-    funcs[src1.depth()](src1, src2.val[0], dst, stream);
+    funcs[src1.depth()](src1, src2.val[0], dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
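
The absdiff rewrite replaces the old if/else ladder over depths with one table indexed by src1.depth() (0 = CV_8U through 6 = CV_64F): NPP wrappers for the depths NPP covers (CV_8U, CV_16U, CV_32F), the template kernel for the rest, with the depth <= CV_64F assert bounding the index. A hedged usage sketch of the resulting entry points:

    cv::gpu::GpuMat a(64, 64, CV_16SC3), b(64, 64, CV_16SC3), d;
    cv::gpu::absdiff(a, b, d);               // any depth up to CV_64F; reshape(1) folds channels
    cv::gpu::GpuMat g(64, 64, CV_32FC1), e;
    cv::gpu::absdiff(g, cv::Scalar(0.5), e); // scalar overload: single-channel only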
@@ -1359,34 +1365,38 @@ namespace cv { namespace gpu { namespace device
 void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
 {
-    using namespace ::cv::gpu::device;
+    using namespace cv::gpu::device;
 
     typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
     static const func_t funcs[7][4] =
     {
-        {compare_eq<unsigned char>, compare_ne<unsigned char>, compare_lt<unsigned char>, compare_le<unsigned char>},
-        {compare_eq<signed char>, compare_ne<signed char>, compare_lt<signed char>, compare_le<signed char>},
+        {compare_eq<unsigned char> , compare_ne<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> },
+        {compare_eq<signed char>   , compare_ne<signed char>   , compare_lt<signed char>   , compare_le<signed char>   },
         {compare_eq<unsigned short>, compare_ne<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>},
-        {compare_eq<short>, compare_ne<short>, compare_lt<short>, compare_le<short>},
-        {compare_eq<int>, compare_ne<int>, compare_lt<int>, compare_le<int>},
-        {compare_eq<float>, compare_ne<float>, compare_lt<float>, compare_le<float>},
-        {compare_eq<double>, compare_ne<double>, compare_lt<double>, compare_le<double>}
+        {compare_eq<short>         , compare_ne<short>         , compare_lt<short>         , compare_le<short>         },
+        {compare_eq<int>           , compare_ne<int>           , compare_lt<int>           , compare_le<int>           },
+        {compare_eq<float>         , compare_ne<float>         , compare_lt<float>         , compare_le<float>         },
+        {compare_eq<double>        , compare_ne<double>        , compare_lt<double>        , compare_le<double>        }
     };
 
+    CV_Assert(src1.depth() <= CV_64F);
     CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
     CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);
 
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
     static const int codes[] =
     {
         0, 2, 3, 2, 3, 1
     };
 
     const GpuMat* psrc1[] =
     {
         &src1, &src2, &src2, &src1, &src1, &src1
     };
 
     const GpuMat* psrc2[] =
     {
         &src2, &src1, &src1, &src2, &src2, &src2
@@ -1415,17 +1425,15 @@ namespace
     {
         dst.create(src.size(), src.type());
 
-        ::cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
+        cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
     }
 
     void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
-        using namespace ::cv::gpu::device;
+        using namespace cv::gpu::device;
 
-        typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-
-        static Caller callers[] =
+        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+        static func_t funcs[] =
         {
             bitwiseMaskNotCaller<unsigned char>, bitwiseMaskNotCaller<unsigned char>,
             bitwiseMaskNotCaller<unsigned short>, bitwiseMaskNotCaller<unsigned short>,
@@ -1433,19 +1441,19 @@ namespace
             bitwiseMaskNotCaller<unsigned int>
         };
 
+        CV_Assert(src.depth() <= CV_64F);
         CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
 
         dst.create(src.size(), src.type());
 
-        Caller caller = callers[src.depth()];
-        CV_Assert(caller);
+        const func_t func = funcs[src.depth()];
 
         int cn = src.depth() != CV_64F ? src.channels() : src.channels() * (sizeof(double) / sizeof(unsigned int));
-        caller(src.rows, src.cols, cn, src, mask, dst, stream);
+
+        func(src.rows, src.cols, cn, src, mask, dst, stream);
     }
 }
 
 void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
 {
     if (mask.empty())
@@ -1454,7 +1462,6 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
         bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));
 }
 
-
 //////////////////////////////////////////////////////////////////////////////
 // Binary bitwise logical operations
@@ -1481,18 +1488,18 @@ namespace
     void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
 
         dst.create(src1.size(), src1.type());
 
-        ::cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+        cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
     }
 
     void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
-        using namespace ::cv::gpu::device;
+        using namespace cv::gpu::device;
 
-        typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-
-        static Caller callers[] =
+        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+        static func_t funcs[] =
         {
             bitwiseMaskOrCaller<unsigned char>, bitwiseMaskOrCaller<unsigned char>,
             bitwiseMaskOrCaller<unsigned short>, bitwiseMaskOrCaller<unsigned short>,
@@ -1500,33 +1507,35 @@ namespace
             bitwiseMaskOrCaller<unsigned int>
         };
 
+        CV_Assert(src1.depth() <= CV_64F);
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
 
         dst.create(src1.size(), src1.type());
 
-        Caller caller = callers[src1.depth()];
-        CV_Assert(caller);
+        const func_t func = funcs[src1.depth()];
 
         int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+
+        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }
 
     void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
 
         dst.create(src1.size(), src1.type());
 
-        ::cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+        cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
     }
 
     void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
-        using namespace ::cv::gpu::device;
+        using namespace cv::gpu::device;
 
-        typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-
-        static Caller callers[] =
+        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+        static func_t funcs[] =
         {
             bitwiseMaskAndCaller<unsigned char>, bitwiseMaskAndCaller<unsigned char>,
             bitwiseMaskAndCaller<unsigned short>, bitwiseMaskAndCaller<unsigned short>,
@@ -1534,33 +1543,35 @@ namespace
             bitwiseMaskAndCaller<unsigned int>
         };
 
+        CV_Assert(src1.depth() <= CV_64F);
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
 
         dst.create(src1.size(), src1.type());
 
-        Caller caller = callers[src1.depth()];
-        CV_Assert(caller);
+        const func_t func = funcs[src1.depth()];
 
        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+
+        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }
 
     void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
 
         dst.create(src1.size(), src1.type());
 
-        ::cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+        cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
     }
 
     void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
-        using namespace ::cv::gpu::device;
+        using namespace cv::gpu::device;
 
-        typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-
-        static Caller callers[] =
+        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+        static func_t funcs[] =
        {
             bitwiseMaskXorCaller<unsigned char>, bitwiseMaskXorCaller<unsigned char>,
             bitwiseMaskXorCaller<unsigned short>, bitwiseMaskXorCaller<unsigned short>,
@@ -1568,14 +1579,17 @@ namespace
             bitwiseMaskXorCaller<unsigned int>
         };
 
+        CV_Assert(src1.depth() <= CV_64F);
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
 
         dst.create(src1.size(), src1.type());
 
-        Caller caller = callers[src1.depth()];
-        CV_Assert(caller);
+        const func_t func = funcs[src1.depth()];
 
         int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+
+        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }
 }
@@ -1661,10 +1675,9 @@ namespace
 void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-
     static const func_t funcs[5][4] =
     {
-        {NppBitwiseC<CV_8U, 1, nppiOrC_8u_C1R>::call, 0, NppBitwiseC<CV_8U, 3, nppiOrC_8u_C3R>::call, NppBitwiseC<CV_8U, 4, nppiOrC_8u_C4R>::call},
+        {NppBitwiseC<CV_8U , 1, nppiOrC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiOrC_8u_C4R >::call},
         {0,0,0,0},
         {NppBitwiseC<CV_16U, 1, nppiOrC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
         {0,0,0,0},
@@ -1682,10 +1695,9 @@ void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Strea
 void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-
     static const func_t funcs[5][4] =
     {
-        {NppBitwiseC<CV_8U, 1, nppiAndC_8u_C1R>::call, 0, NppBitwiseC<CV_8U, 3, nppiAndC_8u_C3R>::call, NppBitwiseC<CV_8U, 4, nppiAndC_8u_C4R>::call},
+        {NppBitwiseC<CV_8U , 1, nppiAndC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiAndC_8u_C4R >::call},
        {0,0,0,0},
         {NppBitwiseC<CV_16U, 1, nppiAndC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
         {0,0,0,0},
@@ -1703,10 +1715,9 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
 void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-
     static const func_t funcs[5][4] =
     {
-        {NppBitwiseC<CV_8U, 1, nppiXorC_8u_C1R>::call, 0, NppBitwiseC<CV_8U, 3, nppiXorC_8u_C3R>::call, NppBitwiseC<CV_8U, 4, nppiXorC_8u_C4R>::call},
+        {NppBitwiseC<CV_8U , 1, nppiXorC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiXorC_8u_C4R >::call},
         {0,0,0,0},
         {NppBitwiseC<CV_16U, 1, nppiXorC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
         {0,0,0,0},
@@ -1822,107 +1833,140 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
-
-    template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
-
-    template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
-
-    template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
+    template <typename T> void min_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void max_gpu(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+
+    template <typename T> void min_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void max_gpu(const DevMem2Db src, T val, DevMem2Db dst, cudaStream_t stream);
 }}}
 
-namespace
-{
-    template <typename T>
-    void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
-    {
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        dst.create(src1.size(), src1.type());
-        ::cv::gpu::device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
-    }
-
-    template <typename T>
-    void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
-    {
-        dst.create(src1.size(), src1.type());
-        ::cv::gpu::device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
-    }
-
-    template <typename T>
-    void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
-    {
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        dst.create(src1.size(), src1.type());
-        ::cv::gpu::device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
-    }
-
-    template <typename T>
-    void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
-    {
-        dst.create(src1.size(), src1.type());
-        ::cv::gpu::device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
-    }
-}
-
 void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
+    using namespace cv::gpu::device;
+
+    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        min_caller<unsigned char>, min_caller<signed char>, min_caller<unsigned short>, min_caller<short>, min_caller<int>,
-        min_caller<float>, min_caller<double>
+        min_gpu<unsigned char>,
+        min_gpu<signed char>,
+        min_gpu<unsigned short>,
+        min_gpu<short>,
+        min_gpu<int>,
+        min_gpu<float>,
+        min_gpu<double>
     };
-    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
+
+    CV_Assert(src1.depth() <= CV_64F);
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src1.size(), src1.type());
+
+    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
+void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        min_caller<unsigned char>, min_caller<signed char>, min_caller<unsigned short>, min_caller<short>, min_caller<int>,
-        min_caller<float>, min_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
-{
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
+    using namespace cv::gpu::device;
+
+    typedef void (*func_t)(const DevMem2Db src1, const DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        max_caller<unsigned char>, max_caller<signed char>, max_caller<unsigned short>, max_caller<short>, max_caller<int>,
-        max_caller<float>, max_caller<double>
+        max_gpu<unsigned char>,
+        max_gpu<signed char>,
+        max_gpu<unsigned short>,
+        max_gpu<short>,
+        max_gpu<int>,
+        max_gpu<float>,
+        max_gpu<double>
     };
-    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
+
+    CV_Assert(src1.depth() <= CV_64F);
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+
+    if (src1.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src1.size(), src1.type());
+
+    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
+namespace
 {
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        max_caller<unsigned char>, max_caller<signed char>, max_caller<unsigned short>, max_caller<short>, max_caller<int>,
-        max_caller<float>, max_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
+    template <typename T> void minScalar(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream)
+    {
+        cv::gpu::device::min_gpu(src, saturate_cast<T>(val), dst, stream);
+    }
+
+    template <typename T> void maxScalar(const DevMem2Db src, double val, DevMem2Db dst, cudaStream_t stream)
+    {
+        cv::gpu::device::max_gpu(src, saturate_cast<T>(val), dst, stream);
+    }
+}
+
+void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
+{
+    typedef void (*func_t)(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        minScalar<unsigned char>,
+        minScalar<signed char>,
+        minScalar<unsigned short>,
+        minScalar<short>,
+        minScalar<int>,
+        minScalar<float>,
+        minScalar<double>
+    };
+
+    CV_Assert(src.depth() <= CV_64F);
+    CV_Assert(src.channels() == 1);
+
+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src.size(), src.type());
+
+    funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
+{
+    typedef void (*func_t)(const DevMem2Db src1, double src2, DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        maxScalar<unsigned char>,
+        maxScalar<signed char>,
+        maxScalar<unsigned short>,
+        maxScalar<short>,
+        maxScalar<int>,
+        maxScalar<float>,
+        maxScalar<double>
+    };
+
+    CV_Assert(src.depth() <= CV_64F);
+    CV_Assert(src.channels() == 1);
+
+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src.size(), src.type());
+
+    funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
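
The scalar min/max overloads saturate the host value into the matrix depth before launching (saturate_cast<T>(val) inside minScalar/maxScalar), so out-of-range constants clamp rather than wrap. A hedged usage sketch:

    cv::gpu::GpuMat src(16, 16, CV_8UC1, cv::Scalar(200)), dst;
    cv::gpu::min(src, 300.0, dst); // 300 saturates to 255 for CV_8U, so dst == src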
@@ -1947,6 +1991,12 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
     CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
     CV_Assert(type <= THRESH_TOZERO_INV);
 
+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
     dst.create(src.size(), src.type());
 
     cudaStream_t stream = StreamAccessor::getStream(s);
@@ -1967,9 +2017,8 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
     }
     else
     {
-        typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream);
-
-        static const caller_t callers[] =
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream);
+        static const func_t funcs[] =
         {
             threshold_caller<unsigned char>, threshold_caller<signed char>,
             threshold_caller<unsigned short>, threshold_caller<short>,
@@ -1982,7 +2031,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
             maxVal = cvRound(maxVal);
         }
 
-        callers[src.depth()](src, dst, thresh, maxVal, type, stream);
+        funcs[src.depth()](src, dst, thresh, maxVal, type, stream);
     }
 
     return thresh;
...@@ -1993,8 +2042,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double ...@@ -1993,8 +2042,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
template<typename T> template<typename T> void pow_caller(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
void pow_caller(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
}}} }}}
void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream) void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
...@@ -2002,7 +2050,6 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream) ...@@ -2002,7 +2050,6 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
using namespace cv::gpu::device; using namespace cv::gpu::device;
typedef void (*func_t)(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream); typedef void (*func_t)(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
static const func_t funcs[] = static const func_t funcs[] =
{ {
pow_caller<unsigned char>, pow_caller<signed char>, pow_caller<unsigned char>, pow_caller<signed char>,
...@@ -2010,6 +2057,14 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream) ...@@ -2010,6 +2057,14 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
pow_caller<int>, pow_caller<float>, pow_caller<double> pow_caller<int>, pow_caller<float>, pow_caller<double>
}; };
CV_Assert(src.depth() <= CV_64F);
if (src.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
dst.create(src.size(), src.type());
funcs[src.depth()](src.reshape(1), power, dst.reshape(1), StreamAccessor::getStream(stream));
@@ -2075,8 +2130,7 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int
NppAlphaComp<CV_16U, nppiAlphaComp_16u_AC4R>::call,
0,
NppAlphaComp<CV_32S, nppiAlphaComp_32s_AC4R>::call,
NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call,
0
NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call
};
CV_Assert(img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4);
@@ -2085,7 +2139,6 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int
dst.create(img1.size(), img1.type());
const func_t func = funcs[img1.depth()];
CV_Assert(func != 0);
func(img1, img2, dst, npp_alpha_ops[alpha_op], StreamAccessor::getStream(stream));
}
@@ -2569,6 +2622,14 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type();
CV_Assert(src1.depth() <= CV_64F && src2.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
if (src1.depth() == CV_64F || src2.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
dst.create(src1.size(), dtype);
const GpuMat* psrc1 = &src1;
@@ -2581,7 +2642,9 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
}
const func_t func = funcs[psrc1->depth()][psrc2->depth()][dst.depth()];
CV_Assert(func != 0);
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream));
}
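Replacing the bare CV_Assert(func != 0) with an explicit CV_Error gives callers a stable error code to test against. Sketch of assumed caller-side usage (src1/src2 stand for already-uploaded CV_64F inputs; this snippet is not from the commit):

#include <iostream>
#include "opencv2/gpu/gpu.hpp"

void tryAddWeighted(const cv::gpu::GpuMat& src1, const cv::gpu::GpuMat& src2)
{
    cv::gpu::GpuMat dst;
    try
    {
        cv::gpu::addWeighted(src1, 0.5, src2, 0.5, 0.0, dst, CV_64F);
    }
    catch (const cv::Exception& e)
    {
        if (e.code == CV_StsUnsupportedFormat)   // raised on devices without NATIVE_DOUBLE
            std::cout << "falling back to CPU: " << e.what() << std::endl;
    }
}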
@@ -132,7 +132,7 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
cudaSafeCall( cudaDeviceSynchronize() );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
}
@@ -148,6 +148,8 @@ double cv::gpu::norm(const GpuMat& src, int normType)
double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
{
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
GpuMat src_single_channel = src.reshape(1);
if (normType == NORM_L1)
@@ -156,22 +158,16 @@ double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
if (normType == NORM_L2)
return std::sqrt(sqrSum(src_single_channel, buf)[0]);
if (normType == NORM_INF)
{
double min_val, max_val;
minMax(src_single_channel, &min_val, &max_val, GpuMat(), buf);
return std::max(std::abs(min_val), std::abs(max_val));
}
CV_Error(CV_StsBadArg, "norm: unsupported norm type");
return 0;
// NORM_INF
double min_val, max_val;
minMax(src_single_channel, &min_val, &max_val, GpuMat(), buf);
return std::max(std::abs(min_val), std::abs(max_val));
}
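The rewritten tail relies on the identity ||src||_inf = max(|min|, |max|); the old NORM_INF branch plus its unreachable CV_Error fall-through collapses into straight-line code because normType is now validated up front. A standalone check of the identity (a sketch in plain C++, not OpenCV code):

#include <algorithm>
#include <cmath>
#include <cassert>

int main()
{
    double vals[] = { -7.0, 3.0, 5.0 };
    double min_val = *std::min_element(vals, vals + 3);
    double max_val = *std::max_element(vals, vals + 3);
    // L-infinity norm: largest absolute value, recoverable from just min and max
    assert(std::max(std::abs(min_val), std::abs(max_val)) == 7.0);
    return 0;
}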
double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(src1.type() == CV_8UC1);
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
@@ -184,7 +180,7 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
sz.height = src1.rows;
int funcIdx = normType >> 1;
double retVal;
DeviceBuffer dbuf;
@@ -192,7 +188,7 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
cudaSafeCall( cudaDeviceSynchronize() );
dbuf.download(&retVal);
return retVal;
@@ -201,9 +197,9 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
////////////////////////////////////////////////////////////////////////
// Sum
namespace cv { namespace gpu { namespace device
{
namespace matrix_reductions
{
namespace sum
{
@@ -230,34 +226,36 @@ namespace cv { namespace gpu { namespace device
}
}}}
Scalar cv::gpu::sum(const GpuMat& src)
{
GpuMat buf;
return sum(src, buf);
}
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
using namespace ::cv::gpu::device::matrix_reductions::sum;
using namespace cv::gpu::device::matrix_reductions::sum;
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
static Caller multipass_callers[7] =
static Caller multipass_callers[] =
{
sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
sumMultipassCaller<int>, sumMultipassCaller<float>, 0
sumMultipassCaller<int>, sumMultipassCaller<float>
};
static Caller singlepass_callers[7] = {
static Caller singlepass_callers[] = {
sumCaller<unsigned char>, sumCaller<char>,
sumCaller<unsigned short>, sumCaller<short>,
sumCaller<int>, sumCaller<float>, 0
sumCaller<int>, sumCaller<float>
};
CV_Assert(src.depth() <= CV_32F);
Size buf_size;
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers;
@@ -265,7 +263,6 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
callers = singlepass_callers;
Caller caller = callers[src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
@@ -273,35 +270,37 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
}
Scalar cv::gpu::absSum(const GpuMat& src)
{
GpuMat buf;
return absSum(src, buf);
}
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{
using namespace ::cv::gpu::device::matrix_reductions::sum;
using namespace cv::gpu::device::matrix_reductions::sum;
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
static Caller multipass_callers[7] =
static Caller multipass_callers[] =
{
absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0
absSumMultipassCaller<int>, absSumMultipassCaller<float>
};
static Caller singlepass_callers[7] =
static Caller singlepass_callers[] =
{
absSumCaller<unsigned char>, absSumCaller<char>,
absSumCaller<unsigned short>, absSumCaller<short>,
absSumCaller<int>, absSumCaller<float>, 0
absSumCaller<int>, absSumCaller<float>
};
CV_Assert(src.depth() <= CV_32F);
Size buf_size;
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers;
@@ -309,7 +308,6 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
callers = singlepass_callers;
Caller caller = callers[src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "absSum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
@@ -317,43 +315,44 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
}
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
return sqrSum(src, buf);
}
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
using namespace ::cv::gpu::device::matrix_reductions::sum;
using namespace cv::gpu::device::matrix_reductions::sum;
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
static Caller multipass_callers[7] =
static Caller multipass_callers[] =
{
sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0
sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>
};
static Caller singlepass_callers[7] =
{
sqrSumCaller<unsigned char>, sqrSumCaller<char>,
sqrSumCaller<unsigned short>, sqrSumCaller<short>,
sqrSumCaller<int>, sqrSumCaller<float>, 0
sqrSumCaller<int>, sqrSumCaller<float>
};
CV_Assert(src.depth() <= CV_32F);
Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers;
Size buf_size;
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller caller = callers[src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
@@ -363,24 +362,24 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
////////////////////////////////////////////////////////////////////////
// Find min or max
namespace cv { namespace gpu { namespace device
{
namespace matrix_reductions
{
namespace minmax
{
void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T>
void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);
template <typename T>
void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template <typename T>
void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);
template <typename T>
void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
}
@@ -401,41 +400,47 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
static Caller multipass_callers[7] =
static Caller multipass_callers[] =
{
minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] =
static Caller singlepass_callers[] =
{
minMaxCaller<unsigned char>, minMaxCaller<char>,
minMaxCaller<unsigned short>, minMaxCaller<short>,
minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
};
static MaskedCaller masked_multipass_callers[7] =
static MaskedCaller masked_multipass_callers[] =
{
minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
};
static MaskedCaller masked_singlepass_callers[7] =
static MaskedCaller masked_singlepass_callers[] =
{
minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
};
CV_Assert(src.depth() <= CV_64F);
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
if (src.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size buf_size;
getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
@@ -447,7 +452,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
callers = singlepass_callers;
Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
CV_Assert(caller != 0);
caller(src, minVal, maxVal, buf);
}
else
@@ -457,7 +462,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
callers = masked_singlepass_callers;
MaskedCaller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
CV_Assert(caller != 0);
caller(src, mask, minVal, maxVal, buf);
}
}
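Indexing the seven-entry caller tables with src.type() rather than src.depth() is safe only because of the channels-equals-one assertion above: for single-channel matrices the OpenCV type equals the depth. A one-line standalone check (a sketch, assuming only the core headers):

#include "opencv2/core/core.hpp"
#include <cassert>

int main()
{
    // CV_MAKETYPE(depth, 1) == depth, so type() == depth() whenever channels() == 1
    assert(CV_MAKETYPE(CV_64F, 1) == CV_64F);
    return 0;
}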
@@ -466,36 +471,36 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
////////////////////////////////////////////////////////////////////////
// Locate min and max
namespace cv { namespace gpu { namespace device
{
namespace matrix_reductions
{
namespace minmaxloc
{
void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
template <typename T>
void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
template <typename T>
void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
template <typename T>
void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
}
}
}}}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
GpuMat valBuf, locBuf;
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
}
@@ -508,45 +513,51 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
static Caller multipass_callers[7] =
static Caller multipass_callers[] =
{
minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] =
static Caller singlepass_callers[] =
{
minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
};
static MaskedCaller masked_multipass_callers[7] =
static MaskedCaller masked_multipass_callers[] =
{
minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
};
static MaskedCaller masked_singlepass_callers[7] =
static MaskedCaller masked_singlepass_callers[] =
{
minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
};
CV_Assert(src.depth() <= CV_64F);
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
if (src.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
int minLoc_[2];
int maxLoc_[2];
Size valbuf_size, locbuf_size;
getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width,
valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
@@ -558,7 +569,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
callers = singlepass_callers;
Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
CV_Assert(caller != 0);
caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
}
else
@@ -568,7 +579,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
callers = masked_singlepass_callers;
MaskedCaller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
CV_Assert(caller != 0);
caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
}
@@ -579,18 +590,18 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
//////////////////////////////////////////////////////////////////////////////
// Count non-zero elements
namespace cv { namespace gpu { namespace device
{
namespace matrix_reductions
{
namespace countnonzero
{
void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);
template <typename T>
int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);
}
}
@@ -609,21 +620,28 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
typedef int (*Caller)(const DevMem2Db src, PtrStepb buf);
static Caller multipass_callers[7] =
{
countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] =
{
countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };
CV_Assert(src.depth() <= CV_64F);
CV_Assert(src.channels() == 1);
if (src.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
@@ -633,16 +651,16 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
callers = singlepass_callers;
Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
CV_Assert(caller != 0);
return caller(src, buf);
}
//////////////////////////////////////////////////////////////////////////////
// reduce
namespace cv { namespace gpu { namespace device
{
namespace matrix_reductions
{
template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
@@ -666,7 +684,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
{
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
static const caller_t callers[6][6] =
{
{
reduceRows_gpu<unsigned char, int, unsigned char>,
@@ -719,6 +737,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
};
const caller_t func = callers[src.depth()][dst.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
@@ -728,7 +747,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
{
typedef void (*caller_t)(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
static const caller_t callers[6][6] =
{
{
reduceCols_gpu<unsigned char, int, unsigned char>,
@@ -781,10 +800,11 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
};
const caller_t func = callers[src.depth()][dst.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
}
}
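reduce() dispatches through a table indexed by [src.depth()][dst.depth()], where null entries mark unsupported depth pairs and are now reported as CV_StsUnsupportedFormat. A minimal standalone illustration of the pattern (not OpenCV code; names are invented for the sketch):

#include <cstdio>

typedef void (*caller_t)();
static void impl_8u_to_32f() { std::puts("8u -> 32f"); }

int main()
{
    static const caller_t callers[2][2] =
    {
        { 0, impl_8u_to_32f },   // row: source depth; column: destination depth
        { 0, 0 }                 // a fully unsupported source depth
    };
    const caller_t func = callers[0][1];
    if (!func)
        return 1;                // the library raises CV_StsUnsupportedFormat here
    func();
    return 0;
}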
@@ -74,7 +74,7 @@
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "npp.h"
#ifdef HAVE_CUFFT
#include "cufft.h"
#endif
@@ -85,7 +85,7 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/stream_accessor.hpp"
#include "nvidia/core/NCV.hpp"
#include "nvidia/NPP_staging/NPP_staging.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
@@ -106,7 +106,7 @@
#error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"
#endif
static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); }
static inline void throw_nogpu() { CV_Error(CV_StsNotImplemented, "The called functionality is disabled for current build or platform"); }
#else /* defined(HAVE_CUDA) */
@@ -995,13 +995,28 @@ TEST_P(AbsDiff, Array)
cv::Mat src1 = randomMat(size, depth);
cv::Mat src2 = randomMat(size, depth);
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::absdiff(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::absdiff(loadMat(src1), loadMat(src2), dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::absdiff(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);
cv::Mat dst_gold;
cv::absdiff(src1, src2, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
TEST_P(AbsDiff, Scalar)
@@ -1009,13 +1024,28 @@ TEST_P(AbsDiff, Scalar)
cv::Mat src = randomMat(size, depth);
cv::Scalar val = randomScalar(0.0, 255.0);
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::absdiff(loadMat(src, useRoi), val, dst);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::absdiff(loadMat(src), val, dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::absdiff(loadMat(src, useRoi), val, dst);
cv::Mat dst_gold;
cv::absdiff(src, val, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, depth <= CV_32F ? 1.0 : 1e-5);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Core, AbsDiff, testing::Combine(
@@ -1243,6 +1273,40 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
////////////////////////////////////////////////////////////////////////////////
// Exp
template <typename T> void expImpl(const cv::Mat& src, cv::Mat& dst)
{
dst.create(src.size(), src.type());
for (int y = 0; y < src.rows; ++y)
{
for (int x = 0; x < src.cols; ++x)
dst.at<T>(y, x) = cv::saturate_cast<T>(static_cast<int>(std::exp(static_cast<float>(src.at<T>(y, x)))));
}
}
void expImpl_float(const cv::Mat& src, cv::Mat& dst)
{
dst.create(src.size(), src.type());
for (int y = 0; y < src.rows; ++y)
{
for (int x = 0; x < src.cols; ++x)
dst.at<float>(y, x) = std::exp(static_cast<float>(src.at<float>(y, x)));
}
}
void expGold(const cv::Mat& src, cv::Mat& dst)
{
typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst);
const func_t funcs[] =
{
expImpl<uchar>, expImpl<schar>, expImpl<ushort>, expImpl<short>,
expImpl<int>, expImpl_float
};
funcs[src.depth()](src, dst);
}
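expGold mirrors the GPU kernel's integer behavior: the result is computed in float, truncated to int, then saturated to the destination type, so for example exp(6) ≈ 403.43 becomes 255 in an 8-bit matrix. A standalone check of that rounding chain (a sketch mimicking cv::saturate_cast for non-negative values):

#include <cmath>
#include <cassert>

int main()
{
    float e = std::exp(6.0f);                 // ~403.43
    int i = static_cast<int>(e);              // truncates to 403
    unsigned char r = i > 255 ? 255 : static_cast<unsigned char>(i);
    assert(r == 255);                         // saturated, matching expImpl<uchar>
    return 0;
}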
PARAM_TEST_CASE(Exp, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
{
cv::gpu::DeviceInfo devInfo;
@@ -1269,7 +1333,7 @@ TEST_P(Exp, Accuracy)
cv::gpu::exp(loadMat(src, useRoi), dst);
cv::Mat dst_gold;
cv::exp(src, dst_gold);
expGold(src, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, 1e-2);
}
@@ -1277,7 +1341,10 @@ TEST_P(Exp, Accuracy)
INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_32FC1)),
testing::Values(MatType(CV_8UC1),
MatType(CV_16UC1),
MatType(CV_16SC1),
MatType(CV_32FC1)),
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
@@ -1311,13 +1378,28 @@ TEST_P(Compare, Accuracy)
cv::Mat src1 = randomMat(size, depth);
cv::Mat src2 = randomMat(size, depth);
cv::gpu::GpuMat dst = createMat(size, CV_8UC1, useRoi);
cv::gpu::compare(loadMat(src1, useRoi), loadMat(src2, useRoi), dst, cmp_code);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::compare(loadMat(src1), loadMat(src2), dst, cmp_code);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, CV_8UC1, useRoi);
cv::gpu::compare(loadMat(src1, useRoi), loadMat(src2, useRoi), dst, cmp_code);
cv::Mat dst_gold;
cv::compare(src1, src2, dst_gold, cmp_code);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Compare, testing::Combine(
@@ -1635,17 +1717,60 @@ PARAM_TEST_CASE(Min, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
}
};
TEST_P(Min, Accuracy)
TEST_P(Min, Array)
{
cv::Mat src1 = randomMat(size, depth);
cv::Mat src2 = randomMat(size, depth);
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::min(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::min(loadMat(src1), loadMat(src2), dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::min(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);
cv::Mat dst_gold = cv::min(src1, src2);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
TEST_P(Min, Scalar)
{
cv::Mat src = randomMat(size, depth);
double val = randomDouble(0.0, 255.0);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::min(loadMat(src), val, dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::min(loadMat(src, useRoi), val, dst);
cv::Mat dst_gold = cv::min(src, val);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
} }
INSTANTIATE_TEST_CASE_P(GPU_Core, Min, testing::Combine(
@@ -1675,17 +1800,60 @@ PARAM_TEST_CASE(Max, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
}
};
TEST_P(Max, Accuracy)
TEST_P(Max, Array)
{
cv::Mat src1 = randomMat(size, depth);
cv::Mat src2 = randomMat(size, depth);
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::max(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::max(loadMat(src1), loadMat(src2), dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::max(loadMat(src1, useRoi), loadMat(src2, useRoi), dst);
cv::Mat dst_gold = cv::max(src1, src2);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
cv::Mat dst_gold = cv::max(src1, src2);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
TEST_P(Max, Scalar)
{
cv::Mat src = randomMat(size, depth);
double val = randomDouble(0.0, 255.0);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::max(loadMat(src), val, dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::max(loadMat(src, useRoi), val, dst);
cv::Mat dst_gold = cv::max(src, val);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Max, testing::Combine(
@@ -1723,13 +1891,28 @@ TEST_P(Pow, Accuracy)
if (src.depth() < CV_32F)
power = static_cast<int>(power);
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::pow(loadMat(src, useRoi), power, dst);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::pow(loadMat(src), power, dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
cv::gpu::pow(loadMat(src, useRoi), power, dst);
cv::Mat dst_gold;
cv::pow(src, power, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 0.0 : 1e-1);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Pow, testing::Combine(
@@ -1750,7 +1933,6 @@ PARAM_TEST_CASE(AddWeighted, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth,
int dst_depth;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
@@ -1772,13 +1954,28 @@ TEST_P(AddWeighted, Accuracy)
double beta = randomDouble(-10.0, 10.0);
double gamma = randomDouble(-10.0, 10.0);
cv::gpu::GpuMat dst = createMat(size, dst_depth, useRoi);
cv::gpu::addWeighted(loadMat(src1, useRoi), alpha, loadMat(src2, useRoi), beta, gamma, dst, dst_depth);
if ((depth1 == CV_64F || depth2 == CV_64F || dst_depth == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::addWeighted(loadMat(src1), alpha, loadMat(src2), beta, gamma, dst, dst_depth);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, dst_depth, useRoi);
cv::gpu::addWeighted(loadMat(src1, useRoi), alpha, loadMat(src2, useRoi), beta, gamma, dst, dst_depth);
cv::Mat dst_gold;
cv::addWeighted(src1, alpha, src2, beta, gamma, dst_gold, dst_depth);
EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 1.0 : 1e-12);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Core, AddWeighted, testing::Combine(
@@ -1823,13 +2020,52 @@ TEST_P(GEMM, Accuracy)
double alpha = randomDouble(-10.0, 10.0);
double beta = randomDouble(-10.0, 10.0);
cv::gpu::GpuMat dst = createMat(size, type, useRoi);
cv::gpu::gemm(loadMat(src1, useRoi), loadMat(src2, useRoi), alpha, loadMat(src3, useRoi), beta, dst, flags);
#ifndef HAVE_CUBLAS
try
{
cv::gpu::GpuMat dst;
cv::gpu::gemm(loadMat(src1), loadMat(src2), alpha, loadMat(src3), beta, dst, flags);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsNotImplemented, e.code);
}
#else
if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::gemm(loadMat(src1), loadMat(src2), alpha, loadMat(src3), beta, dst, flags);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else if (type == CV_64FC2 && flags != 0)
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::gemm(loadMat(src1), loadMat(src2), alpha, loadMat(src3), beta, dst, flags);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsNotImplemented, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(size, type, useRoi);
cv::gpu::gemm(loadMat(src1, useRoi), loadMat(src2, useRoi), alpha, loadMat(src3, useRoi), beta, dst, flags);
cv::Mat dst_gold;
cv::gemm(src1, src2, alpha, src3, beta, dst_gold, flags);
EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1e-10);
}
#endif
}
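The GEMM test now distinguishes three expected failure modes before falling through to the CPU comparison (a summary of the branches above, drawn only from this test's logic):

// built without CUBLAS                           -> CV_StsNotImplemented, any input
// CV_64F input on a device without NATIVE_DOUBLE -> CV_StsUnsupportedFormat
// CV_64FC2 input combined with any GEMM_*_T flag -> CV_StsNotImplemented
// everything else                                -> EXPECT_MAT_NEAR against cv::gemm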
INSTANTIATE_TEST_CASE_P(GPU_Core, GEMM, testing::Combine(
@@ -1864,13 +2100,28 @@ TEST_P(Transpose, Accuracy)
{
cv::Mat src = randomMat(size, type);
cv::gpu::GpuMat dst = createMat(cv::Size(size.height, size.width), type, useRoi);
cv::gpu::transpose(loadMat(src, useRoi), dst);
if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
cv::gpu::GpuMat dst;
cv::gpu::transpose(loadMat(src), dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
cv::gpu::GpuMat dst = createMat(cv::Size(size.height, size.width), type, useRoi);
cv::gpu::transpose(loadMat(src, useRoi), dst);
cv::Mat dst_gold;
cv::transpose(src, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Core, Transpose, testing::Combine(
@@ -2498,14 +2749,29 @@ TEST_P(MinMax, WithoutMask)
{
cv::Mat src = randomMat(size, depth);
double minVal, maxVal;
cv::gpu::minMax(loadMat(src, useRoi), &minVal, &maxVal);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::gpu::minMax(loadMat(src), &minVal, &maxVal);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::gpu::minMax(loadMat(src, useRoi), &minVal, &maxVal);
double minVal_gold, maxVal_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
}
}
TEST_P(MinMax, WithMask)
@@ -2513,21 +2779,60 @@ TEST_P(MinMax, WithMask)
cv::Mat src = randomMat(size, depth);
cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
double minVal, maxVal;
cv::gpu::minMax(loadMat(src, useRoi), &minVal, &maxVal, loadMat(mask, useRoi));
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::gpu::minMax(loadMat(src), &minVal, &maxVal, loadMat(mask));
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::gpu::minMax(loadMat(src, useRoi), &minVal, &maxVal, loadMat(mask, useRoi));
double minVal_gold, maxVal_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, 0, 0, mask);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
}
}
TEST_P(MinMax, NullPtr)
{
cv::Mat src = randomMat(size, depth);
cv::gpu::minMax(loadMat(src, useRoi), 0, 0);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::gpu::minMax(loadMat(src), &minVal, 0);
cv::gpu::minMax(loadMat(src), 0, &maxVal);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::gpu::minMax(loadMat(src, useRoi), &minVal, 0);
cv::gpu::minMax(loadMat(src, useRoi), 0, &maxVal);
double minVal_gold, maxVal_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, 0, 0);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Core, MinMax, testing::Combine(
@@ -2585,19 +2890,35 @@ TEST_P(MinMaxLoc, WithoutMask)
{
cv::Mat src = randomMat(size, depth);
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, &maxVal, &minLoc, &maxLoc);
if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::gpu::minMaxLoc(loadMat(src), &minVal, &maxVal, &minLoc, &maxLoc);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, &maxVal, &minLoc, &maxLoc);
double minVal_gold, maxVal_gold; double minVal_gold, maxVal_gold;
cv::Point minLoc_gold, maxLoc_gold; cv::Point minLoc_gold, maxLoc_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold); minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold);
EXPECT_DOUBLE_EQ(minVal_gold, minVal); EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal); EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
expectEqual(src, minLoc_gold, minLoc); expectEqual(src, minLoc_gold, minLoc);
expectEqual(src, maxLoc_gold, maxLoc); expectEqual(src, maxLoc_gold, maxLoc);
}
} }
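The location assertions compare pixel values rather than coordinates, since an extremum can legitimately occur at several positions. A minimal sketch of what such a check amounts to, under a hypothetical name (the suite's actual expectEqual helper may be defined differently):

// Hypothetical value-based location check: the found location need not
// coincide with the gold location, it only has to hold the same pixel value.
template <typename T>
void expectEqualValue(const cv::Mat& src, cv::Point loc_gold, cv::Point loc)
{
    EXPECT_EQ(src.at<T>(loc_gold), src.at<T>(loc));
}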
TEST_P(MinMaxLoc, WithMask)
...@@ -2605,26 +2926,76 @@ TEST_P(MinMaxLoc, WithMask)
    cv::Mat src = randomMat(size, depth);
    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            double minVal, maxVal;
            cv::Point minLoc, maxLoc;
            cv::gpu::minMaxLoc(loadMat(src), &minVal, &maxVal, &minLoc, &maxLoc, loadMat(mask));
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, &maxVal, &minLoc, &maxLoc, loadMat(mask, useRoi));

        double minVal_gold, maxVal_gold;
        cv::Point minLoc_gold, maxLoc_gold;
        minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold, mask);

        EXPECT_DOUBLE_EQ(minVal_gold, minVal);
        EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);

        expectEqual(src, minLoc_gold, minLoc);
        expectEqual(src, maxLoc_gold, maxLoc);
    }
}
TEST_P(MinMaxLoc, NullPtr)
{
    cv::Mat src = randomMat(size, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            double minVal, maxVal;
            cv::Point minLoc, maxLoc;
            cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, 0, 0, 0);
            cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, &maxVal, 0, 0);
            cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, 0, &minLoc, 0);
            cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, 0, 0, &maxLoc);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMaxLoc(loadMat(src, useRoi), &minVal, 0, 0, 0);
        cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, &maxVal, 0, 0);
        cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, 0, &minLoc, 0);
        cv::gpu::minMaxLoc(loadMat(src, useRoi), 0, 0, 0, &maxLoc);

        double minVal_gold, maxVal_gold;
        cv::Point minLoc_gold, maxLoc_gold;
        minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold);

        EXPECT_DOUBLE_EQ(minVal_gold, minVal);
        EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);

        expectEqual(src, minLoc_gold, minLoc);
        expectEqual(src, maxLoc_gold, maxLoc);
    }
}
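For reference, the supportFeature predicate used in every guard presumably folds the compile-time and run-time checks into one call. A minimal sketch of that shape (an assumption about the test utility, not taken from this commit):

// Assumed shape of a supportFeature-style predicate: true only when the
// binary was compiled for the feature and the current device reports it.
bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature)
{
    return cv::gpu::TargetArchs::builtWith(feature) && info.supports(feature);
}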
INSTANTIATE_TEST_CASE_P(GPU_Core, MinMaxLoc, testing::Combine(
...@@ -2661,12 +3032,25 @@ TEST_P(CountNonZero, Accuracy)
    cv::Mat src;
    srcBase.convertTo(src, depth);

    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
    {
        try
        {
            cv::gpu::countNonZero(loadMat(src));
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
        }
    }
    else
    {
        int val = cv::gpu::countNonZero(loadMat(src, useRoi));

        int val_gold = cv::countNonZero(src);

        ASSERT_EQ(val_gold, val);
    }
}
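Outside the test suite, caller code can apply the same feature check before dispatching double-precision work to the GPU. A minimal usage sketch (hypothetical application code, using only the public cv::gpu API):

// Hypothetical caller-side dispatch (not from this commit): route CV_64F
// input to the CPU when the device lacks native double support.
void safeMinMax(const cv::Mat& src, double* minVal, double* maxVal)
{
    if (src.depth() != CV_64F ||
        (cv::gpu::TargetArchs::builtWith(cv::gpu::NATIVE_DOUBLE) &&
         cv::gpu::DeviceInfo().supports(cv::gpu::NATIVE_DOUBLE)))
    {
        cv::gpu::minMax(cv::gpu::GpuMat(src), minVal, maxVal);
    }
    else
    {
        cv::minMaxLoc(src, minVal, maxVal); // CPU fallback
    }
}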
INSTANTIATE_TEST_CASE_P(GPU_Core, CountNonZero, testing::Combine(
...