Commit deac5d97 authored by Vladislav Vinogradov

fixed errors in the gpu module on old video cards (SURF_GPU, BruteForceMatcher_GPU, min/max, setTo, convertTo)

added assertions after all kernel calls
parent 5f175f95
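The second change in the message is mechanical: every kernel launch is now followed by cudaSafeCall( cudaGetLastError() ), and synchronous (stream 0) paths additionally synchronize so execution errors surface at the call site. A minimal sketch of that pattern, assuming a simplified cudaSafeCall (OpenCV's own macro lives in safe_call.hpp) and a placeholder kernel:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for OpenCV's cudaSafeCall (the real macro lives in safe_call.hpp):
// abort with a readable message on any CUDA error.
#define cudaSafeCall(expr) cudaCheck((expr), __FILE__, __LINE__)

inline void cudaCheck(cudaError_t err, const char* file, int line)
{
    if (err != cudaSuccess)
    {
        std::fprintf(stderr, "CUDA error: %s at %s:%d\n", cudaGetErrorString(err), file, line);
        std::exit(EXIT_FAILURE);
    }
}

// Placeholder kernel standing in for any kernel touched by this commit.
__global__ void scaleKernel(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

void scaleCaller(float* d_data, int n, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 grid((n + threads.x - 1) / threads.x);

    scaleKernel<<<grid, threads, 0, stream>>>(d_data, n);
    cudaSafeCall( cudaGetLastError() );      // assert right after the launch: catches invalid configurations

    if (stream == 0)                         // synchronous (default stream) call: also surface execution errors
        cudaSafeCall( cudaThreadSynchronize() );
}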
......@@ -435,8 +435,8 @@ namespace cv
void enqueueCopy(const GpuMat& src, GpuMat& dst);
void enqueueMemSet(const GpuMat& src, Scalar val);
void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);
void enqueueMemSet(GpuMat& src, Scalar val);
void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
// converts matrix type, e.g. from float to uchar, depending on type
void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
......
......@@ -76,18 +76,22 @@ namespace cv { namespace gpu { namespace bfmatcher
{
template <typename T>
void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12);
template <typename T>
void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12);
template <typename T>
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance);
const DevMem2Df& distance,
bool cc_12);
template <typename T>
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance);
const DevMem2Df& distance,
bool cc_12);
template <typename T>
void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
......@@ -160,17 +164,20 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
using namespace cv::gpu::bfmatcher;
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12);
static const match_caller_t match_callers[2][8] =
{
{
matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<char>, matchSingleL1_gpu<unsigned short>,
matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>,
matchSingleL1_gpu<unsigned short>, matchSingleL1_gpu<short>,
matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
},
{
matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<char>, matchSingleL2_gpu<unsigned short>,
matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>,
matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>,
matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
}
};
......@@ -185,9 +192,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
match_caller_t func = match_callers[distType][queryDescs.depth()];
CV_Assert(func != 0);
bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
// For a single train set there is no need to save imgIdx, so we just write imgIdx into trainIdx.
// trainIdx is stored after imgIdx, so its value is not lost.
func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance);
func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12);
}
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,
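The cc_12 flag introduced above combines a build-time check (was code for compute capability 1.2 compiled in?) with a runtime check (does the active device support it?); the device-side callers receive it and choose a code path accordingly. A standalone sketch of the same gating, with hypothetical stand-in paths rather than the real bfmatcher kernels:

#include <opencv2/gpu/gpu.hpp>
#include <cstdio>

// Hypothetical stand-ins for a CC >= 1.2 path and an older-card fallback.
static void matchPathCC12() { std::printf("using the CC >= 1.2 path\n"); }
static void matchPathCC10() { std::printf("using the fallback path for older cards\n"); }

int main()
{
    using namespace cv::gpu;

    // Both conditions must hold: the binary contains code built for CC 1.2,
    // and the current device can actually run it.
    bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);

    if (cc_12)
        matchPathCC12();
    else
        matchPathCC10();

    return 0;
}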
......@@ -284,17 +293,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance);
const DevMem2Df& distance, bool cc_12);
static const match_caller_t match_callers[2][8] =
{
{
matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<char>,
matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,
matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>,
matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
},
{
matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<char>,
matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,
matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,
matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
}
......@@ -311,7 +320,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
match_caller_t func = match_callers[distType][queryDescs.depth()];
CV_Assert(func != 0);
func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance);
bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12);
}
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx,
......@@ -383,11 +394,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
static const match_caller_t match_callers[2][8] =
{
{
knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<char>, knnMatchL1_gpu<unsigned short>,
knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,
knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
},
{
knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<char>, knnMatchL2_gpu<unsigned short>,
knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,
knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
}
};
......@@ -522,11 +533,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
static const radiusMatch_caller_t radiusMatch_callers[2][8] =
{
{
radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<char>, radiusMatchL1_gpu<unsigned short>,
radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,
radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0
},
{
radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<char>, radiusMatchL2_gpu<unsigned short>,
radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,
radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
}
};
......
......@@ -43,6 +43,7 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
......@@ -51,13 +52,9 @@ using namespace cv::gpu::device;
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
#endif
#ifndef FLT_EPSILON
#define FLT_EPSILON 1.192092896e-07F
#endif
namespace cv { namespace gpu { namespace color
{
template<typename T> struct ColorChannel {};
template<typename T> struct ColorChannel;
template<> struct ColorChannel<uchar>
{
typedef float worktype_f;
......@@ -133,6 +130,7 @@ namespace cv { namespace gpu { namespace color
RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -276,6 +274,7 @@ namespace cv { namespace gpu { namespace color
RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -304,6 +303,7 @@ namespace cv { namespace gpu { namespace color
RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -385,6 +385,7 @@ namespace cv { namespace gpu { namespace color
Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -425,6 +426,7 @@ namespace cv { namespace gpu { namespace color
Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -533,6 +535,7 @@ namespace cv { namespace gpu { namespace color
RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -573,6 +576,7 @@ namespace cv { namespace gpu { namespace color
RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -698,6 +702,7 @@ namespace cv { namespace gpu { namespace color
RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -756,6 +761,7 @@ namespace cv { namespace gpu { namespace color
YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace color
RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -960,6 +967,7 @@ namespace cv { namespace gpu { namespace color
XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -1063,8 +1071,8 @@ namespace cv { namespace gpu { namespace color
vmin = fmin(vmin, b);
diff = v - vmin;
s = diff / (float)(fabs(v) + FLT_EPSILON);
diff = (float)(60. / (diff + FLT_EPSILON));
s = diff / (float)(fabs(v) + numeric_limits_gpu<float>::epsilon());
diff = (float)(60. / (diff + numeric_limits_gpu<float>::epsilon()));
if (v == r)
h = (g - b) * diff;
......@@ -1199,6 +1207,8 @@ namespace cv { namespace gpu { namespace color
RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
......@@ -1281,6 +1291,8 @@ namespace cv { namespace gpu { namespace color
HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
......@@ -1342,7 +1354,7 @@ namespace cv { namespace gpu { namespace color
diff = vmax - vmin;
l = (vmax + vmin) * 0.5f;
if (diff > FLT_EPSILON)
if (diff > numeric_limits_gpu<float>::epsilon())
{
s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin);
diff = 60.f / diff;
......@@ -1550,6 +1562,8 @@ namespace cv { namespace gpu { namespace color
HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
......
......@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "safe_call.hpp"
#include "internal_shared.hpp"
......@@ -51,192 +52,6 @@
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv
{
namespace gpu
{
namespace device
{
struct BrdReflect101
{
explicit BrdReflect101(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
return abs(i);
}
__device__ int idx_high(int i) const
{
return last - abs(last - i);
}
__device__ int idx(int i) const
{
return abs(idx_high(i));
}
bool is_range_safe(int mini, int maxi) const
{
return -last <= mini && maxi <= 2 * last;
}
int last;
};
template <typename D>
struct BrdRowReflect101: BrdReflect101
{
explicit BrdRowReflect101(int len): BrdReflect101(len) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i)]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename D>
struct BrdColReflect101: BrdReflect101
{
BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i) * step]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i) * step]);
}
int step;
};
struct BrdReplicate
{
explicit BrdReplicate(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
return max(i, 0);
}
__device__ int idx_high(int i) const
{
return min(i, last);
}
__device__ int idx(int i) const
{
return max(min(i, last), 0);
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int last;
};
template <typename D>
struct BrdRowReplicate: BrdReplicate
{
explicit BrdRowReplicate(int len): BrdReplicate(len) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i)]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename D>
struct BrdColReplicate: BrdReplicate
{
BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i) * step]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i) * step]);
}
int step;
};
template <typename D>
struct BrdRowConstant
{
explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(data[i]) : val;
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i]) : val;
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int len;
D val;
};
template <typename D>
struct BrdColConstant
{
BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i * step]) : val;
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int len;
int step;
D val;
};
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Linear filters
......@@ -329,6 +144,7 @@ namespace cv { namespace gpu { namespace filters
}
filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
......@@ -467,6 +283,7 @@ namespace cv { namespace gpu { namespace filters
}
filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
......@@ -705,14 +522,18 @@ namespace cv { namespace gpu { namespace bf
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
......
......@@ -222,6 +222,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int smem = hists_size + final_hists_size;
compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
img_block_width, grad, qangle, scale, block_hists);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -325,6 +326,8 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
else
cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -421,6 +424,8 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, labels);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -467,6 +472,8 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
block_stride_x;
extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -515,6 +522,8 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
block_stride_x;
extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -640,6 +649,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(
height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -713,6 +724,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(
height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -749,6 +762,8 @@ void resize_8UC4(const DevMem2D& src, DevMem2D dst)
float sx = (float)src.cols / dst.cols;
float sy = (float)src.rows / dst.rows;
resize_8UC4_kernel<<<grid, threads>>>(sx, sy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaUnbindTexture(resize8UC4_tex));
......@@ -776,6 +791,8 @@ void resize_8UC1(const DevMem2D& src, DevMem2D dst)
float sx = (float)src.cols / dst.cols;
float sy = (float)src.rows / dst.rows;
resize_8UC1_kernel<<<grid, threads>>>(sx, sy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaUnbindTexture(resize8UC1_tex));
......
......@@ -137,6 +137,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture(tex_remap) );
......@@ -150,6 +151,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(dst.rows, threads.y);
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
......@@ -259,6 +261,8 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
......@@ -273,6 +277,8 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
......@@ -388,6 +394,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -401,6 +408,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -451,6 +459,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -491,6 +500,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));
extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -598,6 +609,8 @@ namespace cv { namespace gpu { namespace imgproc
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaUnbindTexture(harrisDxTex));
cudaSafeCall(cudaUnbindTexture(harrisDyTex));
......@@ -712,6 +725,8 @@ namespace cv { namespace gpu { namespace imgproc
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
......@@ -746,6 +761,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(src.cols, threads.x));
column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -772,6 +789,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -799,6 +818,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -827,6 +848,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -855,6 +878,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......
......@@ -132,6 +132,8 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -161,6 +163,8 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -222,6 +226,8 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -251,6 +257,8 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -299,6 +307,8 @@ void matchTemplatePrepared_SQDIFF_8U(
w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -348,6 +358,8 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -378,6 +390,8 @@ void matchTemplatePrepared_CCOFF_8U(
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads>>>(
w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -418,6 +432,8 @@ void matchTemplatePrepared_CCOFF_8UC2(
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -472,6 +488,8 @@ void matchTemplatePrepared_CCOFF_8UC3(
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -536,6 +554,8 @@ void matchTemplatePrepared_CCOFF_8UC4(
(float)templ_sum_a / (w * h),
image_sum_r, image_sum_g, image_sum_b, image_sum_a,
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -580,6 +600,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -641,6 +663,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -716,6 +740,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -805,6 +831,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
image_sum_b, image_sqsum_b,
image_sum_a, image_sqsum_a,
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -847,6 +875,8 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
normalizeKernel_8U<4><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -887,6 +917,8 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
extractFirstChannel_32F<4><<<grid, threads>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
......
......@@ -150,6 +150,7 @@ namespace cv { namespace gpu { namespace mathfunc
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -198,6 +199,7 @@ namespace cv { namespace gpu { namespace mathfunc
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......
......@@ -233,6 +233,8 @@ namespace cv { namespace gpu { namespace split_merge {
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -248,6 +250,8 @@ namespace cv { namespace gpu { namespace split_merge {
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -264,6 +268,8 @@ namespace cv { namespace gpu { namespace split_merge {
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -436,6 +442,8 @@ namespace cv { namespace gpu { namespace split_merge {
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -451,6 +459,8 @@ namespace cv { namespace gpu { namespace split_merge {
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
......@@ -467,6 +477,8 @@ namespace cv { namespace gpu { namespace split_merge {
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
......
......@@ -325,6 +325,8 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
};
......@@ -402,6 +404,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
grid.y = divUp(input.rows, threads.y);
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -526,6 +529,7 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......
......@@ -172,6 +172,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -185,6 +186,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -199,6 +201,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -212,6 +215,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -226,6 +230,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -239,6 +244,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -278,6 +284,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(dst_rows, threads.y);
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -321,9 +328,13 @@ namespace cv { namespace gpu { namespace bp
int src_idx = (dst_idx + 1) & 1;
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -443,6 +454,7 @@ namespace cv { namespace gpu { namespace bp
for(int t = 0; t < iters; ++t)
{
one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -505,6 +517,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(disp.rows, threads.y);
output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......
......@@ -382,6 +382,8 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -395,6 +397,9 @@ namespace cv { namespace gpu { namespace csbp
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
else
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
......@@ -578,6 +583,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -700,10 +706,11 @@ namespace cv { namespace gpu { namespace csbp
grid.y = divUp(h, threads.y);
init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
u_cur, d_cur, l_cur, r_cur,
selected_disp_pyr_new, selected_disp_pyr_cur,
data_cost_selected, data_cost,
h, w, nr_plane, h2, w2, nr_plane2);
u_cur, d_cur, l_cur, r_cur,
selected_disp_pyr_new, selected_disp_pyr_cur,
data_cost_selected, data_cost,
h, w, nr_plane, h2, w2, nr_plane2);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -805,6 +812,7 @@ namespace cv { namespace gpu { namespace csbp
for(int t = 0; t < iters; ++t)
{
compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -873,7 +881,9 @@ namespace cv { namespace gpu { namespace csbp
grid.y = divUp(disp.rows, threads.y);
compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,
disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
......
......@@ -61,8 +61,8 @@ void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, CudaMem& /*dst*/) {
void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
......@@ -77,8 +77,10 @@ namespace cv
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}
......@@ -99,6 +101,20 @@ namespace
size_t bwidth = src.cols * src.elemSize();
cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
};
template <typename T>
void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
{
Scalar_<T> sf = s;
matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
}
template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
{
Scalar_<T> sf = s;
matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
}
}
CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };
......@@ -172,14 +188,26 @@ void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(sr
void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val)
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
{
matrix_operations::set_to_without_mask(src, src.depth(), val.val, src.channels(), impl->stream);
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
kernelSet<int>, kernelSet<float>, kernelSet<double>
};
set_callers[src.depth()](src, val, impl->stream);
}
void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask)
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
{
matrix_operations::set_to_with_mask(src, src.depth(), val.val, mask, src.channels(), impl->stream);
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
kernelSetMask<uchar>, kernelSetMask<schar>, kernelSetMask<ushort>, kernelSetMask<short>,
kernelSetMask<int>, kernelSetMask<float>, kernelSetMask<double>
};
set_callers[src.depth()](src, val, mask, impl->stream);
}
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
......
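With the non-const signature and the per-depth dispatch above, a caller can queue a fill on a stream and wait for it later. A brief usage sketch (assumes a CUDA-enabled OpenCV build; the helper name is illustrative):

#include <opencv2/gpu/gpu.hpp>

void asyncFill(cv::gpu::GpuMat& buf)
{
    cv::gpu::Stream stream;
    stream.enqueueMemSet(buf, cv::Scalar::all(0));   // dispatched by buf.depth() to kernelSet<T>
    stream.waitForCompletion();                      // block until the queued fill has finished
}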
......@@ -128,6 +128,8 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria)
{
CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));
if( src.empty() )
CV_Error( CV_StsBadArg, "The input image is empty" );
......@@ -154,6 +156,8 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria)
{
CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));
if( src.empty() )
CV_Error( CV_StsBadArg, "The input image is empty" );
......
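meanShiftFiltering and meanShiftProc now assert compute capability 1.2, so code that might run on older cards can perform the same check up front instead of hitting the assertion. A hedged usage sketch (the wrapper function and radii are illustrative):

#include <opencv2/gpu/gpu.hpp>

void filterIfSupported(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
{
    using namespace cv::gpu;

    // Same check the new assertion performs: built for CC 1.2 and supported by the device.
    bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
    if (!cc12_ok)
        return;                                    // fall back to a CPU path or skip on old cards

    // src must be CV_8UC4; 30/30 are the spatial and color window radii.
    meanShiftFiltering(src, dst, 30, 30,
        cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5, 1));
}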
......@@ -87,8 +87,10 @@ namespace cv
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}
......@@ -363,9 +365,11 @@ namespace
}
};
template <typename T>
void kernelSet(GpuMat& src, const Scalar& s)
{
matrix_operations::set_to_without_mask(src, src.depth(), s.val, src.channels());
Scalar_<T> sf = s;
matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
}
template<int SDEPTH, int SCN> struct NppSetMaskFunc
......@@ -412,9 +416,11 @@ namespace
}
};
template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
matrix_operations::set_to_with_mask(src, src.depth(), s.val, mask, src.channels());
Scalar_<T> sf = s;
matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
}
}
......@@ -433,13 +439,13 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s);
static const set_caller_t set_callers[8][4] =
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet,kernelSet,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet,kernelSet,kernelSet,kernelSet},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet,kernelSet,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet,kernelSet,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet,kernelSet,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet,kernelSet,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet,kernelSet,kernelSet,kernelSet},
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet<ushort>,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet<short>,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>},
{0,0,0,0}
};
set_callers[depth()][channels()-1](*this, s);
......@@ -449,13 +455,13 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);
static const set_caller_t set_callers[8][4] =
{
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>},
{0,0,0,0}
};
set_callers[depth()][channels()-1](*this, s, mask);
......
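The expanded dispatch tables route depths NPP cannot handle (CV_8S, CV_64F, and most multi-channel cases) to the templated CUDA kernels. A short usage sketch (assumes a CUDA-enabled build; note that CV_64F still needs a device with double support):

#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::gpu::GpuMat m8s(64, 64, CV_8SC3);
    m8s.setTo(cv::Scalar(-1, 2, -3));            // routed to kernelSet<schar>

    cv::gpu::GpuMat m64f(64, 64, CV_64FC1);
    m64f.setTo(cv::Scalar::all(3.14));           // routed to kernelSet<double>

    cv::gpu::GpuMat mask(64, 64, CV_8UC1);
    mask.setTo(cv::Scalar::all(1));              // single-channel CV_8U -> NppSet path
    m64f.setTo(cv::Scalar::all(0.0), mask);      // masked fill -> kernelSetMask<double>

    return 0;
}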
......@@ -227,6 +227,8 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
{
CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));
CV_Assert(src.type() == CV_8UC4);
const int nrows = src.rows;
const int ncols = src.cols;
......
......@@ -40,6 +40,9 @@
//
//M*/
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
namespace cv
{
namespace gpu
......@@ -48,7 +51,7 @@ namespace cv
{
struct BrdReflect101
{
BrdReflect101(int len): last(len - 1) {}
explicit BrdReflect101(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
......@@ -62,7 +65,7 @@ namespace cv
__device__ int idx(int i) const
{
return abs(idx_high(i));
return idx_low(idx_high(i));
}
bool is_range_safe(int mini, int maxi) const
......@@ -70,49 +73,55 @@ namespace cv
return -last <= mini && maxi <= 2 * last;
}
private:
int last;
};
template <typename T>
template <typename D>
struct BrdRowReflect101: BrdReflect101
{
BrdRowReflect101(int len): BrdReflect101(len) {}
explicit BrdRowReflect101(int len): BrdReflect101(len) {}
__device__ float at_low(int i, const T* data) const
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return data[idx_low(i)];
return saturate_cast<D>(data[idx_low(i)]);
}
__device__ float at_high(int i, const T* data) const
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return data[idx_high(i)];
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename T>
template <typename D>
struct BrdColReflect101: BrdReflect101
{
BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
__device__ float at_low(int i, const T* data) const
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return data[idx_low(i) * step];
return saturate_cast<D>(data[idx_low(i) * step]);
}
__device__ float at_high(int i, const T* data) const
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return data[idx_high(i) * step];
return saturate_cast<D>(data[idx_high(i) * step]);
}
private:
int step;
};
struct BrdReplicate
{
BrdReplicate(int len): last(len - 1) {}
explicit BrdReplicate(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
......@@ -126,7 +135,7 @@ namespace cv
__device__ int idx(int i) const
{
return max(min(i, last), 0);
return idx_low(idx_high(i));
}
bool is_range_safe(int mini, int maxi) const
......@@ -134,42 +143,104 @@ namespace cv
return true;
}
private:
int last;
};
template <typename T>
template <typename D>
struct BrdRowReplicate: BrdReplicate
{
BrdRowReplicate(int len): BrdReplicate(len) {}
explicit BrdRowReplicate(int len): BrdReplicate(len) {}
__device__ float at_low(int i, const T* data) const
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return data[idx_low(i)];
return saturate_cast<D>(data[idx_low(i)]);
}
__device__ float at_high(int i, const T* data) const
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return data[idx_high(i)];
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename T>
template <typename D>
struct BrdColReplicate: BrdReplicate
{
BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
__device__ float at_low(int i, const T* data) const
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return data[idx_low(i) * step];
return saturate_cast<D>(data[idx_low(i) * step]);
}
__device__ float at_high(int i, const T* data) const
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i) * step]);
}
private:
int step;
};
template <typename D>
struct BrdRowConstant
{
explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return data[idx_high(i) * step];
return i >= 0 ? saturate_cast<D>(data[i]) : val;
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i]) : val;
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
private:
int len;
D val;
};
template <typename D>
struct BrdColConstant
{
BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i * step]) : val;
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
private:
int len;
int step;
D val;
};
}
}
......
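The border helpers moved into border_interpolate.hpp are instantiated on the result type D and queried per tap. An illustrative device-side sketch (a one-row box filter, not the library's actual filter kernel) using BrdRowReflect101:

#include "opencv2/gpu/device/border_interpolate.hpp"

using namespace cv::gpu::device;

// Averages KSIZE horizontal taps around each pixel of a single row,
// reflecting indices that fall outside [0, cols-1] (BORDER_REFLECT_101).
template <int KSIZE>
__global__ void rowBoxFilter(const unsigned char* src, float* dst, int cols, int anchor)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x >= cols)
        return;

    BrdRowReflect101<float> brd(cols);

    float sum = 0.f;
    for (int k = 0; k < KSIZE; ++k)
    {
        const int xk = x + k - anchor;
        // taps left of the row go through at_low, the rest through at_high
        sum += (xk < 0) ? brd.at_low(xk, src) : brd.at_high(xk, src);
    }
    dst[x] = sum / KSIZE;
}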
......@@ -329,6 +329,7 @@ namespace cv
grid.y = divUp(src.rows, threads.y);
device::transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -345,6 +346,7 @@ namespace cv
grid.y = divUp(src1.rows, threads.y);
device::transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -365,6 +367,7 @@ namespace cv
grid.y = divUp(src.rows, threads.y);
device::transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -383,6 +386,7 @@ namespace cv
grid.y = divUp(src1.rows, threads.y);
device::transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......
......@@ -65,6 +65,7 @@ namespace cv { namespace gpu { namespace surf
dim3 calcBlockSize(int nIntervals);
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld);
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
int x_size, int y_size, bool use_mask, const dim3& threads);
......@@ -75,6 +76,7 @@ namespace cv { namespace gpu { namespace surf
void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
}}}
using namespace cv::gpu::surf;
......@@ -170,6 +172,10 @@ namespace
void detectKeypoints(GpuMat& keypoints)
{
typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
const fasthessian_t fasthessian =
DeviceInfo().supports(COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;
dim3 threads = calcBlockSize(nIntervals);
for(int octave = 0; octave < nOctaves; ++octave)
{
......@@ -192,7 +198,7 @@ namespace
uploadConstant("cv::gpu::surf::c_border", border);
uploadConstant("cv::gpu::surf::c_step", step);
fasthessian_gpu(hessianBuffer, x_size, y_size, threads);
fasthessian(hessianBuffer, x_size, y_size, threads);
// Reset the candidate count.
maxCounter = 0;
......@@ -201,10 +207,13 @@ namespace
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
if (maxCounter > 0)
{
fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
}
}
if (featureCounter > 0)
......@@ -221,10 +230,16 @@ namespace
void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
{
typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors,
const KeyPoint_GPU* features, int nFeatures);
const compute_descriptors_t compute_descriptors =
DeviceInfo().supports(COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
if (keypoints.cols > 0)
{
descriptors.create(keypoints.cols, descriptorSize, CV_32F);
compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
}
}
......
......@@ -384,6 +384,14 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )
{
bool atomics_ok = TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS);
if (!atomics_ok)
{
ts->printf(CvTS::CONSOLE, "\nCode and device atomics support is required for radiusMatch (CC >= 1.1)");
ts->set_failed_test_info(CvTS::FAIL_GENERIC);
return;
}
dmatcher.clear();
// test const version of match()
{
......@@ -501,15 +509,24 @@ void CV_GpuBruteForceMatcherTest::dataTest(int dim)
void CV_GpuBruteForceMatcherTest::run(int)
{
emptyDataTest();
dataTest(50);
dataTest(64);
dataTest(100);
dataTest(128);
dataTest(200);
dataTest(256);
dataTest(300);
try
{
emptyDataTest();
dataTest(50);
dataTest(64);
dataTest(100);
dataTest(128);
dataTest(200);
dataTest(256);
dataTest(300);
}
catch(cv::Exception& e)
{
if (!check_and_treat_gpu_exception(e, ts))
throw;
return;
}
}
CV_GpuBruteForceMatcherTest CV_GpuBruteForceMatcher_test;
......@@ -154,7 +154,7 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
return;
}
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)
{
ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
......@@ -221,10 +221,19 @@ void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector)
void CV_GPU_SURFTest::run( int /*start_from*/ )
{
SURF_GPU fdetector;
try
{
SURF_GPU fdetector;
emptyDataTest(fdetector);
regressionTest(fdetector);
emptyDataTest(fdetector);
regressionTest(fdetector);
}
catch(cv::Exception& e)
{
if (!check_and_treat_gpu_exception(e, ts))
throw;
return;
}
}
CV_GPU_SURFTest CV_GPU_SURF_test;
......@@ -43,15 +43,15 @@
CvTS test_system("gpu");
const char* blacklist[] =
{
"GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
0
};
//const char* blacklist[] =
//{
// "GPU-NVidia",
// 0
//};
int main( int argc, char** argv )
{
return test_system.run( argc, argv, blacklist );
return test_system.run( argc, argv );
}
/* End of file. */
......@@ -43,6 +43,9 @@
#include <iostream>
#include <string>
using namespace cv;
using namespace cv::gpu;
struct CV_GpuMeanShiftTest : public CvTest
{
......@@ -50,6 +53,14 @@ struct CV_GpuMeanShiftTest : public CvTest
void run(int)
{
bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
ts->set_failed_test_info(CvTS::FAIL_GENERIC);
return;
}
int spatialRad = 30;
int colorRad = 30;
......@@ -134,6 +145,14 @@ struct CV_GpuMeanShiftProcTest : public CvTest
void run(int)
{
bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
ts->set_failed_test_info(CvTS::FAIL_GENERIC);
return;
}
int spatialRad = 30;
int colorRad = 30;
......
......@@ -54,6 +54,14 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
{
try
{
bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
ts->set_failed_test_info(CvTS::FAIL_GENERIC);
return;
}
Mat img_rgb = imread(string(ts->get_data_path()) + "meanshift/cones.png");
if (img_rgb.empty())
{
......
......@@ -91,14 +91,14 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */)
Mat cpumatdst;
GpuMat gpumatdst;
cpumatsrc.convertTo(cpumatdst, dst_type);
gpumatsrc.convertTo(gpumatdst, dst_type);
cpumatsrc.convertTo(cpumatdst, dst_type, 0.5, 3.0);
gpumatsrc.convertTo(gpumatdst, dst_type, 0.5, 3.0);
double r = norm(cpumatdst, gpumatdst, NORM_INF);
if (r > 1)
{
ts->printf(CvTS::LOG,
"\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %d\n",
"\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %f\n",
types_str[i], c, types_str[j], r);
passed = false;
}
......
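The updated test exercises the scale/shift form of convertTo; GpuMat::convertTo follows the cv::Mat semantics dst = saturate_cast<dtype>(src * alpha + beta). A minimal usage sketch with the same parameters the test now uses:

#include <opencv2/gpu/gpu.hpp>

void scaleAndShift(const cv::gpu::GpuMat& src8u, cv::gpu::GpuMat& dst32f)
{
    // dst = src * 0.5 + 3.0, converted to CV_32F
    src8u.convertTo(dst32f, CV_32F, 0.5, 3.0);
}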