optimizations:

- new reduce implementation (with kepler optimizations) - saturate_cast via asm command - video SIMD instructions in element operations - float arithmetics instead of double - new deviceSupports function

optimizations:
- new reduce implementation (with kepler optimizations) - saturate_cast via asm command - video SIMD instructions in element operations - float arithmetics instead of double - new deviceSupports function
281d036f · Vladislav Vinogradov · ae6266e1 · 281d036f · 281d036f · 281d036f
Commit 281d036f authored Jan 23, 2013 by Vladislav Vinogradov
45 changed files
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -79,6 +79,8 @@ namespace cv { namespace gpu
        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
    };
+    CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
    // Gives information about what GPU archs this OpenCV GPU module was
    // compiled for
    class CV_EXPORTS TargetArchs

--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@@ -44,6 +44,7 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/type_traits.hpp"
 namespace cv { namespace gpu { namespace device
 {
@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
    void writeScalar(const int*);
    void writeScalar(const float*);
    void writeScalar(const double*);
+    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
    void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
 }}}
@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
    //////////////////////////////// ConvertTo ////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////
-    template <typename T, typename D> struct Convertor : unary_function<T, D>
+    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
    {
-        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
+        Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
-        __device__ __forceinline__ D operator()(const T& src) const
+        __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
        {
            return saturate_cast<D>(alpha * src + beta);
        }
-        double alpha, beta;
+        S alpha, beta;
    };
    namespace detail
@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
        };
    }
-    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+    template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
    {
    };
-    template<typename T, typename D>
+    template<typename T, typename D, typename S>
    void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
    {
        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-        Convertor<T, D> op(alpha, beta);
+        Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
        cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
    }
@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
    {
        typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
-        static const caller_t tab[8][8] =
+        static const caller_t tab[7][7] =
        {
-            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
+            {
-            cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
+                cvt_<uchar, uchar, float>,
+                cvt_<uchar, schar, float>,
-            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
+                cvt_<uchar, ushort, float>,
-            cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
+                cvt_<uchar, short, float>,
+                cvt_<uchar, int, float>,
-            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
+                cvt_<uchar, float, float>,
-            cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
+                cvt_<uchar, double, double>
+            },
-            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
+            {
-            cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
+                cvt_<schar, uchar, float>,
+                cvt_<schar, schar, float>,
-            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
+                cvt_<schar, ushort, float>,
-            cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
+                cvt_<schar, short, float>,
+                cvt_<schar, int, float>,
-            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
+                cvt_<schar, float, float>,
-            cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
+                cvt_<schar, double, double>
+            },
-            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
+            {
-            cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
+                cvt_<ushort, uchar, float>,
+                cvt_<ushort, schar, float>,
-            {0,0,0,0,0,0,0,0}
+                cvt_<ushort, ushort, float>,
+                cvt_<ushort, short, float>,
+                cvt_<ushort, int, float>,
+                cvt_<ushort, float, float>,
+                cvt_<ushort, double, double>
+            },
+            {
+                cvt_<short, uchar, float>,
+                cvt_<short, schar, float>,
+                cvt_<short, ushort, float>,
+                cvt_<short, short, float>,
+                cvt_<short, int, float>,
+                cvt_<short, float, float>,
+                cvt_<short, double, double>
+            },
+            {
+                cvt_<int, uchar, float>,
+                cvt_<int, schar, float>,
+                cvt_<int, ushort, float>,
+                cvt_<int, short, float>,
+                cvt_<int, int, double>,
+                cvt_<int, float, double>,
+                cvt_<int, double, double>
+            },
+            {
+                cvt_<float, uchar, float>,
+                cvt_<float, schar, float>,
+                cvt_<float, ushort, float>,
+                cvt_<float, short, float>,
+                cvt_<float, int, float>,
+                cvt_<float, float, float>,
+                cvt_<float, double, double>
+            },
+            {
+                cvt_<double, uchar, double>,
+                cvt_<double, schar, double>,
+                cvt_<double, ushort, double>,
+                cvt_<double, short, double>,
+                cvt_<double, int, double>,
+                cvt_<double, float, double>,
+                cvt_<double, double, double>
+            }
        };
        caller_t func = tab[sdepth][ddepth];
-        if (!func)
-            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
        func(src, dst, alpha, beta, stream);
    }

--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -45,8 +45,7 @@
 #include <iostream>
 #ifdef HAVE_CUDA
-    #include <cuda.h>
+    #include <cuda_runtime.h>
-    #include <cuda_runtime_api.h>
    #include <npp.h>
    #define CUDART_MINIMUM_REQUIRED_VERSION 4010
@@ -69,33 +68,89 @@ using namespace cv::gpu;
 namespace
 {
-    // Compares value to set using the given comparator. Returns true if
+    class CudaArch
-    // there is at least one element x in the set satisfying to: x cmp value
+    {
-    // predicate.
+    public:
-    template <typename Comparer>
+        CudaArch();
-    bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
+        bool builtWith(FeatureSet feature_set) const;
+        bool hasPtx(int major, int minor) const;
+        bool hasBin(int major, int minor) const;
+        bool hasEqualOrLessPtx(int major, int minor) const;
+        bool hasEqualOrGreaterPtx(int major, int minor) const;
+        bool hasEqualOrGreaterBin(int major, int minor) const;
+    private:
+        static void fromStr(const string& set_as_str, vector<int>& arr);
+        vector<int> bin;
+        vector<int> ptx;
+        vector<int> features;
+    };
+    const CudaArch cudaArch;
+    CudaArch::CudaArch()
+    {
+    #ifdef HAVE_CUDA
+        fromStr(CUDA_ARCH_BIN, bin);
+        fromStr(CUDA_ARCH_PTX, ptx);
+        fromStr(CUDA_ARCH_FEATURES, features);
+    #endif
+    }
+    bool CudaArch::builtWith(FeatureSet feature_set) const
+    {
+        return !features.empty() && (features.back() >= feature_set);
+    }
+    bool CudaArch::hasPtx(int major, int minor) const
+    {
+        return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
+    }
+    bool CudaArch::hasBin(int major, int minor) const
+    {
+        return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
+    }
+    bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.front() <= major * 10 + minor);
+    }
+    bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.back() >= major * 10 + minor);
+    }
+    bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
+    {
+        return !bin.empty() && (bin.back() >= major * 10 + minor);
+    }
+    void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
    {
        if (set_as_str.find_first_not_of(" ") == string::npos)
-            return false;
+            return;
-        std::stringstream stream(set_as_str);
+        istringstream stream(set_as_str);
        int cur_value;
        while (!stream.eof())
        {
            stream >> cur_value;
-            if (cmp(cur_value, value))
+            arr.push_back(cur_value);
-                return true;
        }
-        return false;
+        sort(arr.begin(), arr.end());
    }
 }
 bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>());
+    return cudaArch.builtWith(feature_set);
 #else
    (void)feature_set;
    return false;
@@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
 bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>());
+    return cudaArch.hasPtx(major, minor);
 #else
    (void)major;
    (void)minor;
@@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
 bool cv::gpu::TargetArchs::hasBin(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>());
+    return cudaArch.hasBin(major, minor);
 #else
    (void)major;
    (void)minor;
@@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
 bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor,
+    return cudaArch.hasEqualOrLessPtx(major, minor);
-                     std::less_equal<int>());
 #else
    (void)major;
    (void)minor;
@@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
 bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
 {
-    return hasEqualOrGreaterPtx(major, minor) ||
+    return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
-           hasEqualOrGreaterBin(major, minor);
 }
 bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>());
+    return cudaArch.hasEqualOrGreaterPtx(major, minor);
 #else
    (void)major;
    (void)minor;
@@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor,
+    return cudaArch.hasEqualOrGreaterBin(major, minor);
-                     std::greater_equal<int>());
 #else
    (void)major;
    (void)minor;
@@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
 #endif
 }
+bool cv::gpu::deviceSupports(FeatureSet feature_set)
+{
+    static int versions[] =
+    {
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+    };
+    static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
+    const int devId = getDevice();
+    int version;
+    if (devId < cache_size && versions[devId] >= 0)
+        version = versions[devId];
+    else
+    {
+        DeviceInfo dev(devId);
+        version = dev.majorVersion() * 10 + dev.minorVersion();
+        if (devId < cache_size)
+            versions[devId] = version;
+    }
+    return TargetArchs::builtWith(feature_set) && (version >= feature_set);
+}
 #if !defined (HAVE_CUDA)
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")

--- a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp
--- a/modules/gpu/include/opencv2/gpu/device/reduce.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/reduce.hpp
--- a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp
@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
    {
-        return (uchar) ::max((int)v, 0);
+        uint res = 0;
+        int vi = v;
+        asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
    {
-        return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
    {
-        return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
+        uint res = 0;
+        asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
    {
-        return (uchar) ::min(v, (uint)UCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
    }
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
-    {
-        return saturate_cast<uchar>((uint)v);
-    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
    {
-        int iv = __float2int_rn(v);
+        uint res = 0;
-        return saturate_cast<uchar>(iv);
+        asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
    {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+    #if __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
+        uint res = 0;
-        return saturate_cast<uchar>(iv);
+        asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
    #else
        return saturate_cast<uchar>((float)v);
    #endif
@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
    {
-        return (schar) ::min((int)v, SCHAR_MAX);
+        uint res = 0;
+        uint vi = v;
+        asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
    }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
    {
-        return (schar) ::min((uint)v, (uint)SCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
    }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
    {
-        return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
+        uint res = 0;
+        asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
    }
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
    {
-        return saturate_cast<schar>((int)v);
+        uint res = 0;
+        asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
    {
-        return (schar) ::min(v, (uint)SCHAR_MAX);
+        uint res = 0;
+        asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
    {
-        int iv = __float2int_rn(v);
+        uint res = 0;
-        return saturate_cast<schar>(iv);
+        asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
    {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+    #if __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
+        uint res = 0;
-        return saturate_cast<schar>(iv);
+        asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
    #else
        return saturate_cast<schar>((float)v);
    #endif
@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
    {
-        return (ushort) ::max((int)v, 0);
+        ushort res = 0;
+        int vi = v;
+        asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
    {
-        return (ushort) ::max((int)v, 0);
+        ushort res = 0;
+        asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
    {
-        return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
+        ushort res = 0;
+        asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
    {
-        return (ushort) ::min(v, (uint)USHRT_MAX);
+        ushort res = 0;
+        asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
    {
-        int iv = __float2int_rn(v);
+        ushort res = 0;
-        return saturate_cast<ushort>(iv);
+        asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
    {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+    #if __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
+        ushort res = 0;
-        return saturate_cast<ushort>(iv);
+        asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
    #else
        return saturate_cast<ushort>((float)v);
    #endif
@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
    {
-        return (short) ::min((int)v, SHRT_MAX);
+        short res = 0;
+        asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(int v)
    {
-        return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
+        short res = 0;
+        asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
    {
-        return (short) ::min(v, (uint)SHRT_MAX);
+        short res = 0;
+        asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(float v)
    {
-        int iv = __float2int_rn(v);
+        short res = 0;
-        return saturate_cast<short>(iv);
+        asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(double v)
    {
-    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+    #if __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v);
+        short res = 0;
-        return saturate_cast<short>(iv);
+        asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
    #else
        return saturate_cast<short>((float)v);
    #endif
    }
+    template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
+    {
+        int res = 0;
+        asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
    template<> __device__ __forceinline__ int saturate_cast<int>(float v)
    {
        return __float2int_rn(v);
@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
    #endif
    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
+    {
+        uint res = 0;
+        int vi = v;
+        asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
    {
        return __float2uint_rn(v);

--- a/modules/gpu/include/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp
@@ -45,7 +45,6 @@
 #include "saturate_cast.hpp"
 #include "datamov_utils.hpp"
-#include "detail/reduction_detail.hpp"
 namespace cv { namespace gpu { namespace device
 {
@@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
        }
    };
-    ///////////////////////////////////////////////////////////////////////////////
-    // Reduction
-    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
-    {
-        StaticAssert<n >= 8 && n <= 512>::check();
-        utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
-    }
-    template <int n, typename T, typename V, typename Pred>
-    __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
-    {
-        StaticAssert<n >= 8 && n <= 512>::check();
-        utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
-    }
-    template <int n, typename T, typename V1, typename V2, typename Pred>
-    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
-    {
-        StaticAssert<n >= 8 && n <= 512>::check();
-        utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
-    }
    ///////////////////////////////////////////////////////////////////////////////
    // Solve linear system

--- a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp
@@ -43,7 +43,7 @@
 #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
 #define __OPENCV_GPU_VEC_DISTANCE_HPP__
-#include "utility.hpp"
+#include "reduce.hpp"
 #include "functional.hpp"
 #include "detail/vec_distance_detail.hpp"
@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
        }
        __device__ __forceinline__ operator int() const
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
        }
        __device__ __forceinline__ operator float() const
@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
        }
        __device__ __forceinline__ operator float() const
@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
        }
        __device__ __forceinline__ operator int() const

--- a/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
+#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
+namespace cv { namespace gpu { namespace device
+{
+    template <typename T>
+    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        return __shfl(val, srcLane, width);
+    #else
+        return T();
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl((int) val, srcLane, width);
+    #else
+        return 0;
+    #endif
+    }
+    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val);
+        int hi = __double2hiint(val);
+        lo = __shfl(lo, srcLane, width);
+        hi = __shfl(hi, srcLane, width);
+        return __hiloint2double(hi, lo);
+    #else
+        return 0.0;
+    #endif
+    }
+    template <typename T>
+    __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        return __shfl_down(val, delta, width);
+    #else
+        return T();
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl_down((int) val, delta, width);
+    #else
+        return 0;
+    #endif
+    }
+    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val);
+        int hi = __double2hiint(val);
+        lo = __shfl_down(lo, delta, width);
+        hi = __shfl_down(hi, delta, width);
+        return __hiloint2double(hi, lo);
+    #else
+        return 0.0;
+    #endif
+    }
+    template <typename T>
+    __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        return __shfl_up(val, delta, width);
+    #else
+        return T();
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl_up((int) val, delta, width);
+    #else
+        return 0;
+    #endif
+    }
+    __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val);
+        int hi = __double2hiint(val);
+        lo = __shfl_up(lo, delta, width);
+        hi = __shfl_up(hi, delta, width);
+        return __hiloint2double(hi, lo);
+    #else
+        return 0.0;
+    #endif
+    }
+}}}
+#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
 void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
 {
 #ifndef HAVE_CUBLAS
-    (void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream;
+    (void)src1;
+    (void)src2;
+    (void)alpha;
+    (void)src3;
+    (void)beta;
+    (void)dst;
+    (void)flags;
+    (void)stream;
    CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
 #else
    // CUBLAS works with column-major matrices
    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
@@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
    if (src1.depth() == CV_64F)
    {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }
@@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
    }
    cublasSafeCall( cublasDestroy_v2(handle) );
 #endif
 }
@@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
    }
    else // if (src.elemSize() == 8)
    {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
        NppStStreamHandler h(stream);

--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -42,9 +42,10 @@
 #if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
 namespace cv { namespace gpu { namespace device
 {
@@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
            }
+            __device__ __forceinline__ TransformOp() {}
+            __device__ __forceinline__ TransformOp(const TransformOp&) {}
        };
        void call(const PtrStepSz<float3> src, const float* rot,
@@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
                        (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
                        (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
            }
+            __device__ __forceinline__ ProjectOp() {}
+            __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
        };
        void call(const PtrStepSz<float3> src, const float* rot,
@@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
            return x * x;
        }
+        template <int BLOCK_SIZE>
        __global__ void computeHypothesisScoresKernel(
                const int num_points, const float3* object, const float2* image,
                const float dist_threshold, int* g_num_inliers)
@@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
                    ++num_inliers;
            }
-            extern __shared__ float s_num_inliers[];
+            __shared__ int s_num_inliers[BLOCK_SIZE];
-            s_num_inliers[threadIdx.x] = num_inliers;
+            reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());
-            __syncthreads();
-            for (int step = blockDim.x / 2; step > 0; step >>= 1)
-            {
-                if (threadIdx.x < step)
-                    s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
-                __syncthreads();
-            }
            if (threadIdx.x == 0)
-                g_num_inliers[blockIdx.x] = s_num_inliers[0];
+                g_num_inliers[blockIdx.x] = num_inliers;
        }
        void computeHypothesisScores(
@@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device
            dim3 threads(256);
            dim3 grid(num_hypotheses);
-            int smem_size = threads.x * sizeof(float);
-            computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
+            computeHypothesisScoresKernel<256><<<grid, threads>>>(
                    num_points, object, image, dist_threshold, hypothesis_scores);
            cudaSafeCall( cudaGetLastError() );

--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/fgd_bgfg.cu
+++ b/modules/gpu/src/cuda/fgd_bgfg.cu
@@ -46,6 +46,8 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/functional.hpp"
 #include "fgd_bgfg_common.hpp"
 using namespace cv::gpu;
@@ -181,57 +183,8 @@ namespace bgfg
        __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
        __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
-        data0[threadIdx.x] = sum0;
+        plus<unsigned int> op;
-        data1[threadIdx.x] = sum1;
+        reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
-        data2[threadIdx.x] = sum2;
-        __syncthreads();
-        if (threadIdx.x < 128)
-        {
-            data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
-            data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
-            data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
-        }
-        __syncthreads();
-        if (threadIdx.x < 64)
-        {
-            data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
-            data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
-            data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
-        }
-        __syncthreads();
-        if (threadIdx.x < 32)
-        {
-            volatile unsigned int* vdata0 = data0;
-            volatile unsigned int* vdata1 = data1;
-            volatile unsigned int* vdata2 = data2;
-            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
-            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
-            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];
-            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
-            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
-            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];
-            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
-            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
-            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];
-            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
-            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
-            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];
-            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
-            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
-            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];
-            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
-            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
-            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
-        }
        if(threadIdx.x == 0)
        {
@@ -245,9 +198,9 @@ namespace bgfg
    void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
-                               int cc, cudaStream_t stream)
+                               bool cc20, cudaStream_t stream)
    {
-        const int HISTOGRAM_WARP_COUNT = cc < 20 ? 4 : 6;
+        const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
        const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
        calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
@@ -261,10 +214,10 @@ namespace bgfg
            cudaSafeCall( cudaDeviceSynchronize() );
    }
-    template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
    /////////////////////////////////////////////////////////////////////////
    // calcDiffThreshMask

--- a/modules/gpu/src/cuda/fgd_bgfg_common.hpp
+++ b/modules/gpu/src/cuda/fgd_bgfg_common.hpp
@@ -125,7 +125,7 @@ namespace bgfg
    void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
-                               int cc, cudaStream_t stream);
+                               bool cc20, cudaStream_t stream);
    template <typename PT, typename CT>
    void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);

--- a/modules/gpu/src/cuda/global_motion.cu
+++ b/modules/gpu/src/cuda/global_motion.cu
@@ -43,12 +43,10 @@
 #if !defined CUDA_DISABLER
-#include "thrust/device_ptr.h"
+#include <thrust/device_ptr.h>
-#include "thrust/remove.h"
+#include <thrust/remove.h>
-#include "thrust/functional.h"
+#include <thrust/functional.h>
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
-using namespace thrust;
 namespace cv { namespace gpu { namespace device { namespace globmotion {
@@ -61,10 +59,10 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
    thrust::device_ptr<float2> dpoints1((float2*)points1);
    thrust::device_ptr<const uchar> dmask(mask);
-    return thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
+    return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
                             thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
                             dmask, thrust::not1(thrust::identity<uchar>()))
-           - make_zip_iterator(make_tuple(dpoints0, dpoints1));
+           - thrust::make_zip_iterator(make_tuple(dpoints0, dpoints1)));
 }

--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -43,182 +43,112 @@
 #if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
-#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/functional.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/emulation.hpp"
+#include "opencv2/gpu/device/transform.hpp"
-namespace cv { namespace gpu { namespace device
+using namespace cv::gpu;
-{
+using namespace cv::gpu::device;
-    #define UINT_BITS 32U
-    //Warps == subhistograms per threadblock
-    #define WARP_COUNT 6
-    //Threadblock size
-    #define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
-    #define HISTOGRAM256_BIN_COUNT 256
-    //Shared memory per threadblock
-    #define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
-    #define PARTIAL_HISTOGRAM256_COUNT 240
-    #define MERGE_THREADBLOCK_SIZE 256
-    #define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
+namespace hist
+{
-    namespace hist
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
    {
-        #if (!USE_SMEM_ATOMICS)
+        __shared__ int shist[256];
-            #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
-            __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
+        const int y = blockIdx.x * blockDim.y + threadIdx.y;
-            {
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-                uint count;
-                do
-                {
-                    count = s_WarpHist[data] & TAG_MASK;
-                    count = threadTag | (count + 1);
-                    s_WarpHist[data] = count;
-                } while (s_WarpHist[data] != count);
-            }
-        #else
+        shist[tid] = 0;
+        __syncthreads();
-            #define TAG_MASK 0xFFFFFFFFU
-            __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
+        if (y < rows)
        {
-                atomicAdd(s_WarpHist + data, 1);
+            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-            }
-        #endif
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-        __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
            {
-            uint x = pos_x << 2;
+                unsigned int data = rowPtr[x];
-            if (x + 0 < cols) addByte(s_WarpHist, (data >>  0) & 0xFFU, tag);
+                Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
-            if (x + 1 < cols) addByte(s_WarpHist, (data >>  8) & 0xFFU, tag);
+                Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
-            if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
+                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
-            if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
+                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
            }
-        __global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
+            if (cols % 4 != 0 && threadIdx.x == 0)
            {
-            //Per-warp subhistogram storage
+                for (int x = cols_4 * 4; x < cols; ++x)
-            __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
-            uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
-            //Clear shared memory storage for current threadblock before processing
-            #pragma unroll
-            for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
-               s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
-            //Cycle through the entire data set, update subhistograms for each warp
-            const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
-            __syncthreads();
-            const uint colsui = d_Data.step / sizeof(uint);
-            for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
                {
-                uint pos_y = pos / colsui;
+                    unsigned int data = ((const uchar*)rowPtr)[x];
-                uint pos_x = pos % colsui;
+                    Emulation::smem::atomicAdd(&shist[data], 1);
-                uint data = d_Data.ptr(pos_y)[pos_x];
-                addWord(s_WarpHist, data, tag, pos_x, cols);
                }
-            //Merge per-warp histograms into per-block and write to global memory
-            __syncthreads();
-            for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
-            {
-                uint sum = 0;
-                for (uint i = 0; i < WARP_COUNT; i++)
-                    sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
-                d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
            }
        }
-        ////////////////////////////////////////////////////////////////////////////////
-        // Merge histogram256() output
-        // Run one threadblock per bin; each threadblock adds up the same bin counter
-        // from every partial histogram. Reads are uncoalesced, but mergeHistogram256
-        // takes only a fraction of total processing time
-        ////////////////////////////////////////////////////////////////////////////////
-        __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
-        {
-            uint sum = 0;
-            #pragma unroll
-            for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
-                sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
-            __shared__ uint data[MERGE_THREADBLOCK_SIZE];
-            data[threadIdx.x] = sum;
-            for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
-            {
        __syncthreads();
-                if(threadIdx.x < stride)
-                    data[threadIdx.x] += data[threadIdx.x + stride];
-            }
-            if(threadIdx.x == 0)
+        const int histVal = shist[tid];
-                d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
+        if (histVal > 0)
+            ::atomicAdd(hist + tid, histVal);
    }
-        void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
    {
-            histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
+        const dim3 block(32, 8);
-                PtrStepSz<uint>(src),
+        const dim3 grid(divUp(src.rows, block.y));
-                buf,
-                static_cast<uint>(src.rows * src.step / sizeof(uint)),
-                src.cols);
-            cudaSafeCall( cudaGetLastError() );
-            mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
+        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
        cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
+}
+/////////////////////////////////////////////////////////////////////////
+namespace hist
+{
    __constant__ int c_lut[256];
-        __global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
+    struct EqualizeHist : unary_function<uchar, uchar>
    {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        float scale;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        __host__ EqualizeHist(float _scale) : scale(_scale) {}
-            if (x < src.cols && y < src.rows)
+        __device__ __forceinline__ uchar operator ()(uchar val) const
        {
-                const uchar val = src.ptr(y)[x];
            const int lut = c_lut[val];
-                dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
+            return __float2int_rn(scale * lut);
-            }
        }
+    };
+}
-        void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
    {
-            dim3 block(16, 16);
+        enum { smart_shift = 4 };
-            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+    };
+}}}
+namespace hist
+{
+    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
+    {
+        if (stream == 0)
            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
+        else
+            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
-            equalizeHist<<<grid, block, 0, stream>>>(src, dst);
+        const float scale = 255.0f / (src.cols * src.rows);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == 0)
+        transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
-                cudaSafeCall( cudaDeviceSynchronize() );
    }
-    } // namespace hist
+}
-}}} // namespace cv { namespace gpu { namespace device
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
--- a/modules/gpu/src/cuda/nlm.cu
+++ b/modules/gpu/src/cuda/nlm.cu
--- a/modules/gpu/src/cuda/orb.cu
+++ b/modules/gpu/src/cuda/orb.cu
@@ -47,10 +47,11 @@
 #if !defined CUDA_DISABLER
+#include <thrust/device_ptr.h>
 #include <thrust/sort.h>
 #include "opencv2/gpu/device/common.hpp"
-#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
 #include "opencv2/gpu/device/functional.hpp"
 namespace cv { namespace gpu { namespace device
@@ -75,9 +76,9 @@ namespace cv { namespace gpu { namespace device
        __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
        {
-            __shared__ int smem[8 * 32];
+            __shared__ int smem0[8 * 32];
+            __shared__ int smem1[8 * 32];
-            volatile int* srow = smem + threadIdx.y * blockDim.x;
+            __shared__ int smem2[8 * 32];
            const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
@@ -109,9 +110,12 @@ namespace cv { namespace gpu { namespace device
                    c += Ix * Iy;
                }
-                reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
+                int* srow0 = smem0 + threadIdx.y * blockDim.x;
-                reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
+                int* srow1 = smem1 + threadIdx.y * blockDim.x;
-                reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
+                int* srow2 = smem2 + threadIdx.y * blockDim.x;
+                plus<int> op;
+                reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
                if (threadIdx.x == 0)
                {
@@ -151,9 +155,13 @@ namespace cv { namespace gpu { namespace device
        __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
        {
-            __shared__ int smem[8 * 32];
+            __shared__ int smem0[8 * 32];
+            __shared__ int smem1[8 * 32];
+            int* srow0 = smem0 + threadIdx.y * blockDim.x;
+            int* srow1 = smem1 + threadIdx.y * blockDim.x;
-            volatile int* srow = smem + threadIdx.y * blockDim.x;
+            plus<int> op;
            const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
@@ -167,7 +175,7 @@ namespace cv { namespace gpu { namespace device
                for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
                    m_10 += u * image(loc.y, loc.x + u);
-                reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
+                reduce<32>(srow0, m_10, threadIdx.x, op);
                for (int v = 1; v <= half_k; ++v)
                {
@@ -185,8 +193,7 @@ namespace cv { namespace gpu { namespace device
                        m_sum += u * (val_plus + val_minus);
                    }
-                    reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
+                    reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
-                    reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
                    m_10 += m_sum;
                    m_01 += v * v_sum;

--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
--- a/modules/gpu/src/cuda/warp.cu
+++ b/modules/gpu/src/cuda/warp.cu
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
--- a/modules/gpu/src/fgd_bgfg.cpp
+++ b/modules/gpu/src/fgd_bgfg.cpp
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -294,9 +294,8 @@ void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf&
    ensureSizeIsEnough(1, maxCircles, CV_32FC3, circles);
-    DeviceInfo devInfo;
    const int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, circles.ptr<float3>(), maxCircles,
-                                                    dp, minRadius, maxRadius, votesThreshold, devInfo.supports(FEATURE_SET_COMPUTE_20));
+                                                    dp, minRadius, maxRadius, votesThreshold, deviceSupports(FEATURE_SET_COMPUTE_20));
    if (circlesCount > 0)
        circles.cols = circlesCount;
@@ -531,7 +530,7 @@ namespace
        const func_t func = funcs[dx.depth()];
        CV_Assert(func != 0);
-        edgePointList.cols = edgePointList.step / sizeof(int);
+        edgePointList.cols = (int) (edgePointList.step / sizeof(int));
        ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList);
        edgePointList.cols = func(edges, dx, dy, edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1));

--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
--- a/modules/gpu/src/optical_flow_farneback.cpp
+++ b/modules/gpu/src/optical_flow_farneback.cpp
--- a/modules/gpu/src/pyrlk.cpp
+++ b/modules/gpu/src/pyrlk.cpp
--- a/modules/gpu/src/remap.cpp
+++ b/modules/gpu/src/remap.cpp
--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
--- a/modules/gpu/src/warp.cpp
+++ b/modules/gpu/src/warp.cpp