Commit 2777ebb8 authored by Marina Kolpakova

merged GPU scan

parent 6cca6a45
if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
message(STATUS "WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.")
return()
endif()
find_package(CUDA 4.1)
if(CUDA_FOUND)
......@@ -23,7 +23,7 @@ if(CUDA_FOUND)
else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
endif()
set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
......@@ -89,8 +89,8 @@ if(CUDA_FOUND)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
endif()
# we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
# we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
string(REPLACE "-ggdb3" "" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
CUDA_COMPILE(${VAR} ${ARGN})
set(CMAKE_CXX_DEBUG_FLAGS ${CMAKE_CXX_FLAGS_DEBUG_})
......
......@@ -90,6 +90,40 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
Interpolation(cv::INTER_CUBIC), Interpolation(cv::INTER_AREA)),
testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int interpolation = cv::INTER_AREA;
double f = GET_PARAM(3);
cv::Mat src_host(size, type);
fill(src_host, 0, 255);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
declare.time(1.0);
TEST_CYCLE()
{
cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
ALL_DEVICES,
testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
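For reference, a minimal host-side sketch of the INTER_AREA downscale path that the ResizeArea perf test above exercises; the input file name and the 10x scale factor are illustrative assumptions, not part of this commit.
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>

int main()
{
    cv::Mat host = cv::imread("input.png");          // hypothetical test image
    cv::gpu::GpuMat src(host), dst;
    cv::gpu::resize(src, dst, cv::Size(), 0.1, 0.1, cv::INTER_AREA); // 10x area downscale
    cv::Mat result(dst);                             // download the result for inspection
    return 0;
}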
//////////////////////////////////////////////////////////////////////
// WarpAffine
......
......@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
struct Mask8U
{
explicit Mask8U(PtrStepb mask): mask(mask) {}
explicit Mask8U(PtrStepb mask_): mask(mask_) {}
__device__ __forceinline__ bool operator()(int y, int x) const
{
......
......@@ -46,7 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
# include <cfloat>
#include <cfloat>
#include <opencv2/gpu/device/scan.hpp>
namespace cv { namespace gpu { namespace device
{
......
......@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC2_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
......@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC3_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
......@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC4_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
......@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC2_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
......@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC3_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
......@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC4_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
......
......@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
//===================================================================
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)
:
currentSize(0),
_maxSize(0),
......@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
begin(NULL),
end(NULL),
_memType(NCVMemoryTypeNone),
_alignment(alignment),
_alignment(alignment_),
bReusesMemory(false)
{
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
}
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)
:
currentSize(0),
_maxSize(0),
allocBegin(NULL),
_memType(memT),
_alignment(alignment)
_alignment(alignment_)
{
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
......@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
//===================================================================
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)
:
currentSize(0),
_maxSize(0),
_memType(memT),
_alignment(alignment)
_alignment(alignment_)
{
ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
}
......
......@@ -64,7 +64,7 @@
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv { namespace gpu
namespace cv { namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func);
......@@ -87,14 +87,14 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
#ifdef __CUDACC__
namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
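As a quick aside on the helper just above: divUp rounds integer division up so that a kernel launch grid covers every pixel. A small worked example of the usual launch arithmetic follows; the 1000x600 image size is only illustrative.
// divUp(total, grain) == ceil(total / grain) for positive ints, e.g. divUp(1000, 32) == 32.
dim3 block(32, 8);
dim3 grid(divUp(1000, block.x), divUp(600, block.y));   // grid = (32, 75) for a 1000x600 image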
namespace device
namespace device
{
typedef unsigned char uchar;
typedef unsigned short ushort;
......
......@@ -45,7 +45,7 @@
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
......@@ -54,13 +54,13 @@ namespace cv { namespace gpu { namespace device
{
__device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; }
};
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
// 64-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "l"
#else
#else
// 32-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "r"
#endif
......@@ -84,21 +84,21 @@ namespace cv { namespace gpu { namespace device
asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
} \
};
OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)
OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)
OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)
OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)
OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)
#undef OPENCV_GPU_DEFINE_FORCE_GLOB
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
#undef OPENCV_GPU_ASM_PTR
#endif // __CUDA_ARCH__ >= 200
}}} // namespace cv { namespace gpu { namespace device
......
......@@ -44,7 +44,7 @@
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
namespace cv { namespace gpu { namespace device
{
{
template<class T> struct DynamicSharedMem
{
__device__ __forceinline__ operator T*()
......
......@@ -45,21 +45,21 @@
#include "warp_reduce.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
struct Emulation
{
static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
{
static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
{
#if __CUDA_ARCH__ >= 200
(void)cta_buffer;
return __ballot(predicate);
(void)cta_buffer;
return __ballot(predicate);
#else
int tid = threadIdx.x;
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
return warp_reduce(cta_buffer);
int tid = threadIdx.x;
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
return warp_reduce(cta_buffer);
#endif
}
}
};
}}} // namespace cv { namespace gpu { namespace device
......
......@@ -46,14 +46,14 @@
#include <cstdio>
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template<class Func>
template<class Func>
void printFuncAttrib(Func& func)
{
cudaFuncAttributes attrs;
cudaFuncGetAttributes(&attrs, func);
cudaFuncGetAttributes(&attrs, func);
printf("=== Function stats ===\n");
printf("Name: \n");
......@@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
printf("ptxVersion = %d\n", attrs.ptxVersion);
printf("binaryVersion = %d\n", attrs.binaryVersion);
printf("\n");
fflush(stdout);
fflush(stdout);
}
}}} // namespace cv { namespace gpu { namespace device
......
......@@ -48,7 +48,7 @@
#include "vec_traits.hpp"
#include "type_traits.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
// Function Objects
......@@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device
template <typename T> struct bit_not : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
{
return ~v;
}
......@@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device
// Generalized Identity Operations
template <typename T> struct identity : unary_function<T, T>
{
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
{
return x;
}
......@@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace device
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
{
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
{
return lhs;
}
......@@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
{
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
{
return rhs;
}
......@@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device
template <typename T> struct maximum : binary_function<T, T, T>
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return lhs < rhs ? rhs : lhs;
}
......@@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device
template <typename T> struct minimum : binary_function<T, T, T>
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return lhs < rhs ? lhs : rhs;
}
......@@ -410,12 +410,14 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
{
return src1 * src1 + src2 * src2;
}
__device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
__device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
};
// Saturate Cast Functor
......@@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
{
return (src > thresh) * maxVal;
}
__device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
......@@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
{
return (src <= thresh) * maxVal;
}
__device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
......@@ -519,12 +523,16 @@ namespace cv { namespace gpu { namespace device
explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
{
return !pred(x);
{
return !pred(x);
}
__device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
__device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}
const Predicate pred;
};
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
{
return unary_negate<Predicate>(pred);
......@@ -534,19 +542,26 @@ namespace cv { namespace gpu { namespace device
{
explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
{
return !pred(x,y);
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
{
return !pred(x,y);
}
__device__ __forceinline__ binary_negate(const binary_negate& other)
: binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
__device__ __forceinline__ binary_negate() :
binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
const Predicate pred;
};
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
{
return binary_negate<BinaryPredicate>(pred);
}
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
{
__host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
......@@ -555,15 +570,19 @@ namespace cv { namespace gpu { namespace device
return op(arg1, a);
}
__device__ __forceinline__ binder1st(const binder1st& other) :
unary_function<typename Op::second_argument_type, typename Op::result_type>(){}
const Op op;
const typename Op::first_argument_type arg1;
};
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
{
return binder1st<Op>(op, typename Op::first_argument_type(x));
}
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
{
__host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
......@@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
return op(a, arg2);
}
__device__ __forceinline__ binder2nd(const binder2nd& other) :
unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}
const Op op;
const typename Op::second_argument_type arg2;
};
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
{
return binder2nd<Op>(op, typename Op::second_argument_type(x));
}
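A hedged usage sketch of the binders declared above: bind2nd fixes the second argument of a binary device functor so it can be used where a unary operation is expected. The kernel below is an illustrative assumption, not code from this commit; it only relies on the minimum<> functor defined earlier in this header.
// Clamp every element to at most 100 by binding the second argument of minimum<int>.
__global__ void clampKernel(const int* in, int* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = cv::gpu::device::bind2nd(cv::gpu::device::minimum<int>(), 100)(in[i]);
}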
// Functor Traits
template <typename F> struct IsUnaryFunction
{
typedef char Yes;
......@@ -618,7 +640,7 @@ namespace cv { namespace gpu { namespace device
{
enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
};
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
......
......@@ -46,7 +46,7 @@
#include <limits>
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template<class T> struct numeric_limits
{
......
......@@ -57,35 +57,35 @@ namespace cv { namespace gpu { namespace device
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{
return (uchar) ::max((int)v, 0);
{
return (uchar) ::max((int)v, 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
{
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
{
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{
return (uchar) ::min(v, (uint)UCHAR_MAX);
{
return (uchar) ::min(v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
return saturate_cast<uchar>((uint)v);
{
return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<uchar>(iv);
{
int iv = __float2int_rn(v);
return saturate_cast<uchar>(iv);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
int iv = __double2int_rn(v);
return saturate_cast<uchar>(iv);
#else
return saturate_cast<uchar>((float)v);
......@@ -93,35 +93,35 @@ namespace cv { namespace gpu { namespace device
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{
return (schar) ::min((int)v, SCHAR_MAX);
{
return (schar) ::min((int)v, SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
{
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{
return saturate_cast<schar>((int)v);
{
return saturate_cast<schar>((int)v);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{
return (schar) ::min(v, (uint)SCHAR_MAX);
{
return (schar) ::min(v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<schar>(iv);
{
int iv = __float2int_rn(v);
return saturate_cast<schar>(iv);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
int iv = __double2int_rn(v);
return saturate_cast<schar>(iv);
#else
return saturate_cast<schar>((float)v);
......@@ -129,30 +129,30 @@ namespace cv { namespace gpu { namespace device
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{
return (ushort) ::max((int)v, 0);
{
return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{
return (ushort) ::max((int)v, 0);
{
return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
{
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{
return (ushort) ::min(v, (uint)USHRT_MAX);
{
return (ushort) ::min(v, (uint)USHRT_MAX);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<ushort>(iv);
int iv = __float2int_rn(v);
return saturate_cast<ushort>(iv);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
int iv = __double2int_rn(v);
return saturate_cast<ushort>(iv);
#else
return saturate_cast<ushort>((float)v);
......@@ -160,37 +160,37 @@ namespace cv { namespace gpu { namespace device
}
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{
return (short) ::min((int)v, SHRT_MAX);
{
return (short) ::min((int)v, SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
}
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{
return (short) ::min(v, (uint)SHRT_MAX);
{
return (short) ::min(v, (uint)SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<short>(iv);
{
int iv = __float2int_rn(v);
return saturate_cast<short>(iv);
}
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
int iv = __double2int_rn(v);
return saturate_cast<short>(iv);
#else
return saturate_cast<short>((float)v);
#endif
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
return __float2int_rn(v);
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
return __float2int_rn(v);
}
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return __double2int_rn(v);
......@@ -200,11 +200,11 @@ namespace cv { namespace gpu { namespace device
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{
return __float2uint_rn(v);
{
return __float2uint_rn(v);
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
{
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return __double2uint_rn(v);
#else
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SCAN_HPP__
#define __OPENCV_GPU_SCAN_HPP__
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
template <ScanKind Kind, typename T, typename F> struct WarpScan
{
__device__ __forceinline__ WarpScan() {}
__device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }
__device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
{
const unsigned int lane = idx & 31;
F op;
if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
if( Kind == INCLUSIVE )
return ptr [idx];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
__device__ __forceinline__ unsigned int index(const unsigned int tid)
{
return tid;
}
__device__ __forceinline__ void init(volatile T *ptr){}
static const int warp_offset = 0;
typedef WarpScan<INCLUSIVE, T, F> merge;
};
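To make the shift pattern above concrete, a worked example of what WarpScan computes, assuming F is an addition functor such as the plus<> defined in functional.hpp (the numbers are illustrative, not part of the commit):
// Each of the 32 lanes writes 1 into shared memory:
//     ptr = { 1, 1, 1, ..., 1 }
// After WarpScan<INCLUSIVE, int, plus<int> >()(ptr, idx) the buffer holds running sums
//     ptr = { 1, 2, 3, ..., 32 }
// and each lane's return value is its inclusive sum; with EXCLUSIVE the return value
// for lane k is instead the sum of lanes 0..k-1 (0 for lane 0).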
template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
{
__device__ __forceinline__ WarpScanNoComp() {}
__device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }
__device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
{
const unsigned int lane = threadIdx.x & 31;
F op;
ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
if( Kind == INCLUSIVE )
return ptr [idx];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
__device__ __forceinline__ unsigned int index(const unsigned int tid)
{
return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
}
__device__ __forceinline__ void init(volatile T *ptr)
{
ptr[threadIdx.x] = 0;
}
static const int warp_smem_stride = 32 + 16 + 1;
static const int warp_offset = 16;
static const int warp_log = 5;
static const int warp_mask = 31;
typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
};
template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
{
__device__ __forceinline__ BlockScan() {}
__device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }
__device__ __forceinline__ T operator()(volatile T *ptr)
{
const unsigned int tid = threadIdx.x;
const unsigned int lane = tid & warp_mask;
const unsigned int warp = tid >> warp_log;
Sc scan;
typename Sc::merge merge_scan;
const unsigned int idx = scan.index(tid);
T val = scan(ptr, idx);
__syncthreads ();
if( warp == 0)
scan.init(ptr);
__syncthreads ();
if( lane == 31 )
ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
__syncthreads ();
if( warp == 0 )
merge_scan(ptr, idx);
__syncthreads();
if ( warp > 0)
val = ptr [scan.warp_offset + warp - 1] + val;
__syncthreads ();
ptr[idx] = val;
__syncthreads ();
return val ;
}
static const int warp_log = 5;
static const int warp_mask = 31;
};
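Finally, a hedged sketch of how BlockScan might be driven from a kernel. The 256-thread block, the plus<> functor from functional.hpp and the kernel itself are assumptions for illustration; the commit itself only adds the primitives above.
// One 256-thread block (8 warps) computes an inclusive prefix sum in shared memory.
__global__ void prefixSum256(const int* in, int* out)
{
    __shared__ int smem[256];
    const int tid = threadIdx.x;
    smem[tid] = in[tid];
    __syncthreads();

    BlockScan<INCLUSIVE, int,
              WarpScan<INCLUSIVE, int, cv::gpu::device::plus<int> >,
              cv::gpu::device::plus<int> > scan;
    out[tid] = scan(smem);      // each thread gets its inclusive running sum
}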
#endif
\ No newline at end of file
......@@ -43,27 +43,27 @@
#ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#if defined(__CUDACC__)
#define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#if defined(__CUDACC__)
#define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __OPENCV_GPU_HOST_DEVICE__
#endif
#endif
namespace cv { namespace gpu
{
namespace cv { namespace gpu
{
namespace device
{
template<bool expr> struct Static {};
template<> struct Static<true>
{
__OPENCV_GPU_HOST_DEVICE__ static void check() {};
template<> struct Static<true>
{
__OPENCV_GPU_HOST_DEVICE__ static void check() {};
};
}
}
using ::cv::gpu::device::Static;
}}
#undef __OPENCV_GPU_HOST_DEVICE__
#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
......@@ -47,7 +47,7 @@
#include "utility.hpp"
#include "detail/transform_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D, typename UnOp, typename Mask>
static inline void transform(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
......
......@@ -45,11 +45,11 @@
#include "detail/type_traits_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template <typename T> struct IsSimpleParameter
{
enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
};
......@@ -65,16 +65,16 @@ namespace cv { namespace gpu { namespace device
enum { isVolatile = type_traits_detail::UnVolatile<T>::value };
enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
enum { isArith = isIntegral || isFloat };
enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };
typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
enum { isArith = isIntegral || isFloat };
enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };
typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
};
}}}
......
......@@ -47,17 +47,17 @@
#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
///////////////////////////////////////////////////////////////////////////////
// swap
template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
{
const T temp = a;
a = b;
......@@ -71,9 +71,9 @@ namespace cv { namespace gpu { namespace device
{
explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
__host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}
__device__ __forceinline__ bool operator()(int y, int x) const
{
{
return mask.ptr(y)[x] != 0;
}
......@@ -82,13 +82,13 @@ namespace cv { namespace gpu { namespace device
struct SingleMaskChannels
{
__host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
__host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
: mask(mask_), channels(channels_) {}
__host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
:mask(mask_.mask), channels(mask_.channels){}
__device__ __forceinline__ bool operator()(int y, int x) const
{
{
return mask.ptr(y)[x / channels] != 0;
}
......@@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
{
curMask = maskCollection[z];
}
__device__ __forceinline__ bool operator()(int y, int x) const
{
uchar val;
......@@ -165,20 +165,20 @@ namespace cv { namespace gpu { namespace device
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}
template <int n, typename T, typename V, typename Pred>
template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
template <int n, typename T, typename V1, typename V2, typename Pred>
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
......@@ -212,17 +212,17 @@ namespace cv { namespace gpu { namespace device
{
double invdet = 1.0 / det;
x[0] = saturate_cast<T>(invdet *
x[0] = saturate_cast<T>(invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));
x[1] = saturate_cast<T>(invdet *
x[1] = saturate_cast<T>(invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));
x[2] = saturate_cast<T>(invdet *
x[2] = saturate_cast<T>(invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
......
......@@ -47,7 +47,7 @@
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template <typename T> struct L1Dist
{
......@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
};
// calc distance between two vectors in global memory
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
for (int i = tid; i < len; i += THREAD_DIM)
......@@ -170,9 +170,9 @@ namespace cv { namespace gpu { namespace device
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
{
vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
dist.reduceAll<THREAD_DIM>(smem, tid);
}
......
......@@ -47,7 +47,7 @@
#include "vec_traits.hpp"
#include "functional.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
namespace vec_math_detail
{
......@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
}
namespace vec_math_detail
{
{
template <typename T1, typename T2> struct BinOpTraits
{
typedef int argument_type;
......@@ -326,5 +326,5 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
......@@ -45,7 +45,7 @@
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template<typename T, int N> struct TypeVec;
......@@ -219,18 +219,18 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
template<> struct VecTraits<char>
{
template<> struct VecTraits<char>
{
typedef char elem_type;
enum {cn=1};
enum {cn=1};
static __device__ __host__ __forceinline__ char all(char v) {return v;}
static __device__ __host__ __forceinline__ char make(char x) {return x;}
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
template<> struct VecTraits<schar>
{
typedef schar elem_type;
enum {cn=1};
enum {cn=1};
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
......
......@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
struct Warp
{
......@@ -64,18 +64,18 @@ namespace cv { namespace gpu { namespace device
template<typename It, typename T>
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
{
for(It t = beg + laneId(); t < end; t += STRIDE)
*t = value;
}
}
template<typename InIt, typename OutIt>
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
{
{
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
*out = *t;
return out;
}
}
template<typename InIt, typename OutIt, class UnOp>
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
......@@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
{
unsigned int lane = laneId();
InIt1 t1 = beg1 + lane;
InIt1 t1 = beg1 + lane;
InIt2 t2 = beg2 + lane;
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
*out = op(*t1, *t2);
......@@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
template<typename OutIt, typename T>
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
{
unsigned int lane = laneId();
unsigned int lane = laneId();
value += lane;
for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
......
......@@ -44,7 +44,32 @@
#ifndef HAVE_CUDA
void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
(void)src;
(void)dst;
(void)dsize;
(void)fx;
(void)fy;
(void)interpolation;
(void)s;
throw_nogpu();
}
void cv::gpu::resize(const GpuMat& src, GpuMat& dst,GpuMat& buffer, Size dsize,
double fx, double fy, int interpolation, Stream& s)
{
(void)src;
(void)dst;
(void)dsize;
(void)fx;
(void)fy;
(void)interpolation;
(void)buffer;
(void)s;
throw_nogpu();
}
#else // HAVE_CUDA
......