Commit e7f6c4b7 authored Jun 20, 2012 by Marina Kolpakova
scan operations are moved in separate header
parent 8748cbc2

Showing 28 changed files with 230 additions and 487 deletions
cmake/OpenCVDetectCUDA.cmake                           +0    -0
modules/gpu/include/opencv2/gpu/gpu.hpp                +0    -4
modules/gpu/perf/perf_imgproc.cpp                      +3    -3
modules/gpu/src/cuda/element_operations.cu             +4    -4
modules/gpu/src/cuda/matrix_reductions.cu              +1    -1
modules/gpu/src/cuda/resize.cu                         +2    -363
modules/gpu/src/cuda/split_merge.cu                    +18   -18
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu           +3    -3
modules/gpu/src/nvidia/core/NCV.cu                     +8    -8
modules/gpu/src/opencv2/gpu/device/common.hpp          +0    -0
modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp   +0    -0
modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp    +0    -0
modules/gpu/src/opencv2/gpu/device/emulation.hpp       +0    -0
modules/gpu/src/opencv2/gpu/device/funcattrib.hpp      +0    -0
modules/gpu/src/opencv2/gpu/device/functional.hpp      +24   -2
modules/gpu/src/opencv2/gpu/device/limits.hpp          +0    -0
modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp   +0    -0
modules/gpu/src/opencv2/gpu/device/scan.hpp            +167  -0
modules/gpu/src/opencv2/gpu/device/static_check.hpp    +0    -0
modules/gpu/src/opencv2/gpu/device/transform.hpp       +0    -0
modules/gpu/src/opencv2/gpu/device/type_traits.hpp     +0    -0
modules/gpu/src/opencv2/gpu/device/utility.hpp         +0    -0
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp    +0    -0
modules/gpu/src/opencv2/gpu/device/vec_math.hpp        +0    -0
modules/gpu/src/opencv2/gpu/device/vec_traits.hpp      +0    -0
modules/gpu/src/opencv2/gpu/device/warp.hpp            +0    -0
modules/gpu/src/resize.cpp                             +0    -42
modules/gpu/test/test_resize.cpp                       +0    -39
cmake/OpenCVDetectCUDA.cmake
modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -629,10 +629,6 @@ CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, doubl
     //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
     CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());

-    //! resizes the image
-    //! Supports INTER_AREA
-    CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx=0, double fy=0, int interpolation = INTER_AREA, Stream& stream = Stream::Null());
-
     //! warps the image using affine transformation
     //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
     CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
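After this hunk only the buffer-less overload remains in the public header. For orientation, a minimal call against it might look like the sketch below (our illustration against the 2.4-era cv::gpu API; file names and scale factors are invented, and a CUDA-enabled OpenCV build is assumed):

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat host = cv::imread("input.png");            // illustrative input image
        cv::gpu::GpuMat src(host), dst;                    // upload to the device

        // 4x INTER_AREA downscale through the surviving overload
        cv::gpu::resize(src, dst, cv::Size(), 0.25, 0.25, cv::INTER_AREA);

        cv::Mat out;
        dst.download(out);                                 // copy the result back to the host
        cv::imwrite("output.png", out);
        return 0;
    }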
modules/gpu/perf/perf_imgproc.cpp
@@ -118,10 +118,10 @@ GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
 INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
     ALL_DEVICES,
-    testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
-    testing::Values(MatType(CV_8UC1)/*, MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
+    testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     testing::Values(Scale(0.2), Scale(0.1), Scale(0.05))));

 //////////////////////////////////////////////////////////////////////
modules/gpu/src/cuda/element_operations.cu
@@ -1253,7 +1253,7 @@ namespace cv { namespace gpu { namespace device
         {
             const T val;
-            __host__ explicit CompareScalar(T val) : val(val) {}
+            __host__ explicit CompareScalar(T val_) : val(val_) {}
             __device__ __forceinline__ uchar operator()(T src) const
             {
@@ -1266,7 +1266,7 @@ namespace cv { namespace gpu { namespace device
         {
             const TYPE_VEC(T, 2) val;
-            __host__ explicit CompareScalar(TYPE_VEC(T, 2) val) : val(val) {}
+            __host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
             __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
             {
@@ -1281,7 +1281,7 @@ namespace cv { namespace gpu { namespace device
         {
             const TYPE_VEC(T, 3) val;
-            __host__ explicit CompareScalar(TYPE_VEC(T, 3) val) : val(val) {}
+            __host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
             __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
             {
@@ -1297,7 +1297,7 @@ namespace cv { namespace gpu { namespace device
         {
             const TYPE_VEC(T, 4) val;
-            __host__ explicit CompareScalar(TYPE_VEC(T, 4) val) : val(val) {}
+            __host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
             __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
             {
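These four hunks are the same mechanical rename, and the old code was already well-formed C++: in a constructor's mem-initializer list, the initializer expression in `val(val)` is looked up in the parameter scope, so the member was correctly initialized even while shadowed. A minimal illustration (hypothetical type, not from the diff):

    struct Wrapped
    {
        int val;
        explicit Wrapped(int val) : val(val) {}   // legal: member val initialized from parameter val
    };

The `val_` spelling simply keeps parameter and member lexically distinct and silences shadowing warnings such as gcc's -Wshadow.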
modules/gpu/src/cuda/matrix_reductions.cu
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
     struct Mask8U
     {
-        explicit Mask8U(PtrStepb mask): mask(mask) {}
+        explicit Mask8U(PtrStepb mask_): mask(mask_) {}
         __device__ __forceinline__ bool operator()(int y, int x) const
         {
modules/gpu/src/cuda/resize.cu
@@ -46,7 +46,8 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"
-# include <cfloat>
+#include <cfloat>
+#include <opencv2/gpu/device/scan.hpp>

 namespace cv { namespace gpu { namespace device
 {
@@ -285,367 +286,5 @@ namespace cv { namespace gpu { namespace device
typedef float scan_line_type;
};
// template <typename T>
// __global__ void resize_area_scan(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<T> buffer)
// {
// typedef typename scan_traits<T>::scan_line_type W;
// extern __shared__ W line[];
// const int x = threadIdx.x;
// const int y = blockIdx.x;
// if (y >= src.rows) return;
// int offset = 1;
// line[2 * x + 0] = src(y, 2 * x + 0);
// line[2 * x + 1] = src(y, 2 * x + 1);
// __syncthreads();//???
// // reduction
// for (int d = blockDim.x; d > 0; d >>= 1)
// {
// __syncthreads();
// if (x < d)
// {
// int ai = 2 * x * offset -1 + 1 * offset;
// int bi = 2 * x * offset -1 + 2 * offset;
// line[bi] += line[ai];
// }
// offset *= 2;
// }
// __syncthreads();
// // convolution
// if (x == 0) { line[(blockDim.x << 1) - 1] = 0; printf("offset: %d!!!!!!!!!!!!!\n", fx);}
// for (int d = 1; d < (blockDim.x << 1); d *= 2)
// {
// offset >>= 1;
// __syncthreads();
// if (x < d)
// {
// int ai = offset * 2 * x + 1 * offset - 1;
// int bi = offset * 2 * x + 2 * offset - 1;
// W t = line[ai];
// line[ai] = line[bi];
// line[bi] += t;
// }
// }
// __syncthreads();
// // calculate sum
// int start = 0;
// int out_idx = 0;
// int end = start + fx;
// while (start < (blockDim.x << 1) && end < (blockDim.x << 1))
// {
// buffer(y, out_idx) = saturate_cast<T>((line[end] - line[start]) / fx);
// start = end;
// end = start + fx;
// out_idx++;
// }
// }
template <typename T>
__device__ void scan_y(DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,int fx, int fy, DevMem2D_<T> dst,
typename scan_traits<T>::scan_line_type* line, int g_base)
{
typedef typename scan_traits<T>::scan_line_type W;
const int y = threadIdx.x;
const int x = blockIdx.x;
float scale = 1.f / (fx * fy);
if (x >= buffer.cols) return;
int offset = 1;
line[2 * y + 0] = buffer((g_base * fy) + 2 * y + 1, x);
if (y != (blockDim.x -1) )
line[2 * y + 1] = buffer((g_base * fy) + 2 * y + 2, x);
else
line[2 * y + 1] = 0;
__syncthreads();
// reduction
for (int d = blockDim.x; d > 0; d >>= 1)
{
__syncthreads();
if (y < d)
{
int ai = 2 * y * offset -1 + 1 * offset;
int bi = 2 * y * offset -1 + 2 * offset;
line[bi] += line[ai];
}
offset *= 2;
}
__syncthreads();
// convolution
if (y == 0) line[(blockDim.x << 1) - 1] = (W)buffer(0, x);
for (int d = 1; d < (blockDim.x << 1); d *= 2)
{
offset >>= 1;
__syncthreads();
if (y < d)
{
int ai = offset * 2 * y + 1 * offset - 1;
int bi = offset * 2 * y + 2 * offset - 1;
W t = line[ai];
line[ai] = line[bi];
line[bi] += t;
}
}
__syncthreads();
if (y < dst.rows)
{
W start = (y == 0)? (W)0:line[y * fy -1];
W end = line[y * fy + fy - 1];
dst(g_base + y ,x) = saturate_cast<T>((end - start) * scale);
}
}
template <typename T>
__device__ void scan_x(const DevMem2D_<T> src, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,
typename scan_traits<T>::scan_line_type* line, int g_base)
{
typedef typename scan_traits<T>::scan_line_type W;
const int x = threadIdx.x;
const int y = blockIdx.x;
float scale = 1.f / (fx * fy);
if (y >= src.rows) return;
int offset = 1;
line[2 * x + 0] = (W)src(y, (g_base * fx) + 2 * x + 1);
if (x != (blockDim.x -1) )
line[2 * x + 1] = (W)src(y, (g_base * fx) + 2 * x + 2);
else
line[2 * x + 1] = 0;
__syncthreads();
// reduction
for (int d = blockDim.x; d > 0; d >>= 1)
{
__syncthreads();
if (x < d)
{
int ai = 2 * x * offset -1 + 1 * offset;
int bi = 2 * x * offset -1 + 2 * offset;
line[bi] += line[ai];
}
offset *= 2;
}
__syncthreads();
// convolution
if (x == 0) line[(blockDim.x << 1) - 1] = (W)src(y, 0);
for (int d = 1; d < (blockDim.x << 1); d *= 2)
{
offset >>= 1;
__syncthreads();
if (x < d)
{
int ai = offset * 2 * x + 1 * offset - 1;
int bi = offset * 2 * x + 2 * offset - 1;
W t = line[ai];
line[ai] = line[bi];
line[bi] += t;
}
}
__syncthreads();
if (x < buffer.cols)
{
W start = (x == 0)? (W)0:line[x * fx -1];
W end = line[x * fx + fx - 1];
buffer(y, g_base + x) =(end - start);
}
}
enum ScanKind { exclusive, inclusive } ;
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_warp ( volatile T *ptr , const unsigned int idx = threadIdx.x )
{
const unsigned int lane = idx & 31;
if ( lane >= 1) ptr [idx ] = ptr [idx - 1] + ptr [idx];
if ( lane >= 2) ptr [idx ] = ptr [idx - 2] + ptr [idx];
if ( lane >= 4) ptr [idx ] = ptr [idx - 4] + ptr [idx];
if ( lane >= 8) ptr [idx ] = ptr [idx - 8] + ptr [idx];
if ( lane >= 16) ptr [idx ] = ptr [idx - 16] + ptr [idx];
if( Kind == inclusive )
return ptr [idx ];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_block( volatile T *ptr)
{
const unsigned int idx = threadIdx.x;
const unsigned int lane = idx & 31;
const unsigned int warp = idx >> 5;
T val = scan_warp <Kind>( ptr , idx );
__syncthreads ();
if( lane == 31 )
ptr [ warp ] = ptr [idx ];
__syncthreads ();
if( warp == 0 )
scan_warp<inclusive>( ptr , idx );
__syncthreads ();
if ( warp > 0)
val = ptr [warp -1] + val;
__syncthreads ();
ptr[idx] = val;
__syncthreads ();
return val ;
}
template<typename T, typename W>
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride)
{
extern __shared__ W sbuf[];
const unsigned int tid = threadIdx.x;
// load line-block on shared memory
int y = blockIdx.x / thred_lines;
int input_stride = (blockIdx.x % thred_lines) * stride;
int x = input_stride + tid;
// store global data in shared memory
if (x < src.cols && y < src.rows)
sbuf[tid] = src(y, x);
else
sbuf[tid] = 0;
__syncthreads();
scan_block<inclusive, W>(sbuf);
float scale = __fdividef(1.f, fx);
int out_stride = input_stride / fx;
int count = blockDim.x / fx;
if (tid < count)
{
int start_idx = (tid == 0)? 0 : tid * fx - 1;
int end_idx = tid * fx + fx - 1;
W start = (tid == 0)? (W)0:sbuf[start_idx];
W end = sbuf[end_idx];
dst(y, out_stride + tid) = (end - start);
}
}
template<typename T, typename W>
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride)
{
extern __shared__ W sbuf[];
const unsigned int tid = threadIdx.x;
// load line-block on shared memory
int x = blockIdx.x / thred_lines;
int global_stride = (blockIdx.x % thred_lines) * stride;
int y = global_stride + tid;
// store global data in shared memory
if (x < src.cols && y < src.rows)
sbuf[tid] = src(y, x);
else
sbuf[tid] = 0;
__syncthreads();
scan_block<inclusive, W>(sbuf);
float scale = __fdividef(1.f, fx * fy);
int out_stride = global_stride / fx;
int count = blockDim.x / fx;
if (tid < count)
{
int start_idx = (tid == 0)? 0 : tid * fx - 1;
int end_idx = tid * fx + fx - 1;
W start = (tid == 0)? (W)0:sbuf[start_idx];
W end = sbuf[end_idx];
dst(out_stride + tid, x) = saturate_cast<T>((end - start) * scale);
}
}
template <typename T>
void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,
int interpolation, DevMem2Df buffer, cudaStream_t stream)
{
(void)interpolation;
int iscale_x = round(fx);
int iscale_y = round(fy);
int warps = 4;
const int threads = 32 * warps;
int input_stride = threads / iscale_x;
int thred_lines = divUp(src.cols, input_stride * iscale_x);
int blocks = src.rows * thred_lines;
typedef typename scan_traits<T>::scan_line_type smem_type;
resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
(src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x);
input_stride = threads / iscale_y;
thred_lines = divUp(src.rows, input_stride * iscale_y);
blocks = dst.cols * thred_lines;
resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
(buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
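All the machinery deleted above (scan_x/scan_y, the scan_warp/scan_block helpers, and the resise_scan_fast_* kernels) exploits one identity: given an inclusive prefix sum S of a line, the sum over any window [a, b] is S[b] - S[a-1], so an integer-factor INTER_AREA reduction costs two reads per output pixel. A host-side sketch of that identity (plain C++, data invented for the demo):

    #include <vector>
    #include <cassert>

    int main()
    {
        const int fx = 4;                             // integer downscale factor
        std::vector<float> row(16, 1.0f);             // one image row, constant for the demo

        // inclusive prefix sum of the row
        std::vector<float> S(row.size());
        float acc = 0.f;
        for (size_t i = 0; i < row.size(); ++i)
            S[i] = (acc += row[i]);

        // each output sample is a window sum divided by the window length
        std::vector<float> out(row.size() / fx);
        for (size_t o = 0; o < out.size(); ++o)
        {
            float start = (o == 0) ? 0.f : S[o * fx - 1];
            float end   = S[o * fx + fx - 1];
            out[o] = (end - start) / fx;              // box average over fx input pixels
        }

        assert(out[0] == 1.0f);                       // averaging a constant row is a no-op
        return 0;
    }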
modules/gpu/src/cuda/split_merge.cu
@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        mergeC2_<T><<<grid, block, 0, stream>>>(
             src[0].data, src[0].step,
             src[1].data, src[1].step,
             dst.rows, dst.cols, dst.data, dst.step);
@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        mergeC3_<T><<<grid, block, 0, stream>>>(
             src[0].data, src[0].step,
             src[1].data, src[1].step,
             src[2].data, src[2].step,
@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        mergeC4_<T><<<grid, block, 0, stream>>>(
             src[0].data, src[0].step,
             src[1].data, src[1].step,
             src[2].data, src[2].step,
@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+        splitC2_<T><<<grid, block, 0, stream>>>(
             src.data, src.step, src.rows, src.cols,
             dst[0].data, dst[0].step,
             dst[1].data, dst[1].step);
@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+        splitC3_<T><<<grid, block, 0, stream>>>(
             src.data, src.step, src.rows, src.cols,
             dst[0].data, dst[0].step,
             dst[1].data, dst[1].step,
@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+        splitC4_<T><<<grid, block, 0, stream>>>(
             src.data, src.step, src.rows, src.cols,
             dst[0].data, dst[0].step,
             dst[1].data, dst[1].step,
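The only substantive change in these six launchers is the rename of the local blockDim/gridDim variables to block/grid, which avoids reusing the names of CUDA's built-in device variables blockDim and gridDim. The launch geometry itself is the usual ceiling division; a standalone sketch of the pattern (divUp mirrors the helper used in the hunks above):

    #include <cuda_runtime.h>

    // ceiling division: how many tiles of size grain cover total elements
    static int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    void launchGeometry(int rows, int cols, cudaStream_t stream)
    {
        dim3 block(32, 8);                                      // 256 threads per block
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y));  // enough blocks to cover the image
        // kernel<<<grid, block, 0, stream>>>(...);             // launch site; kernel omitted here
        (void)stream;
    }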
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
@@ -1121,18 +1121,18 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
 dim3 p_threads(32, 8);

-NcvSize32u srcSize (kLevelWidth, kLevelHeight);
+NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);
 NcvSize32u dstSize (nw, nh);
 NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
 NcvRect32u dstROI (0, 0, nw, nh);

-ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
     ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

 ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

-ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
     ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

 ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
modules/gpu/src/nvidia/core/NCV.cu
@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
 //===================================================================

-NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
     begin(NULL),
     end(NULL),
     _memType(NCVMemoryTypeNone),
-    _alignment(alignment),
+    _alignment(alignment_),
     bReusesMemory(false)
 {
-    NcvBool bProperAlignment = (alignment & (alignment - 1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
 }

-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)
     :
     currentSize(0),
     _maxSize(0),
     allocBegin(NULL),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
-    NcvBool bProperAlignment = (alignment & (alignment - 1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
     ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
 //===================================================================

-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
     ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
 }
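The alignment check touched by these hunks uses the standard power-of-two bit trick: a power of two has exactly one set bit, so x & (x - 1) clears it and yields 0 precisely for powers of two (16 & 15 == 0, while 12 & 11 == 8). As a standalone predicate (our sketch, not NCV code):

    // true for 1, 2, 4, 8, ...; like the NCV check, it also accepts 0
    static bool isPowerOfTwo(unsigned int x) { return (x & (x - 1)) == 0; }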
modules/gpu/src/opencv2/gpu/device/common.hpp

modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp

modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp

modules/gpu/src/opencv2/gpu/device/emulation.hpp

modules/gpu/src/opencv2/gpu/device/funcattrib.hpp

modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -416,6 +416,8 @@ namespace cv { namespace gpu { namespace device
     {
         return src1 * src1 + src2 * src2;
     }
+    __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
+    __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
 };

 // Saturate Cast Functor
@@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
     {
         return (src > thresh) * maxVal;
     }
+    __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other) : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
@@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
     {
         return (src <= thresh) * maxVal;
     }
+    __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other) : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
@@ -523,8 +527,12 @@ namespace cv { namespace gpu { namespace device
     {
         return !pred(x);
     }
+    __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
+    __device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}

     const Predicate pred;
 };

 template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
 {
     return unary_negate<Predicate>(pred);
@@ -534,13 +542,20 @@ namespace cv { namespace gpu { namespace device
 {
     explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}

     __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
                                                typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
     {
         return !pred(x, y);
     }
+    __device__ __forceinline__ binary_negate(const binary_negate& other) : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
+    __device__ __forceinline__ binary_negate() : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}

     const Predicate pred;
 };

 template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
 {
     return binary_negate<BinaryPredicate>(pred);
@@ -555,9 +570,13 @@ namespace cv { namespace gpu { namespace device
     {
         return op(arg1, a);
     }
+    __device__ __forceinline__ binder1st(const binder1st& other) : unary_function<typename Op::second_argument_type, typename Op::result_type>(){}

     const Op op;
     const typename Op::first_argument_type arg1;
 };

 template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
 {
     return binder1st<Op>(op, typename Op::first_argument_type(x));
@@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
     {
         return op(a, arg2);
     }
+    __device__ __forceinline__ binder2nd(const binder2nd& other) : unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}

     const Op op;
     const typename Op::second_argument_type arg2;
 };

 template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
 {
     return binder2nd<Op>(op, typename Op::second_argument_type(x));
 }

 // Functor Traits
 template <typename F> struct IsUnaryFunction
 {
     typedef char Yes;
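Every addition in functional.hpp follows one pattern: the functors gain default and copy constructors that are explicitly __device__-qualified, so instances can be created and copied inside kernel code rather than only on the host. A stripped-down illustration of why the qualifier matters (hypothetical functor, not from the header):

    template <typename T> struct scale_func
    {
        T s;
        __device__ __forceinline__ explicit scale_func(T s_) : s(s_) {}
        // device-visible copy constructor, mirroring what this commit adds
        __device__ __forceinline__ scale_func(const scale_func& other) : s(other.s) {}
        __device__ __forceinline__ T operator()(T x) const { return s * x; }
    };

    __global__ void apply(const float* in, float* out, int n)
    {
        scale_func<float> op(2.0f);       // constructed on the device
        scale_func<float> local(op);      // copied on the device: needs the __device__ copy ctor
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = local(in[i]);
    }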
modules/gpu/src/opencv2/gpu/device/limits.hpp

modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp

modules/gpu/src/opencv2/gpu/device/scan.hpp (new file, 0 → 100644)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SCAN_HPP__
#define __OPENCV_GPU_SCAN_HPP__
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };

template <ScanKind Kind, typename T, typename F> struct WarpScan
{
    __device__ __forceinline__ WarpScan() {}
    __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
    {
        const unsigned int lane = idx & 31;
        F op;

        if (lane >= 1)  ptr[idx] = op(ptr[idx - 1],  ptr[idx]);
        if (lane >= 2)  ptr[idx] = op(ptr[idx - 2],  ptr[idx]);
        if (lane >= 4)  ptr[idx] = op(ptr[idx - 4],  ptr[idx]);
        if (lane >= 8)  ptr[idx] = op(ptr[idx - 8],  ptr[idx]);
        if (lane >= 16) ptr[idx] = op(ptr[idx - 16], ptr[idx]);

        if (Kind == INCLUSIVE)
            return ptr[idx];
        else
            return (lane > 0) ? ptr[idx - 1] : 0;
    }

    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return tid;
    }

    __device__ __forceinline__ void init(volatile T* ptr){}

    static const int warp_offset = 0;

    typedef WarpScan<INCLUSIVE, T, F> merge;
};

template <ScanKind Kind, typename T, typename F> struct WarpScanNoComp
{
    __device__ __forceinline__ WarpScanNoComp() {}
    __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
    {
        const unsigned int lane = threadIdx.x & 31;
        F op;

        ptr[idx] = op(ptr[idx - 1],  ptr[idx]);
        ptr[idx] = op(ptr[idx - 2],  ptr[idx]);
        ptr[idx] = op(ptr[idx - 4],  ptr[idx]);
        ptr[idx] = op(ptr[idx - 8],  ptr[idx]);
        ptr[idx] = op(ptr[idx - 16], ptr[idx]);

        if (Kind == INCLUSIVE)
            return ptr[idx];
        else
            return (lane > 0) ? ptr[idx - 1] : 0;
    }

    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
    }

    __device__ __forceinline__ void init(volatile T* ptr)
    {
        ptr[threadIdx.x] = 0;
    }

    static const int warp_smem_stride = 32 + 16 + 1;
    static const int warp_offset      = 16;
    static const int warp_log         = 5;
    static const int warp_mask        = 31;

    typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
};

template <ScanKind Kind, typename T, typename Sc, typename F> struct BlockScan
{
    __device__ __forceinline__ BlockScan() {}
    __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr)
    {
        const unsigned int tid  = threadIdx.x;
        const unsigned int lane = tid & warp_mask;
        const unsigned int warp = tid >> warp_log;

        Sc scan;
        typename Sc::merge merge_scan;
        const unsigned int idx = scan.index(tid);

        T val = scan(ptr, idx);
        __syncthreads();

        if (warp == 0)
            scan.init(ptr);
        __syncthreads();

        if (lane == 31)
            ptr[scan.warp_offset + warp] = (Kind == INCLUSIVE) ? val : ptr[idx];
        __syncthreads();

        if (warp == 0)
            merge_scan(ptr, idx);
        __syncthreads();

        if (warp > 0)
            val = ptr[scan.warp_offset + warp - 1] + val;
        __syncthreads();

        ptr[idx] = val;
        __syncthreads();

        return val;
    }

    static const int warp_log  = 5;
    static const int warp_mask = 31;
};
#endif
\ No newline at end of file
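A minimal kernel driving the new header might look like the sketch below. This is our illustration, not part of the commit: the Add functor, buffer size, and 256-thread launch are assumptions, and the shared buffer must hold exactly one element per thread with the block size a multiple of the 32-lane warp:

    #include "opencv2/gpu/device/scan.hpp"

    struct Add
    {
        __device__ __forceinline__ int operator()(int a, int b) const { return a + b; }
    };

    // inclusive prefix sum over one 256-element tile per block
    __global__ void scanTile(const int* in, int* out)
    {
        __shared__ int buf[256];
        const int tid = threadIdx.x;
        const int gid = blockIdx.x * blockDim.x + tid;

        buf[tid] = in[gid];
        __syncthreads();

        BlockScan<INCLUSIVE, int, WarpScan<INCLUSIVE, int, Add>, Add> scan;
        out[gid] = scan(buf);    // each thread receives its inclusive prefix
    }

    // launch: scanTile<<<numTiles, 256>>>(d_in, d_out);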
modules/gpu/src/opencv2/gpu/device/static_check.hpp

modules/gpu/src/opencv2/gpu/device/transform.hpp

modules/gpu/src/opencv2/gpu/device/type_traits.hpp

modules/gpu/src/opencv2/gpu/device/utility.hpp

modules/gpu/src/opencv2/gpu/device/vec_distance.hpp

modules/gpu/src/opencv2/gpu/device/vec_math.hpp

modules/gpu/src/opencv2/gpu/device/vec_traits.hpp

modules/gpu/src/opencv2/gpu/device/warp.hpp
modules/gpu/src/resize.cpp
@@ -80,51 +80,9 @@ namespace cv { namespace gpu { namespace device
         template <typename T> void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
             DevMem2Db dst, int interpolation, cudaStream_t stream);

-        template <typename T> void resize_area_gpu(const DevMem2Db src, DevMem2Db dst, float fx, float fy,
-            int interpolation, DevMem2Df buffer, cudaStream_t stream);
     }
 }}}

-void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx, double fy, int interpolation, Stream& s)
-{
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(interpolation == INTER_AREA);
-    CV_Assert((fx < 1.0) && (fy < 1.0));
-    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
-    CV_Assert(src.cols >= 128 && src.rows >= 128);
-    CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0);
-
-    if (dsize == Size())
-        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
-    else
-    {
-        fx = static_cast<double>(dsize.width) / src.cols;
-        fy = static_cast<double>(dsize.height) / src.rows;
-    }
-
-    fx = static_cast<float>(1.0 / fx);
-    fy = static_cast<float>(1.0 / fy);
-
-    dst.create(dsize, src.type());
-    buffer.create(cv::Size(dsize.width, src.rows), CV_32FC1);
-
-    if (dsize == src.size())
-    {
-        if (s)
-            s.enqueueCopy(src, dst);
-        else
-            src.copyTo(dst);
-        return;
-    }
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    cv::gpu::device::imgproc::resize_area_gpu<uchar>(src, dst, fx, fy, interpolation, buffer, stream);
-}
-
 void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
 {
     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
modules/gpu/test/test_resize.cpp
@@ -182,45 +182,6 @@ PARAM_TEST_CASE(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, double, Inte
     }
 };

-TEST_P(ResizeArea, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
-    cv::gpu::GpuMat buffer = createMat(cv::Size(dst.cols, src.rows), CV_32FC1);
-
-    cv::gpu::resize(loadMat(src, useRoi), dst, buffer, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat dst_cpu;
-    cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat gpu_buff;
-    buffer.download(gpu_buff);
-
-    cv::Mat gpu;
-    dst.download(gpu);
-
-//    std::cout // << src
-//              // << std::endl << std::endl
-//              // << gpu_buff
-//              // << std::endl << std::endl
-//              << gpu
-//              << std::endl << std::endl
-//              << dst_cpu << std::endl;
-
-    EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(cv::Size(640, 480)), //DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1)/*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
-    testing::Values(0.05, 0.1),
-    testing::Values(Interpolation(cv::INTER_AREA)),
-    WHOLE_SUBMAT));

 ///////////////////////////////////////////////////////////////////
 // Test NPP