submodule / opencv / Commits

Commit 7839dbd2
authored Aug 27, 2013 by Vladislav Vinogradov
used new device layer for cv::gpu::integral
parent 224f18b0
Showing 6 changed files with 167 additions and 614 deletions.
    modules/cudaarithm/perf/perf_arithm.cpp         +0    -57
    modules/cudaarithm/perf/perf_reductions.cpp     +57   -0
    modules/cudaarithm/src/cuda/integral.cu         +36   -408
    modules/cudaarithm/src/reductions.cpp           +0    -112
    modules/cudaarithm/test/test_arithm.cpp         +0    -37
    modules/cudaarithm/test/test_reductions.cpp     +74   -0
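In short: the commit deletes the hand-written warp-shuffle integral kernels and their NPP fallback, reimplements both cv::cuda::integral and cv::cuda::sqrIntegral on top of the new cudev device layer (gridIntegral), and moves the perf and accuracy tests from the arithm suites into the reductions suites. For reference (the standard definition, matching what cv::integral computes on the CPU), the destination is (rows+1) x (cols+1) with

    sum(X, Y) = \sum_{x < X,\; y < Y} src(x, y)

so the first row and first column are all zeros; that is why the new implementation zeroes dst and then copies the grid result into dst(Rect(1, 1, src.cols, src.rows)).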
modules/cudaarithm/perf/perf_arithm.cpp (view file @ 7839dbd2)
@@ -248,60 +248,3 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
         CPU_SANITY_CHECK(dst);
     }
 }
-
-//////////////////////////////////////////////////////////////////////
-// Integral
-
-PERF_TEST_P(Sz, Integral,
-            CUDA_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    cv::Mat src(size, CV_8UC1);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_buf;
-
-        TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
-
-        CUDA_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::integral(src, dst);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// IntegralSqr
-
-PERF_TEST_P(Sz, IntegralSqr,
-            CUDA_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    cv::Mat src(size, CV_8UC1);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat dst, buf;
-
-        TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
-
-        CUDA_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
modules/cudaarithm/perf/perf_reductions.cpp (view file @ 7839dbd2)
@@ -465,3 +465,60 @@ PERF_TEST_P(Sz, MeanStdDev,
         SANITY_CHECK(cpu_stddev);
     }
 }
+
+//////////////////////////////////////////////////////////////////////
+// Integral
+
+PERF_TEST_P(Sz, Integral,
+            CUDA_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+
+    cv::Mat src(size, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_CUDA())
+    {
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;
+        cv::cuda::GpuMat d_buf;
+
+        TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
+
+        CUDA_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::integral(src, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// IntegralSqr
+
+PERF_TEST_P(Sz, IntegralSqr,
+            CUDA_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+
+    cv::Mat src(size, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_CUDA())
+    {
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst, buf;
+
+        TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
+
+        CUDA_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
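These benchmarks run inside OpenCV's perf framework (PERF_TEST_P / TEST_CYCLE / the sanity-check macros), so they are not runnable on their own. Outside that harness, a minimal hand-rolled timing of the same call might look like the sketch below; this is an illustration only (arbitrary image size, coarse host-side timing against the null stream), not part of the commit:

    #include <iostream>
    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::Mat src(1080, 1920, CV_8UC1);
        cv::randu(src, 0, 256);

        cv::cuda::GpuMat d_src(src), d_dst, d_buf;

        // warm-up call so allocations and module loading are not timed
        cv::cuda::integral(d_src, d_dst, d_buf);

        const int iters = 100;
        const int64 t0 = cv::getTickCount();
        for (int i = 0; i < iters; ++i)
            cv::cuda::integral(d_src, d_dst, d_buf);
        cv::cuda::Stream::Null().waitForCompletion();   // flush pending GPU work
        const double ms = (cv::getTickCount() - t0) * 1000.0 / (cv::getTickFrequency() * iters);

        std::cout << "cv::cuda::integral: " << ms << " ms/iter\n";
    }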
modules/cudaarithm/src/cuda/integral.cu (view file @ 7839dbd2)
@@ -40,433 +40,61 @@
 //
 //M*/
 
 #if !defined CUDA_DISABLER
 
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace cuda { namespace device
-{
-    namespace imgproc
-    {
-        // Utility function to extract unsigned chars from an unsigned integer
-        __device__ uchar4 int_to_uchar4(unsigned int in)
-        {
-            uchar4 bytes;
-            bytes.x = (in & 0x000000ff) >> 0;
-            bytes.y = (in & 0x0000ff00) >> 8;
-            bytes.z = (in & 0x00ff0000) >> 16;
-            bytes.w = (in & 0xff000000) >> 24;
-            return bytes;
-        }
-
-        __global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
-        {
-        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
-            __shared__ int sums[128];
-
-            const int id = threadIdx.x;
-            const int lane_id = id % warpSize;
-            const int warp_id = id / warpSize;
-
-            const uint4 data = img(blockIdx.x, id);
-
-            const uchar4 a = int_to_uchar4(data.x);
-            const uchar4 b = int_to_uchar4(data.y);
-            const uchar4 c = int_to_uchar4(data.z);
-            const uchar4 d = int_to_uchar4(data.w);
-
-            int result[16];
-
-            result[0] = a.x;
-            result[1] = result[0] + a.y;
-            result[2] = result[1] + a.z;
-            result[3] = result[2] + a.w;
-
-            result[4] = result[3] + b.x;
-            result[5] = result[4] + b.y;
-            result[6] = result[5] + b.z;
-            result[7] = result[6] + b.w;
-
-            result[8] = result[7] + c.x;
-            result[9] = result[8] + c.y;
-            result[10] = result[9] + c.z;
-            result[11] = result[10] + c.w;
-
-            result[12] = result[11] + d.x;
-            result[13] = result[12] + d.y;
-            result[14] = result[13] + d.z;
-            result[15] = result[14] + d.w;
-
-            int sum = result[15];
-
-            // the prefix sum for each thread's 16 value is computed,
-            // now the final sums (result[15]) need to be shared
-            // with the other threads and add. To do this,
-            // the __shfl_up() instruction is used and a shuffle scan
-            // operation is performed to distribute the sums to the correct
-            // threads
-            #pragma unroll
-            for (int i = 1; i < 32; i *= 2)
-            {
-                const int n = __shfl_up(sum, i, 32);
-
-                if (lane_id >= i)
-                {
-                    #pragma unroll
-                    for (int i = 0; i < 16; ++i)
-                        result[i] += n;
-
-                    sum += n;
-                }
-            }
-
-            // Now the final sum for the warp must be shared
-            // between warps. This is done by each warp
-            // having a thread store to shared memory, then
-            // having some other warp load the values and
-            // compute a prefix sum, again by using __shfl_up.
-            // The results are uniformly added back to the warps.
-            // last thread in the warp holding sum of the warp
-            // places that in shared
-            if (threadIdx.x % warpSize == warpSize - 1)
-                sums[warp_id] = result[15];
-
-            __syncthreads();
-
-            if (warp_id == 0)
-            {
-                int warp_sum = sums[lane_id];
-
-                #pragma unroll
-                for (int i = 1; i <= 32; i *= 2)
-                {
-                    const int n = __shfl_up(warp_sum, i, 32);
-
-                    if (lane_id >= i)
-                        warp_sum += n;
-                }
-
-                sums[lane_id] = warp_sum;
-            }
-
-            __syncthreads();
-
-            int blockSum = 0;
-
-            // fold in unused warp
-            if (warp_id > 0)
-            {
-                blockSum = sums[warp_id - 1];
-
-                #pragma unroll
-                for (int i = 0; i < 16; ++i)
-                    result[i] += blockSum;
-            }
-
-            // assemble result
-            // Each thread has 16 values to write, which are
-            // now integer data (to avoid overflow). Instead of
-            // each thread writing consecutive uint4s, the
-            // approach shown here experiments using
-            // the shuffle command to reformat the data
-            // inside the registers so that each thread holds
-            // consecutive data to be written so larger contiguous
-            // segments can be assembled for writing.
-
-            /*
-                For example data that needs to be written as
-                GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
-                but is stored in registers (r0..r3), in four threads (0..3) as:
-
-                threadId 0  1  2  3
-                  r0     x0 y0 z0 w0
-                  r1     x1 y1 z1 w1
-                  r2     x2 y2 z2 w2
-                  r3     x3 y3 z3 w3
-
-                after apply __shfl_xor operations to move data between registers r1..r3:
-
-                threadId 00 01 10 11
-                          x0 y0 z0 w0
-                 xor(01)->y1 x1 w1 z1
-                 xor(10)->z2 w2 x2 y2
-                 xor(11)->w3 z3 y3 x3
-
-                and now x0..x3, and z0..z3 can be written out in order by all threads.
-
-                In the current code, each register above is actually representing
-                four integers to be written as uint4's to GMEM.
-            */
-
-            result[4]  = __shfl_xor(result[4] , 1, 32);
-            result[5]  = __shfl_xor(result[5] , 1, 32);
-            result[6]  = __shfl_xor(result[6] , 1, 32);
-            result[7]  = __shfl_xor(result[7] , 1, 32);
-
-            result[8]  = __shfl_xor(result[8] , 2, 32);
-            result[9]  = __shfl_xor(result[9] , 2, 32);
-            result[10] = __shfl_xor(result[10], 2, 32);
-            result[11] = __shfl_xor(result[11], 2, 32);
-
-            result[12] = __shfl_xor(result[12], 3, 32);
-            result[13] = __shfl_xor(result[13], 3, 32);
-            result[14] = __shfl_xor(result[14], 3, 32);
-            result[15] = __shfl_xor(result[15], 3, 32);
-
-            uint4* integral_row = integral.ptr(blockIdx.x);
-            uint4 output;
-
-            ///////
-
-            if (threadIdx.x % 4 == 0)
-                output = make_uint4(result[0], result[1], result[2], result[3]);
-
-            if (threadIdx.x % 4 == 1)
-                output = make_uint4(result[4], result[5], result[6], result[7]);
-
-            if (threadIdx.x % 4 == 2)
-                output = make_uint4(result[8], result[9], result[10], result[11]);
-
-            if (threadIdx.x % 4 == 3)
-                output = make_uint4(result[12], result[13], result[14], result[15]);
-
-            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
-
-            ///////
-
-            if (threadIdx.x % 4 == 2)
-                output = make_uint4(result[0], result[1], result[2], result[3]);
-
-            if (threadIdx.x % 4 == 3)
-                output = make_uint4(result[4], result[5], result[6], result[7]);
-
-            if (threadIdx.x % 4 == 0)
-                output = make_uint4(result[8], result[9], result[10], result[11]);
-
-            if (threadIdx.x % 4 == 1)
-                output = make_uint4(result[12], result[13], result[14], result[15]);
-
-            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
-
-            // continuning from the above example,
-            // this use of __shfl_xor() places the y0..y3 and w0..w3 data
-            // in order.
-            #pragma unroll
-            for (int i = 0; i < 16; ++i)
-                result[i] = __shfl_xor(result[i], 1, 32);
-
-            if (threadIdx.x % 4 == 0)
-                output = make_uint4(result[0], result[1], result[2], result[3]);
-
-            if (threadIdx.x % 4 == 1)
-                output = make_uint4(result[4], result[5], result[6], result[7]);
-
-            if (threadIdx.x % 4 == 2)
-                output = make_uint4(result[8], result[9], result[10], result[11]);
-
-            if (threadIdx.x % 4 == 3)
-                output = make_uint4(result[12], result[13], result[14], result[15]);
-
-            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
-
-            ///////
-
-            if (threadIdx.x % 4 == 2)
-                output = make_uint4(result[0], result[1], result[2], result[3]);
-
-            if (threadIdx.x % 4 == 3)
-                output = make_uint4(result[4], result[5], result[6], result[7]);
-
-            if (threadIdx.x % 4 == 0)
-                output = make_uint4(result[8], result[9], result[10], result[11]);
-
-            if (threadIdx.x % 4 == 1)
-                output = make_uint4(result[12], result[13], result[14], result[15]);
-
-            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
-        #endif
-        }
-
-        // This kernel computes columnwise prefix sums. When the data input is
-        // the row sums from above, this completes the integral image.
-        // The approach here is to have each block compute a local set of sums.
-        // First, the data covered by the block is loaded into shared memory,
-        // then instead of performing a sum in shared memory using __syncthreads
-        // between stages, the data is reformatted so that the necessary sums
-        // occur inside warps and the shuffle scan operation is used.
-        // The final set of sums from the block is then propgated, with the block
-        // computing "down" the image and adding the running sum to the local
-        // block sums.
-        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
-        {
-        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
-            __shared__ unsigned int sums[32][9];
-
-            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
-            const int lane_id = tidx % 8;
-
-            if (tidx >= integral.cols)
-                return;
-
-            sums[threadIdx.x][threadIdx.y] = 0;
-            __syncthreads();
-
-            unsigned int stepSum = 0;
-
-            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
-            {
-                unsigned int* p = integral.ptr(y) + tidx;
-
-                unsigned int sum = *p;
-
-                sums[threadIdx.x][threadIdx.y] = sum;
-                __syncthreads();
-
-                // place into SMEM
-                // shfl scan reduce the SMEM, reformating so the column
-                // sums are computed in a warp
-                // then read out properly
-                const int j = threadIdx.x % 8;
-                const int k = threadIdx.x / 8 + threadIdx.y * 4;
-
-                int partial_sum = sums[k][j];
-
-                for (int i = 1; i <= 8; i *= 2)
-                {
-                    int n = __shfl_up(partial_sum, i, 32);
-
-                    if (lane_id >= i)
-                        partial_sum += n;
-                }
-
-                sums[k][j] = partial_sum;
-                __syncthreads();
-
-                if (threadIdx.y > 0)
-                    sum += sums[threadIdx.x][threadIdx.y - 1];
-
-                sum += stepSum;
-                stepSum += sums[threadIdx.x][blockDim.y - 1];
-
-                __syncthreads();
-
-                *p = sum;
-            }
-        #endif
-        }
-
-        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
-        {
-            {
-                // each thread handles 16 values, use 1 block/row
-                // save, becouse step is actually can't be less 512 bytes
-                int block = integral.cols / 16;
-
-                // launch 1 block / row
-                const int grid = img.rows;
-
-                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
-
-                shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            {
-                const dim3 block(32, 8);
-                const dim3 grid(divUp(integral.cols, block.x), 1);
-
-                shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
-        {
-        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
-            __shared__ unsigned int sums[32][9];
-
-            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
-            const int lane_id = tidx % 8;
-
-            if (tidx >= integral.cols)
-                return;
-
-            sums[threadIdx.x][threadIdx.y] = 0;
-            __syncthreads();
-
-            unsigned int stepSum = 0;
-
-            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
-            {
-                unsigned int* p = buffer.ptr(y) + tidx;
-                unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
-
-                unsigned int sum = *p;
-
-                sums[threadIdx.x][threadIdx.y] = sum;
-                __syncthreads();
-
-                // place into SMEM
-                // shfl scan reduce the SMEM, reformating so the column
-                // sums are computed in a warp
-                // then read out properly
-                const int j = threadIdx.x % 8;
-                const int k = threadIdx.x / 8 + threadIdx.y * 4;
-
-                int partial_sum = sums[k][j];
-
-                for (int i = 1; i <= 8; i *= 2)
-                {
-                    int n = __shfl_up(partial_sum, i, 32);
-
-                    if (lane_id >= i)
-                        partial_sum += n;
-                }
-
-                sums[k][j] = partial_sum;
-                __syncthreads();
-
-                if (threadIdx.y > 0)
-                    sum += sums[threadIdx.x][threadIdx.y - 1];
-
-                sum += stepSum;
-                stepSum += sums[threadIdx.x][blockDim.y - 1];
-
-                __syncthreads();
-
-                *dst = sum;
-            }
-        #endif
-        }
-
-        // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
-        void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
-            int blockStep, cudaStream_t stream)
-        {
-            {
-                const int block = blockStep;
-                const int grid = img.rows;
-
-                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
-
-                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            {
-                const dim3 block(32, 8);
-                const dim3 grid(divUp(integral.cols, block.x), 1);
-
-                shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        }
-    }
-}}}
-
+#include "opencv2/opencv_modules.hpp"
+
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+////////////////////////////////////////////////////////////////////////
+// integral
+
+void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8UC1 );
+
+    GpuMat_<int>& res = (GpuMat_<int>&) buffer;
+
+    gridIntegral(globPtr<uchar>(src), res, stream);
+
+    _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
+    GpuMat dst = _dst.getGpuMat();
+
+    dst.setTo(Scalar::all(0), stream);
+
+    GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
+    res.copyTo(inner, stream);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// sqrIntegral
+
+void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8UC1 );
+
+    GpuMat_<double>& res = (GpuMat_<double>&) buf;
+
+    gridIntegral(sqr_(cvt_<int>(globPtr<uchar>(src))), res, stream);
+
+    _dst.create(src.rows + 1, src.cols + 1, CV_64FC1);
+    GpuMat dst = _dst.getGpuMat();
+
+    dst.setTo(Scalar::all(0), stream);
+
+    GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
+    res.copyTo(inner, stream);
+}
+
+#endif
+
 #endif /* CUDA_DISABLER */
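Everything deleted above is built on one idiom, the warp shuffle scan the kernel comments describe: each lane pulls the partial sum from the lane i positions below it with __shfl_up and adds it in, doubling i each round, so after log2(warpSize) rounds every lane holds an inclusive prefix sum without touching shared memory. As a reading aid (not part of the commit), here is a minimal standalone sketch of that idiom, written against the same pre-CUDA-9 __shfl_up the kernels use (CUDA 9+ replaces it with __shfl_up_sync plus a lane mask):

    // Warp-level inclusive prefix sum via shuffle, the core idiom of the
    // removed shfl_integral_* kernels. Launch as scan_warp<<<1, 32>>>(d_data).
    __global__ void scan_warp(int* data)
    {
        const int lane_id = threadIdx.x % warpSize;
        int value = data[threadIdx.x];

        #pragma unroll
        for (int i = 1; i < warpSize; i *= 2)
        {
            // read `value` from the lane `i` positions below this one
            const int n = __shfl_up(value, i, 32);
            if (lane_id >= i)
                value += n;
        }

        data[threadIdx.x] = value;  // data[k] == original data[0] + ... + data[k]
    }

The replacement code no longer spells any of this out: gridIntegral in the cudev device layer owns the scan strategy, which is what shrinks this file from 408 deleted lines of hand-tuned kernels to 36 added lines of glue.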
modules/cudaarithm/src/reductions.cpp (view file @ 7839dbd2)
@@ -294,116 +294,4 @@ void cv::cuda::normalize(InputArray _src, OutputArray dst, double a, double b, i
     }
 }
-
-////////////////////////////////////////////////////////////////////////
-// integral
-
-namespace cv { namespace cuda { namespace device
-{
-    namespace imgproc
-    {
-        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
-    }
-}}}
-
-void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& _stream)
-{
-    GpuMat src = _src.getGpuMat();
-
-    CV_Assert( src.type() == CV_8UC1 );
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    cv::Size whole;
-    cv::Point offset;
-
-    src.locateROI(whole, offset);
-
-    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
-        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
-    {
-        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
-
-        cv::cuda::device::imgproc::shfl_integral_gpu(src, buffer, stream);
-
-        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        GpuMat dst = _dst.getGpuMat();
-
-        dst.setTo(Scalar::all(0), _stream);
-
-        GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
-        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
-
-        res.copyTo(inner, _stream);
-    }
-    else
-    {
-    #ifndef HAVE_OPENCV_CUDALEGACY
-        throw_no_cuda();
-    #else
-        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        GpuMat dst = _dst.getGpuMat();
-
-        NcvSize32u roiSize;
-        roiSize.width = src.cols;
-        roiSize.height = src.rows;
-
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
-
-        Ncv32u bufSize;
-        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
-
-        NppStStreamHandler h(stream);
-
-        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    #endif
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// sqrIntegral
-
-void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& _stream)
-{
-#ifndef HAVE_OPENCV_CUDALEGACY
-    (void) _src;
-    (void) _dst;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    GpuMat src = _src.getGpuMat();
-
-    CV_Assert( src.type() == CV_8U );
-
-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
-
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
-
-    Ncv32u bufSize;
-    ncvSafeCall( nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop) );
-    ensureSizeIsEnough(1, bufSize, CV_8U, buf);
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    NppStStreamHandler h(stream);
-
-    _dst.create(src.rows + 1, src.cols + 1, CV_64F);
-    GpuMat dst = _dst.getGpuMat();
-
-    ncvSafeCall( nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
-        dst.ptr<Ncv64u>(0), static_cast<int>(dst.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop) );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-#endif
-}
 
 #endif
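Two details of the removed dispatch are worth noting while reading it: the warp-shuffle path only ran when the device reported WARP_SHUFFLE_FUNCTIONS, the image was at most 2048 columns wide, and the ROI x-offset was 16-byte aligned with the padded row width fitting inside the allocation; everything else fell back to the NPP-based cudalegacy path. The buffer sizing uses the usual integer round-up-to-multiple idiom, restated here as a generic helper (an illustration, not OpenCV API):

    // ((x + n - 1) / n) * n rounds x up to the next multiple of n,
    // matching the removed code's ((src.cols + 63) / 64) * 64.
    static inline int roundUpToMultiple(int x, int n)
    {
        return ((x + n - 1) / n) * n;   // roundUpToMultiple(1000, 64) == 1024
    }

The new cudev-based implementation drops this per-device branching entirely and routes every input through gridIntegral.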
modules/cudaarithm/test/test_arithm.cpp (view file @ 7839dbd2)
@@ -125,43 +125,6 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, GEMM, testing::Combine(
     ALL_GEMM_FLAGS,
     WHOLE_SUBMAT));
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Integral
-
-PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi)
-{
-    cv::cuda::DeviceInfo devInfo;
-    cv::Size size;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-
-        cv::cuda::setDevice(devInfo.deviceID());
-    }
-};
-
-CUDA_TEST_P(Integral, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_8UC1);
-
-    cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
-    cv::cuda::integral(loadMat(src, useRoi), dst);
-
-    cv::Mat dst_gold;
-    cv::integral(src, dst_gold, CV_32S);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    WHOLE_SUBMAT));
 
 ////////////////////////////////////////////////////////////////////////////
 // MulSpectrums
modules/cudaarithm/test/test_reductions.cpp (view file @ 7839dbd2)
@@ -816,4 +816,78 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine(
     DIFFERENT_SIZES,
     WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Integral
+
+PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+CUDA_TEST_P(Integral, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_8UC1);
+
+    cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
+    cv::cuda::integral(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::integral(src, dst_gold, CV_32S);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// IntegralSqr
+
+PARAM_TEST_CASE(IntegralSqr, cv::cuda::DeviceInfo, cv::Size, UseRoi)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+CUDA_TEST_P(IntegralSqr, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_8UC1);
+
+    cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_64FC1, useRoi);
+    cv::cuda::sqrIntegral(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold, temp;
+    cv::integral(src, temp, dst_gold);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Arithm, IntegralSqr, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    WHOLE_SUBMAT));
 
 #endif // HAVE_CUDA
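The IntegralSqr test takes its gold standard from the CPU overload cv::integral(src, sum, sqsum), whose second output is the integral of squared pixel values, exactly what cv::cuda::sqrIntegral produces (CV_64F, sized (rows+1) x (cols+1)). A self-contained sketch of the same comparison outside the gtest harness (illustrative sizes; assumes a CUDA device is available):

    #include <iostream>
    #include "opencv2/core.hpp"
    #include "opencv2/imgproc.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::Mat src(480, 640, CV_8UC1);
        cv::randu(src, 0, 256);

        // CPU gold standard: the sqsum output of cv::integral
        cv::Mat sum, sqsum;
        cv::integral(src, sum, sqsum);          // sqsum: CV_64F, (rows+1) x (cols+1)

        // GPU result
        cv::cuda::GpuMat d_src(src), d_sqsum;
        cv::cuda::sqrIntegral(d_src, d_sqsum);

        cv::Mat gpu_sqsum;
        d_sqsum.download(gpu_sqsum);

        std::cout << "max abs diff: " << cv::norm(sqsum, gpu_sqsum, cv::NORM_INF) << "\n";
    }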