Commit da5aaab2 authored by Vladislav Vinogradov

optimized gpu::integral for Kepler

parent 92795ba4
......@@ -72,9 +72,11 @@ namespace cv { namespace gpu
FEATURE_SET_COMPUTE_13 = 13,
FEATURE_SET_COMPUTE_20 = 20,
FEATURE_SET_COMPUTE_21 = 21,
FEATURE_SET_COMPUTE_30 = 30,
GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13
NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
};
// Gives information about what GPU archs this OpenCV GPU module was
......
This diff is collapsed.
......@@ -533,11 +533,65 @@ void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
integralBuffered(src, sum, buffer, s);
}
// Forward declaration of the device-side integral routine that
// cv::gpu::integralBuffered calls on its WARP_SHUFFLE_FUNCTIONS path.
// Only the declaration is visible here; the definition lives elsewhere.
namespace cv
{
    namespace gpu
    {
        namespace device
        {
            namespace imgproc
            {
                // img      - source image; the caller in this file passes a CV_8UC1 GpuMat
                //            whose column count is a multiple of 16.
                // integral - destination view; the caller passes the interior of the sum
                //            matrix, offset by (1, 1).
                // stream   - CUDA stream handle obtained from StreamAccessor::getStream.
                void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream);
            }
        }
    }
}
// Computes the integral image of an 8-bit single-channel `src` into `sum`,
// using `buffer` as scratch storage and `s` as the stream.
//
// Two paths are visible:
//   * WARP_SHUFFLE_FUNCTIONS supported: the source is padded to a multiple of
//     16 columns if necessary and handed to imgproc::shfl_integral_gpu.
//   * otherwise: the NCV/NPP-staging route (nppiStIntegralGetSize_8u32u, ...).
//
// NOTE(review): this listing comes from a rendered diff. The "......@@" lines
// are hunk markers and statements between hunks are not shown, so the else
// branch below is only partially visible.
void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
{
// Only CV_8UC1 input is accepted.
CV_Assert(src.type() == CV_8UC1);
// NOTE(review): with `&&` this reallocates only when BOTH dimensions differ;
// a size check would normally use `||`. Both branches below call sum.create()
// again unconditionally, so this pre-check looks redundant -- possibly a line
// the commit removed (the diff's +/- markers are not visible). Confirm.
if (sum.cols != src.cols + 1 && sum.rows != src.rows + 1)
sum.create(src.rows + 1, src.cols + 1, CV_32S);
cudaStream_t stream = StreamAccessor::getStream(s);
// Default-constructed DeviceInfo; presumably describes the current device -- TODO confirm.
DeviceInfo info;
if (info.supports(WARP_SHUFFLE_FUNCTIONS))
{
// src16 is the image actually passed to shfl_integral_gpu; its width is a multiple of 16.
GpuMat src16;
if (src.cols % 16 == 0)
src16 = src;
else
{
// Round the width up to the next multiple of 16 and stage the source in `buffer`.
ensureSizeIsEnough(src.rows, ((src.cols + 15) / 16) * 16, src.type(), buffer);
// View of the top-left src-sized region of the staging buffer.
GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
// When `s` tests true the work is enqueued on the stream; otherwise the
// GpuMat member functions are used directly.
if (s)
{
s.enqueueMemSet(buffer, Scalar::all(0));
s.enqueueCopy(src, inner);
}
else
{
buffer.setTo(Scalar::all(0));
src.copyTo(inner);
}
// NOTE(review): ensureSizeIsEnough may leave `buffer` larger than requested if it
// was already big enough (assumption from its name) -- then src16 would be larger
// than the padded size. Confirm against its implementation.
src16 = buffer;
}
// Output is one row and one column larger than the (possibly padded) input.
sum.create(src16.rows + 1, src16.cols + 1, CV_32SC1);
// Zero the whole output; the call below only receives the interior view.
if (s)
s.enqueueMemSet(sum, Scalar::all(0));
else
sum.setTo(Scalar::all(0));
// Interior of `sum`, skipping row 0 and column 0.
GpuMat inner = sum(Rect(1, 1, src16.cols, src16.rows));
cv::gpu::device::imgproc::shfl_integral_gpu(src16, inner, stream);
// If padding was applied, replace `sum` with a view cropped to (src.rows + 1) x (src.cols + 1).
// The caller's `sum` then refers to a sub-region of the larger allocation.
if (src16.cols != src.cols)
sum = sum(Rect(0, 0, src.cols + 1, src.rows + 1));
}
else
{
// Fallback path: output sized directly from the source.
sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
NcvSize32u roiSize;
roiSize.width = src.cols;
......@@ -550,7 +604,6 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
// Query the scratch size and make sure `buffer` can hold it.
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
// NOTE(review): this redeclares `stream`, shadowing the one obtained near the top
// of the function. The hunk header shrinks from 7 to 6 lines, so this may be the
// line the commit deleted -- confirm against the real file.
cudaStream_t stream = StreamAccessor::getStream(s);
NppStStreamHandler h(stream);
......@@ -559,6 +612,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
// On the default stream (handle 0), block until the device has finished.
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment