Commit 3997514b authored by Alexey Spizhevoy

Added tests for gpu::sum; it supports all data types, but only single-channel images.

parent 442cd75c
...@@ -421,9 +421,12 @@ namespace cv ...@@ -421,9 +421,12 @@ namespace cv
CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode); CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode);
//! computes sum of array elements //! computes sum of array elements
//! supports CV_8UC1, CV_8UC4 types //! supports only single channel images
//! disabled until fix crash CV_EXPORTS Scalar sum(const GpuMat& src);
CV_EXPORTS Scalar sum(const GpuMat& m);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values //! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat()); CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
......
...@@ -65,6 +65,7 @@ double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; } ...@@ -65,6 +65,7 @@ double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; } double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); } void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); } Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); } void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); } void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
...@@ -480,36 +481,50 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode) ...@@ -480,36 +481,50 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// sum // sum
Scalar cv::gpu::sum(const GpuMat& src) namespace cv { namespace gpu { namespace mathfunc
{ {
CV_Assert(!"disabled until fix crash"); template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum);
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4); template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
NppiSize sz; namespace sum
sz.width = src.cols; {
sz.height = src.rows; void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
}
}}}
Scalar res; Scalar cv::gpu::sum(const GpuMat& src)
{
GpuMat buf;
return sum(src, buf);
}
int bufsz; Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
CV_Assert(src.channels() == 1);
if (src.type() == CV_8UC1) typedef void (*Caller)(const DevMem2D, PtrStep, double*);
{ static const Caller callers[2][7] =
nppiReductionGetBufferHostSize_8u_C1R(sz, &bufsz); { { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
GpuMat buf(1, bufsz, CV_32S); sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
{ sum_caller<unsigned char>, sum_caller<char>,
sum_caller<unsigned short>, sum_caller<short>,
sum_caller<int>, sum_caller<float>, sum_caller<double> } };
nppSafeCall( nppiSum_8u_C1R(src.ptr<Npp8u>(), src.step, sz, buf.ptr<Npp32s>(), res.val) ); Size bufSize;
} sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
else buf.create(bufSize, CV_8U);
{
nppiReductionGetBufferHostSize_8u_C4R(sz, &bufsz);
GpuMat buf(1, bufsz, CV_32S);
nppSafeCall( nppiSum_8u_C4R(src.ptr<Npp8u>(), src.step, sz, buf.ptr<Npp32s>(), res.val) ); Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
} if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
return res; double result;
caller(src, buf, &result);
return result;
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
......
...@@ -1419,6 +1419,15 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1419,6 +1419,15 @@ namespace cv { namespace gpu { namespace mathfunc
namespace sum namespace sum
{ {
template <typename T> struct SumType {};
template <> struct SumType<unsigned char> { typedef unsigned int R; };
template <> struct SumType<char> { typedef int R; };
template <> struct SumType<unsigned short> { typedef unsigned int R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
__constant__ int ctwidth; __constant__ int ctwidth;
__constant__ int ctheight; __constant__ int ctheight;
__device__ unsigned int blocks_finished = 0; __device__ unsigned int blocks_finished = 0;
...@@ -1436,12 +1445,11 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1436,12 +1445,11 @@ namespace cv { namespace gpu { namespace mathfunc
} }
template <typename T>
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows) void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid); estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(T); bufcols = grid.x * grid.y * sizeof(double);
bufrows = 1; bufrows = 1;
} }
...@@ -1454,17 +1462,17 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1454,17 +1462,17 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
} }
template <typename T, int nthreads> template <typename T, typename R, int nthreads>
__global__ void sum_kernel(const DevMem2D_<T> src, T* result) __global__ void sum_kernel(const DevMem2D_<T> src, R* result)
{ {
__shared__ T smem[nthreads]; __shared__ R smem[nthreads];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x; const int bid = blockIdx.y * gridDim.x + blockIdx.x;
T sum = 0; R sum = 0;
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{ {
const T* ptr = src.ptr(y0 + y * blockDim.y); const T* ptr = src.ptr(y0 + y * blockDim.y);
...@@ -1475,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1475,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid] = sum; smem[tid] = sum;
__syncthreads(); __syncthreads();
sum_in_smem<nthreads, T>(smem, tid); sum_in_smem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last; __shared__ bool is_last;
...@@ -1496,7 +1504,7 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1496,7 +1504,7 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0; smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads(); __syncthreads();
sum_in_smem<nthreads, T>(smem, tid); sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0) if (tid == 0)
{ {
...@@ -1510,14 +1518,16 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1510,14 +1518,16 @@ namespace cv { namespace gpu { namespace mathfunc
} }
template <typename T, int nthreads> template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel(T* result, int size) __global__ void sum_pass2_kernel(R* result, int size)
{ {
__shared__ T smem[nthreads]; __shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x; int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = tid < size ? result[tid] : 0; smem[tid] = tid < size ? result[tid] : 0;
sum_in_smem<nthreads, T>(smem, tid); __syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0) if (tid == 0)
result[0] = smem[0]; result[0] = smem[0];
...@@ -1527,60 +1537,61 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1527,60 +1537,61 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T> template <typename T>
T sum_multipass_caller(const DevMem2D_<T> src, PtrStep buf) void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
{ {
using namespace sum; using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* buf_ = (T*)buf.ptr(0); R* buf_ = (R*)buf.ptr(0);
sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_); sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
sum_pass2_kernel<T, threads_x * threads_y><<<1, threads_x * threads_y>>>( sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
buf_, grid.x * grid.y); buf_, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
T sum; R result = 0;
cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&result, buf_, result, cudaMemcpyDeviceToHost));
sum[0] = result;
return sum;
} }
template unsigned char sum_multipass_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep); template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
template char sum_multipass_caller<char>(const DevMem2D_<char>, PtrStep); template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
template unsigned short sum_multipass_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep); template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
template short sum_multipass_caller<short>(const DevMem2D_<short>, PtrStep); template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
template int sum_multipass_caller<int>(const DevMem2D_<int>, PtrStep); template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
template float sum_multipass_caller<float>(const DevMem2D_<float>, PtrStep); template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
template <typename T> template <typename T>
T sum_caller(const DevMem2D_<T> src, PtrStep buf) void sum_caller(const DevMem2D src, PtrStep buf, double* sum)
{ {
using namespace sum; using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* buf_ = (T*)buf.ptr(0); R* buf_ = (R*)buf.ptr(0);
sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_); sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
T sum; R result = 0;
cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
sum[0] = result;
return sum;
} }
template unsigned char sum_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep); template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
template char sum_caller<char>(const DevMem2D_<char>, PtrStep); template void sum_caller<char>(const DevMem2D, PtrStep, double*);
template unsigned short sum_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep); template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
template short sum_caller<short>(const DevMem2D_<short>, PtrStep); template void sum_caller<short>(const DevMem2D, PtrStep, double*);
template int sum_caller<int>(const DevMem2D_<int>, PtrStep); template void sum_caller<int>(const DevMem2D, PtrStep, double*);
template float sum_caller<float>(const DevMem2D_<float>, PtrStep); template void sum_caller<float>(const DevMem2D, PtrStep, double*);
template double sum_caller<double>(const DevMem2D_<double>, PtrStep); template void sum_caller<double>(const DevMem2D, PtrStep, double*);
}}} }}}
...@@ -458,29 +458,6 @@ struct CV_GpuNppImageFlipTest : public CV_GpuArithmTest ...@@ -458,29 +458,6 @@ struct CV_GpuNppImageFlipTest : public CV_GpuArithmTest
} }
}; };
////////////////////////////////////////////////////////////////////////////////
// sum
struct CV_GpuNppImageSumTest : public CV_GpuArithmTest
{
CV_GpuNppImageSumTest() : CV_GpuArithmTest( "GPU-NppImageSum", "sum" ) {}
int test( const Mat& mat1, const Mat& )
{
if (mat1.type() != CV_8UC1 && mat1.type() != CV_8UC4)
{
ts->printf(CvTS::LOG, "\tUnsupported type\t");
return CvTS::OK;
}
Scalar cpures = cv::sum(mat1);
GpuMat gpu1(mat1);
Scalar gpures = cv::gpu::sum(gpu1);
return CheckNorm(cpures, gpures);
}
};
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// LUT // LUT
struct CV_GpuNppImageLUTTest : public CV_GpuArithmTest struct CV_GpuNppImageLUTTest : public CV_GpuArithmTest
...@@ -949,27 +926,49 @@ struct CV_GpuCountNonZeroTest: CvTest ...@@ -949,27 +926,49 @@ struct CV_GpuCountNonZeroTest: CvTest
} }
}; };
////////////////////////////////////////////////////////////////////////////////
// min/max
struct CV_GpuImageMinMaxTest : public CV_GpuArithmTest //////////////////////////////////////////////////////////////////////////////
// sum
struct CV_GpuSumTest: CvTest
{ {
CV_GpuImageMinMaxTest() : CV_GpuArithmTest( "GPU-ImageMinMax", "min/max" ) {} CV_GpuSumTest(): CvTest("GPU-SumTest", "sum") {}
int test( const Mat& mat1, const Mat& mat2 ) void run(int)
{ {
cv::Mat cpuMinRes, cpuMaxRes; try
cv::min(mat1, mat2, cpuMinRes); {
cv::max(mat1, mat2, cpuMaxRes); Mat src;
Scalar a, b;
double max_err = 1e-6;
GpuMat gpu1(mat1); int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
GpuMat gpu2(mat2); for (int type = CV_8U; type <= typemax; ++type)
GpuMat gpuMinRes, gpuMaxRes; {
cv::gpu::min(gpu1, gpu2, gpuMinRes); gen(1 + rand() % 1000, 1 + rand() % 1000, type, src);
cv::gpu::max(gpu1, gpu2, gpuMaxRes); a = sum(src);
b = sum(GpuMat(src));
if (abs(a[0] - b[0]) > src.size().area() * max_err)
{
ts->printf(CvTS::CONSOLE, "cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[0], b[0]);
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
}
}
catch (const Exception& e)
{
if (!check_and_treat_gpu_exception(e, ts)) throw;
return;
}
}
void gen(int cols, int rows, int type, Mat& m)
{
m.create(rows, cols, type);
RNG rng;
rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(20));
return CheckNorm(cpuMinRes, gpuMinRes) == CvTS::OK && CheckNorm(cpuMaxRes, gpuMaxRes) == CvTS::OK ?
CvTS::OK : CvTS::FAIL_GENERIC;
} }
}; };
...@@ -992,7 +991,6 @@ CV_GpuNppImageCompareTest CV_GpuNppImageCompare_test; ...@@ -992,7 +991,6 @@ CV_GpuNppImageCompareTest CV_GpuNppImageCompare_test;
CV_GpuNppImageMeanStdDevTest CV_GpuNppImageMeanStdDev_test; CV_GpuNppImageMeanStdDevTest CV_GpuNppImageMeanStdDev_test;
CV_GpuNppImageNormTest CV_GpuNppImageNorm_test; CV_GpuNppImageNormTest CV_GpuNppImageNorm_test;
CV_GpuNppImageFlipTest CV_GpuNppImageFlip_test; CV_GpuNppImageFlipTest CV_GpuNppImageFlip_test;
CV_GpuNppImageSumTest CV_GpuNppImageSum_test;
CV_GpuNppImageLUTTest CV_GpuNppImageLUT_test; CV_GpuNppImageLUTTest CV_GpuNppImageLUT_test;
CV_GpuNppImageExpTest CV_GpuNppImageExp_test; CV_GpuNppImageExpTest CV_GpuNppImageExp_test;
CV_GpuNppImageLogTest CV_GpuNppImageLog_test; CV_GpuNppImageLogTest CV_GpuNppImageLog_test;
...@@ -1003,4 +1001,4 @@ CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test; ...@@ -1003,4 +1001,4 @@ CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test;
CV_GpuMinMaxTest CV_GpuMinMaxTest_test; CV_GpuMinMaxTest CV_GpuMinMaxTest_test;
CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test; CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test;
CV_GpuCountNonZeroTest CV_CountNonZero_test; CV_GpuCountNonZeroTest CV_CountNonZero_test;
CV_GpuImageMinMaxTest CV_GpuImageMinMax_test; CV_GpuSumTest CV_GpuSum_test;
...@@ -46,9 +46,6 @@ CvTS test_system("gpu"); ...@@ -46,9 +46,6 @@ CvTS test_system("gpu");
const char* blacklist[] = const char* blacklist[] =
{ {
"GPU-AsyncGpuMatOperator", // crash "GPU-AsyncGpuMatOperator", // crash
"GPU-NppImageSum", // crash, probably npp bug
"GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
0 0
}; };
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment