Commit 640af662 authored by Alexey Spizhevoy's avatar Alexey Spizhevoy

added support of multichannel images into gpu::sqrSum

parent 3db5b687
......@@ -492,10 +492,10 @@ namespace cv { namespace gpu { namespace mathfunc
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum);
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
namespace sum
{
......@@ -543,9 +543,8 @@ Scalar cv::gpu::sqrSum(const GpuMat& src)
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
CV_Assert(src.channels() == 1);
typedef void (*Caller)(const DevMem2D, PtrStep, double*);
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
......@@ -555,15 +554,15 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
sqsum_caller<int>, sqsum_caller<float>, 0 } };
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, 1, bufSize.width, bufSize.height);
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
double result;
caller(src, buf, &result);
return result;
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
......
......@@ -1989,7 +1989,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
......@@ -1998,27 +1998,54 @@ namespace cv { namespace gpu { namespace mathfunc
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result = 0;
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost));
sum[0] = result;
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum)
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
......@@ -2027,20 +2054,42 @@ namespace cv { namespace gpu { namespace mathfunc
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result = 0;
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost));
sum[0] = result;
}
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*);
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
}}}
......@@ -983,6 +983,7 @@ struct CV_GpuSumTest: CvTest
}
if (type != CV_8S)
{
gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 1), src);
b = sqrSum(GpuMat(src));
Mat sqrsrc;
multiply(src, src, sqrsrc);
......@@ -993,6 +994,36 @@ struct CV_GpuSumTest: CvTest
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 2), src);
b = sqrSum(GpuMat(src));
multiply(src, src, sqrsrc);
a = sum(sqrsrc);
if (abs(a[0] - b[0]) + abs(a[1] - b[1])> src.size().area() * max_err * 2)
{
ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 3), src);
b = sqrSum(GpuMat(src));
multiply(src, src, sqrsrc);
a = sum(sqrsrc);
if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2])> src.size().area() * max_err * 3)
{
ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 4), src);
b = sqrSum(GpuMat(src));
multiply(src, src, sqrsrc);
a = sum(sqrsrc);
if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2]) + abs(a[3] - b[3])> src.size().area() * max_err * 4)
{
ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment