Commit e0584bb8 authored by Alexander Alekhin

Merge pull request #2876 from vbystricky:oclopt_integralsum

parents 1493160f 09bcc061
...@@ -231,7 +231,7 @@ OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, O ...@@ -231,7 +231,7 @@ OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, O
OCL_TEST_CYCLE() cv::integral(src, dst, ddepth); OCL_TEST_CYCLE() cv::integral(src, dst, ddepth);
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); SANITY_CHECK(dst, 2e-6, ERROR_RELATIVE);
} }
OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F))) OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F)))
...@@ -243,11 +243,11 @@ OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, O ...@@ -243,11 +243,11 @@ OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, O
checkDeviceMaxMemoryAllocSize(srcSize, ddepth); checkDeviceMaxMemoryAllocSize(srcSize, ddepth);
UMat src(srcSize, CV_8UC1), sum(srcSize + Size(1, 1), ddepth), sqsum(srcSize + Size(1, 1), CV_32F); UMat src(srcSize, CV_8UC1), sum(srcSize + Size(1, 1), ddepth), sqsum(srcSize + Size(1, 1), CV_32F);
declare.in(src, WARMUP_RNG).out(sum).out(sqsum); declare.in(src, WARMUP_RNG).out(sum, sqsum);
OCL_TEST_CYCLE() cv::integral(src, sum, sqsum, ddepth, CV_32F); OCL_TEST_CYCLE() cv::integral(src, sum, sqsum, ddepth, CV_32F);
SANITY_CHECK(sum, 1e-6, ERROR_RELATIVE); SANITY_CHECK(sum, 2e-4, ERROR_RELATIVE);
SANITY_CHECK(sqsum, 5e-5, ERROR_RELATIVE); SANITY_CHECK(sqsum, 5e-5, ERROR_RELATIVE);
} }
......
This diff is collapsed.
This diff is collapsed.
...@@ -235,97 +235,87 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_ ...@@ -235,97 +235,87 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
enum { vlen = 4 };
static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth ) static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
{ {
if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 || bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
!(sdepth == CV_32S || sdepth == CV_32F) )
return false;
ocl::Kernel k1("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, if ( (_src.type() != CV_8UC1) ||
format("-D sdepth=%d", sdepth)); !(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
if (k1.empty())
return false; return false;
Size size = _src.size(), t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width), static const int tileSize = 16;
ssize(size.width + 1, size.height + 1);
_sum.create(ssize, sdepth); String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
UMat src = _src.getUMat(), t_sum(t_size, sdepth), sum = _sum.getUMat(); ocl::typeToStr(sdepth), tileSize,
t_sum = t_sum(Range::all(), Range(0, size.height)); doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (kcols.empty())
return false;
int offset = (int)src.offset / vlen; UMat src = _src.getUMat();
int vcols = (src.cols + vlen - 1) / vlen; Size src_size = src.size();
int sum_offset = (int)sum.offset / vlen; Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
UMat buf(bufsize, sdepth);
kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
size_t gt = src.cols, lt = tileSize;
if (!kcols.run(1, &gt, &lt, false))
return false;
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum), ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
offset, src.rows, src.cols, (int)src.step, (int)t_sum.step); if (krows.empty())
size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
if (!k1.run(1, &gt, &lt, false))
return false; return false;
ocl::Kernel k2("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, Size sumsize(src_size.width + 1, src_size.height + 1);
format("-D sdepth=%d", sdepth)); _sum.create(sumsize, sdepth);
k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrWriteOnly(sum), UMat sum = _sum.getUMat();
t_sum.rows, t_sum.cols, (int)t_sum.step, (int)sum.step, sum_offset);
size_t gt2 = t_sum.cols * 32, lt2 = 256; krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
return k2.run(1, &gt2, &lt2, false); gt = src.rows;
return krows.run(1, &gt, &lt, false);
} }
static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth ) static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
{ {
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 || if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
(!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
return false;
char cvt[40];
String opts = format("-D sdepth=%d -D sqdepth=%d -D TYPE=%s -D TYPE4=%s4 -D convert_TYPE4=%s%s",
sdepth, sqdepth, ocl::typeToStr(sqdepth), ocl::typeToStr(sqdepth),
ocl::convertTypeStr(sdepth, sqdepth, 4, cvt),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k1("integral_cols", ocl::imgproc::integral_sqrsum_oclsrc, opts);
if (k1.empty())
return false; return false;
Size size = _src.size(), dsize = Size(size.width + 1, size.height + 1), static const int tileSize = 16;
t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width);
UMat src = _src.getUMat(), t_sum(t_size, sdepth), t_sqsum(t_size, sqdepth);
t_sum = t_sum(Range::all(), Range(0, size.height));
t_sqsum = t_sqsum(Range::all(), Range(0, size.height));
_sum.create(dsize, sdepth);
_sqsum.create(dsize, sqdepth);
UMat sum = _sum.getUMat(), sqsum = _sqsum.getUMat();
int offset = (int)src.offset / vlen; String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
int pre_invalid = src.offset % vlen; ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen; tileSize,
int sum_offset = (int)(sum.offset / sum.elemSize()); doubleSupport ? " -D DOUBLE_SUPPORT" : "");
int sqsum_offset = (int)(sqsum.offset / sqsum.elemSize());
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum), ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
ocl::KernelArg::PtrWriteOnly(t_sqsum), offset, pre_invalid, src.rows, if (kcols.empty())
src.cols, (int)src.step, (int)t_sum.step, (int)t_sqsum.step); return false;
size_t gt = ((vcols + 1) / 2) * 256, lt = 256; UMat src = _src.getUMat();
if (!k1.run(1, &gt, &lt, false)) Size src_size = src.size();
Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
UMat buf(bufsize, sdepth);
UMat buf_sq(bufsize, sqdepth);
kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq));
size_t gt = src.cols, lt = tileSize;
if (!kcols.run(1, &gt, &lt, false))
return false; return false;
ocl::Kernel k2("integral_rows", ocl::imgproc::integral_sqrsum_oclsrc, opts); ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (k2.empty()) if (krows.empty())
return false; return false;
k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrReadOnly(t_sqsum), Size sumsize(src_size.width + 1, src_size.height + 1);
ocl::KernelArg::PtrWriteOnly(sum), ocl::KernelArg::PtrWriteOnly(sqsum), _sum.create(sumsize, sdepth);
t_sum.rows, t_sum.cols, (int)t_sum.step, (int)t_sqsum.step, UMat sum = _sum.getUMat();
(int)sum.step, (int)sqsum.step, sum_offset, sqsum_offset); _sqsum.create(sumsize, sqdepth);
UMat sum_sq = _sqsum.getUMat();
size_t gt2 = t_sum.cols * 32, lt2 = 256; krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq));
return k2.run(1, &gt2, &lt2, false); gt = src.rows;
return krows.run(1, &gt, &lt, false);
} }
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment