Commit 09bcc061 authored by vbystricky

Change kernel for optimization. Remove restriction to align data

Fix kernel compilation errors on AMD system

Fix license information in cl file

Support CV_64F destination type

Change build options of the kernel

Optimize sum of square

Remove separate kernel for integral square

Increase epsilon for performance tests

Increase epsilon for performance tests

Test double support on AMD devices

Fix some issues

Try to fix problems with AMD device

Try to solve problem with AMD device

Fix error of destination size in kernel

Fix warnings
parent 01a98fae
...@@ -231,7 +231,7 @@ OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, O ...@@ -231,7 +231,7 @@ OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, O
OCL_TEST_CYCLE() cv::integral(src, dst, ddepth); OCL_TEST_CYCLE() cv::integral(src, dst, ddepth);
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); SANITY_CHECK(dst, 2e-6, ERROR_RELATIVE);
} }
OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F))) OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F)))
...@@ -243,11 +243,11 @@ OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, O ...@@ -243,11 +243,11 @@ OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, O
checkDeviceMaxMemoryAllocSize(srcSize, ddepth); checkDeviceMaxMemoryAllocSize(srcSize, ddepth);
UMat src(srcSize, CV_8UC1), sum(srcSize + Size(1, 1), ddepth), sqsum(srcSize + Size(1, 1), CV_32F); UMat src(srcSize, CV_8UC1), sum(srcSize + Size(1, 1), ddepth), sqsum(srcSize + Size(1, 1), CV_32F);
declare.in(src, WARMUP_RNG).out(sum).out(sqsum); declare.in(src, WARMUP_RNG).out(sum, sqsum);
OCL_TEST_CYCLE() cv::integral(src, sum, sqsum, ddepth, CV_32F); OCL_TEST_CYCLE() cv::integral(src, sum, sqsum, ddepth, CV_32F);
SANITY_CHECK(sum, 1e-6, ERROR_RELATIVE); SANITY_CHECK(sum, 2e-4, ERROR_RELATIVE);
SANITY_CHECK(sqsum, 5e-5, ERROR_RELATIVE); SANITY_CHECK(sqsum, 5e-5, ERROR_RELATIVE);
} }
......
This diff is collapsed.
This diff is collapsed.
...@@ -235,97 +235,87 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_ ...@@ -235,97 +235,87 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
enum { vlen = 4 };
static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth ) static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
{ {
if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 || bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
!(sdepth == CV_32S || sdepth == CV_32F) )
return false;
ocl::Kernel k1("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, if ( (_src.type() != CV_8UC1) ||
format("-D sdepth=%d", sdepth)); !(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
if (k1.empty())
return false; return false;
Size size = _src.size(), t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width), static const int tileSize = 16;
ssize(size.width + 1, size.height + 1);
_sum.create(ssize, sdepth);
UMat src = _src.getUMat(), t_sum(t_size, sdepth), sum = _sum.getUMat();
t_sum = t_sum(Range::all(), Range(0, size.height));
int offset = (int)src.offset / vlen; String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
int vcols = (src.cols + vlen - 1) / vlen; ocl::typeToStr(sdepth), tileSize,
int sum_offset = (int)sum.offset / vlen; doubleSupport ? " -D DOUBLE_SUPPORT" : "");
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum), ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
offset, src.rows, src.cols, (int)src.step, (int)t_sum.step); if (kcols.empty())
size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
if (!k1.run(1, &gt, &lt, false))
return false; return false;
ocl::Kernel k2("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, UMat src = _src.getUMat();
format("-D sdepth=%d", sdepth)); Size src_size = src.size();
k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrWriteOnly(sum), Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
t_sum.rows, t_sum.cols, (int)t_sum.step, (int)sum.step, sum_offset); UMat buf(bufsize, sdepth);
kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
size_t gt = src.cols, lt = tileSize;
if (!kcols.run(1, &gt, &lt, false))
return false;
ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (krows.empty())
return false;
size_t gt2 = t_sum.cols * 32, lt2 = 256; Size sumsize(src_size.width + 1, src_size.height + 1);
return k2.run(1, &gt2, &lt2, false); _sum.create(sumsize, sdepth);
UMat sum = _sum.getUMat();
krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
gt = src.rows;
return krows.run(1, &gt, &lt, false);
} }
static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth ) static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
{ {
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 || if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
(!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
return false; return false;
char cvt[40]; static const int tileSize = 16;
String opts = format("-D sdepth=%d -D sqdepth=%d -D TYPE=%s -D TYPE4=%s4 -D convert_TYPE4=%s%s",
sdepth, sqdepth, ocl::typeToStr(sqdepth), ocl::typeToStr(sqdepth), String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
ocl::convertTypeStr(sdepth, sqdepth, 4, cvt), ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
tileSize,
doubleSupport ? " -D DOUBLE_SUPPORT" : ""); doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k1("integral_cols", ocl::imgproc::integral_sqrsum_oclsrc, opts); ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (k1.empty()) if (kcols.empty())
return false; return false;
Size size = _src.size(), dsize = Size(size.width + 1, size.height + 1), UMat src = _src.getUMat();
t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width); Size src_size = src.size();
UMat src = _src.getUMat(), t_sum(t_size, sdepth), t_sqsum(t_size, sqdepth); Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
t_sum = t_sum(Range::all(), Range(0, size.height)); UMat buf(bufsize, sdepth);
t_sqsum = t_sqsum(Range::all(), Range(0, size.height)); UMat buf_sq(bufsize, sqdepth);
kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq));
_sum.create(dsize, sdepth); size_t gt = src.cols, lt = tileSize;
_sqsum.create(dsize, sqdepth); if (!kcols.run(1, &gt, &lt, false))
UMat sum = _sum.getUMat(), sqsum = _sqsum.getUMat();
int offset = (int)src.offset / vlen;
int pre_invalid = src.offset % vlen;
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
int sum_offset = (int)(sum.offset / sum.elemSize());
int sqsum_offset = (int)(sqsum.offset / sqsum.elemSize());
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
ocl::KernelArg::PtrWriteOnly(t_sqsum), offset, pre_invalid, src.rows,
src.cols, (int)src.step, (int)t_sum.step, (int)t_sqsum.step);
size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
if (!k1.run(1, &gt, &lt, false))
return false; return false;
ocl::Kernel k2("integral_rows", ocl::imgproc::integral_sqrsum_oclsrc, opts); ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (k2.empty()) if (krows.empty())
return false; return false;
k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrReadOnly(t_sqsum), Size sumsize(src_size.width + 1, src_size.height + 1);
ocl::KernelArg::PtrWriteOnly(sum), ocl::KernelArg::PtrWriteOnly(sqsum), _sum.create(sumsize, sdepth);
t_sum.rows, t_sum.cols, (int)t_sum.step, (int)t_sqsum.step, UMat sum = _sum.getUMat();
(int)sum.step, (int)sqsum.step, sum_offset, sqsum_offset); _sqsum.create(sumsize, sqdepth);
UMat sum_sq = _sqsum.getUMat();
size_t gt2 = t_sum.cols * 32, lt2 = 256; krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq));
return k2.run(1, &gt2, &lt2, false); gt = src.rows;
return krows.run(1, &gt, &lt, false);
} }
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment