Commit 09bcc061 authored by vbystricky

Change kernel for optimization. Remove restriction to align data

Fix kernel compilation errors on AMD system

Fix license information in cl file

Support CV_64F destination type

Change build options of the kernel

Optimize sum of square

Remove separate kernel for integral square

Increase epsilon for performance tests

Increase epsilon for performance tests

Test double support on AMD devices

Fix some issues

Try to fix problems with AMD device

Try to solve problem with AMD device

Fix error of destination size in kernel

Fix warnings
parent 01a98fae
......@@ -231,7 +231,7 @@ OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, O
OCL_TEST_CYCLE() cv::integral(src, dst, ddepth);
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
SANITY_CHECK(dst, 2e-6, ERROR_RELATIVE);
}
OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F)))
......@@ -243,11 +243,11 @@ OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, O
checkDeviceMaxMemoryAllocSize(srcSize, ddepth);
UMat src(srcSize, CV_8UC1), sum(srcSize + Size(1, 1), ddepth), sqsum(srcSize + Size(1, 1), CV_32F);
declare.in(src, WARMUP_RNG).out(sum).out(sqsum);
declare.in(src, WARMUP_RNG).out(sum, sqsum);
OCL_TEST_CYCLE() cv::integral(src, sum, sqsum, ddepth, CV_32F);
SANITY_CHECK(sum, 1e-6, ERROR_RELATIVE);
SANITY_CHECK(sum, 2e-4, ERROR_RELATIVE);
SANITY_CHECK(sqsum, 5e-5, ERROR_RELATIVE);
}
......
This diff is collapsed.
This diff is collapsed.
......@@ -235,97 +235,87 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_
#ifdef HAVE_OPENCL
// Divisor used by the step/offset checks and by the size and offset
// computations in the ocl_integral functions that follow.
enum { vlen = 4 };
// OpenCL path for the single-output integral: runs an "integral_sum_cols"
// kernel followed by an "integral_sum_rows" kernel and returns false whenever
// a precondition fails or a kernel cannot be built or run.
//
// NOTE(review): this listing appears to be a rendered diff in which removed and
// added lines are interleaved without +/- markers (for example `src` and `sum`
// are each declared twice, and both k1/k2 and kcols/krows kernels appear).
// It is not compilable as shown; confirm the real function body against the
// repository before relying on any single line.
static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
{
// Precondition variant 1: CV_8UC1 source whose step and offset are multiples
// of vlen, with sdepth limited to CV_32S or CV_32F.
if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 ||
!(sdepth == CV_32S || sdepth == CV_32F) )
return false;
// True when the default OpenCL device reports a non-zero double FP config.
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
ocl::Kernel k1("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc,
format("-D sdepth=%d", sdepth));
if (k1.empty())
// Precondition variant 2: CV_8UC1 source; sdepth may be CV_32S, CV_32F, or
// CV_64F when doubleSupport is set. No step/offset alignment test here.
if ( (_src.type() != CV_8UC1) ||
!(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
return false;
// t_size: first Size argument is the source height rounded up to a multiple
// of vlen, second is the source width; ssize is the source size plus one in
// each dimension.
Size size = _src.size(), t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width),
ssize(size.width + 1, size.height + 1);
_sum.create(ssize, sdepth);
UMat src = _src.getUMat(), t_sum(t_size, sdepth), sum = _sum.getUMat();
t_sum = t_sum(Range::all(), Range(0, size.height));
static const int tileSize = 16;
// Kernel build options: sumT is the type name for sdepth, LOCAL_SUM_SIZE is
// tileSize, and DOUBLE_SUPPORT is defined only when doubleSupport is set.
String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
ocl::typeToStr(sdepth), tileSize,
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (kcols.empty())
return false;
int offset = (int)src.offset / vlen;
int vcols = (src.cols + vlen - 1) / vlen;
int sum_offset = (int)sum.offset / vlen;
UMat src = _src.getUMat();
Size src_size = src.size();
// Intermediate buffer: first Size argument is the source height rounded up to
// a multiple of tileSize, second is the source width rounded up likewise.
// NOTE(review): presumably a transposed layout; confirm against the kernel.
Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
UMat buf(bufsize, sdepth);
kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
// 1-D launch with global size src.cols and local size tileSize.
size_t gt = src.cols, lt = tileSize;
if (!kcols.run(1, &gt, &lt, false))
return false;
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
offset, src.rows, src.cols, (int)src.step, (int)t_sum.step);
size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
if (!k1.run(1, &gt, &lt, false))
ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (krows.empty())
return false;
ocl::Kernel k2("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc,
format("-D sdepth=%d", sdepth));
k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrWriteOnly(sum),
t_sum.rows, t_sum.cols, (int)t_sum.step, (int)sum.step, sum_offset);
// Destination is one element larger than the source in each dimension.
Size sumsize(src_size.width + 1, src_size.height + 1);
_sum.create(sumsize, sdepth);
UMat sum = _sum.getUMat();
size_t gt2 = t_sum.cols * 32, lt2 = 256;
return k2.run(1, &gt2, &lt2, false);
krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
// Second launch reuses gt/lt: global size src.rows, local size tileSize.
gt = src.rows;
return krows.run(1, &gt, &lt, false);
}
// OpenCL path for the integral that also produces a squared-sum output:
// creates _sum with depth sdepth and _sqsum with depth sqdepth, and returns
// false whenever a precondition fails or a kernel cannot be built or run.
//
// NOTE(review): this listing appears to be a rendered diff in which removed and
// added lines are interleaved without +/- markers (for example `src` and `sum`
// are each declared twice, and both k1/k2 and kcols/krows kernels appear).
// It is not compilable as shown; confirm the real function body against the
// repository before relying on any single line.
static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
{
// True when the default OpenCL device reports a non-zero double FP config.
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
// Precondition variant 1: CV_8UC1 source whose step and offset are multiples
// of vlen; CV_64F for either depth is rejected without doubleSupport.
if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 ||
(!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
return false;
char cvt[40];
String opts = format("-D sdepth=%d -D sqdepth=%d -D TYPE=%s -D TYPE4=%s4 -D convert_TYPE4=%s%s",
sdepth, sqdepth, ocl::typeToStr(sqdepth), ocl::typeToStr(sqdepth),
ocl::convertTypeStr(sdepth, sqdepth, 4, cvt),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k1("integral_cols", ocl::imgproc::integral_sqrsum_oclsrc, opts);
if (k1.empty())
// Precondition variant 2: same type and double-support test, but with no
// step/offset alignment requirement.
if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
return false;
// dsize is the source size plus one in each dimension; t_size takes the
// source height rounded up to a multiple of vlen as its first argument and
// the source width as its second.
Size size = _src.size(), dsize = Size(size.width + 1, size.height + 1),
t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width);
UMat src = _src.getUMat(), t_sum(t_size, sdepth), t_sqsum(t_size, sqdepth);
t_sum = t_sum(Range::all(), Range(0, size.height));
t_sqsum = t_sqsum(Range::all(), Range(0, size.height));
_sum.create(dsize, sdepth);
_sqsum.create(dsize, sqdepth);
UMat sum = _sum.getUMat(), sqsum = _sqsum.getUMat();
static const int tileSize = 16;
int offset = (int)src.offset / vlen;
int pre_invalid = src.offset % vlen;
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
int sum_offset = (int)(sum.offset / sum.elemSize());
int sqsum_offset = (int)(sqsum.offset / sqsum.elemSize());
// Kernel build options: SUM_SQUARE is always defined, sumT/sumSQT are the type
// names for sdepth/sqdepth, LOCAL_SUM_SIZE is tileSize, and DOUBLE_SUPPORT is
// defined only when doubleSupport is set.
String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
tileSize,
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
ocl::KernelArg::PtrWriteOnly(t_sqsum), offset, pre_invalid, src.rows,
src.cols, (int)src.step, (int)t_sum.step, (int)t_sqsum.step);
// Same kernel name and program source as the single-output function; only the
// build options differ.
ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (kcols.empty())
return false;
size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
if (!k1.run(1, &gt, &lt, false))
UMat src = _src.getUMat();
Size src_size = src.size();
// Intermediate buffers: first Size argument is the source height rounded up to
// a multiple of tileSize, second is the source width rounded up likewise.
// NOTE(review): presumably a transposed layout; confirm against the kernel.
Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
UMat buf(bufsize, sdepth);
UMat buf_sq(bufsize, sqdepth);
kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq));
// 1-D launch with global size src.cols and local size tileSize.
size_t gt = src.cols, lt = tileSize;
if (!kcols.run(1, &gt, &lt, false))
return false;
ocl::Kernel k2("integral_rows", ocl::imgproc::integral_sqrsum_oclsrc, opts);
if (k2.empty())
ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
if (krows.empty())
return false;
k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrReadOnly(t_sqsum),
ocl::KernelArg::PtrWriteOnly(sum), ocl::KernelArg::PtrWriteOnly(sqsum),
t_sum.rows, t_sum.cols, (int)t_sum.step, (int)t_sqsum.step,
(int)sum.step, (int)sqsum.step, sum_offset, sqsum_offset);
// Both destinations are one element larger than the source in each dimension.
Size sumsize(src_size.width + 1, src_size.height + 1);
_sum.create(sumsize, sdepth);
UMat sum = _sum.getUMat();
_sqsum.create(sumsize, sqdepth);
UMat sum_sq = _sqsum.getUMat();
size_t gt2 = t_sum.cols * 32, lt2 = 256;
return k2.run(1, &gt2, &lt2, false);
krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq));
// Second launch reuses gt/lt: global size src.rows, local size tileSize.
gt = src.rows;
return krows.run(1, &gt, &lt, false);
}
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment