Commit 9786a552 authored by vludv's avatar vludv

Removed OpenCL realization to test build.

parent 291c87a1
#include "precomp.hpp"
#include "dtfilter_ocl.hpp"
#include "opencl_kernels.hpp"
//#include <opencv2/highgui.hpp>
using namespace std;
#ifdef _MSC_VER
# pragma warning(disable: 4512)
#endif
#define MEASURE_TIME(counter, code) { counter -= getTickCount(); {code;} counter += getTickCount(); }
#define MEASURE_KER_TIME(code) { kernelsTime -= getTickCount(); code; kernelsTime += getTickCount(); }
namespace cv
{
int64 DTFilterOCL::totalKernelsTime = 0;
Ptr<DTFilterOCL> DTFilterOCL::create(InputArray guide, double sigmaSpatial, double sigmaColor, int mode, int numIters)
{
DTFilterOCL *p = new DTFilterOCL();
p->init(guide, sigmaSpatial, sigmaColor, mode, numIters);
return Ptr<DTFilterOCL>(p);
}
void DTFilterOCL::init(InputArray guide_, double sigmaSpatial_, double sigmaColor_, int mode_, int numIters_)
{
CV_Assert( guide_.channels() <= 4 && (guide_.depth() == CV_8U || guide_.depth() == CV_32F) );
CV_Assert( numIters_ <= 3);
mode = mode_;
numIters = numIters_;
sigmaSpatial = std::max(1.00f, (float)sigmaSpatial_);
sigmaColor = std::max(0.01f, (float)sigmaColor_);
kernelsTime = 0;
singleFilterCall = false;
h = guide_.rows();
w = guide_.cols();
NC_ocl_implememtation = ALG_INDEX;
UMat guide = guide_.getUMat();
initProgram();
setBuildOptionsDT(guide);
if (mode == DTF_NC)
{
initDT_NC(guide);
}
else if (mode == DTF_RF)
{
initDT_RF(guide);
}
else if (mode == DTF_IC)
{
initDT_IC(guide);
}
else
{
CV_Error(Error::StsBadFlag, "Incorrect DT filter mode");
}
}
void DTFilterOCL::initDT_NC(UMat &guide)
{
bool useIDT = false;
if (useIDT)
{
UMat guideT;
MEASURE_KER_TIME( transpose(guide, guideT); );
computeIDT(guide, guideT);
if (NC_ocl_implememtation == ALG_INDEX)
{
computeIndexMat(idistHor, distIndexHor);
computeIndexMat(idistVert, distIndexVert);
idistHor.release();
idistVert.release();
}
}
else
{
UMat dsitHorWrapper, distVertWrapper;
computeDT(guide, dsitHorWrapper, distVertWrapper);
distHor = dsitHorWrapper(Range::all(), Range(1, w + 1));
MEASURE_KER_TIME( transpose(distVertWrapper, distVertWrapper); );
distVert = distVertWrapper(Range::all(), Range(1, h + 1));
if (NC_ocl_implememtation == ALG_INDEX)
{
computeIndexMat(distHor, distIndexHor, false);
computeIndexMat(distVert, distIndexVert, false);
distHor.release();
distVert.release();
}
}
}
void DTFilterOCL::initDT_RF(UMat &guide)
{
float sigmaRatio = (float)(sigmaSpatial / sigmaColor);
float alpha1 = getIterAlpha(1);
ocl::Kernel kerHor("compute_a0DT_vert", kerProgSrcDT, buildOptionsDT);
ocl::Kernel kerVert("compute_a0DT_vert", kerProgSrcDT, buildOptionsDT);
CV_Assert(!kerHor.empty() && !kerVert.empty());
UMat guideT;
transpose(guide, guideT);
a0distHor.create(guide.cols - 1, guide.rows, CV_32FC1);
a0distVert.create(guide.rows - 1, guide.cols, CV_32FC1);
kerHor.args(
ocl::KernelArg::ReadOnly(guideT),
ocl::KernelArg::ReadWriteNoSize(a0distHor),
sigmaRatio, alpha1);
kerVert.args(
ocl::KernelArg::ReadOnly(guide),
ocl::KernelArg::ReadWriteNoSize(a0distVert),
sigmaRatio, alpha1);
size_t globalSizeHor[] = { guide.cols - 1, guide.rows };
size_t globalSizeVert[] = { guide.rows - 1, guide.cols };
size_t localSizeHor[] = { 1, getMaxWorkGropSize() };
size_t localSizeVert[] = { 1, getMaxWorkGropSize() };
bool sync = true;
MEASURE_KER_TIME(kerHor.run(2, globalSizeHor, localSizeHor, sync));
MEASURE_KER_TIME(kerVert.run(2, globalSizeVert, localSizeVert, sync));
}
void DTFilterOCL::initDT_IC(UMat& guide)
{
UMat distHorWrapper, distVertWrapper;
computeDT(guide, distHorWrapper, distVertWrapper);
distHor = distHorWrapper(Range::all(), Range(1, w + 1));
distVert = distVertWrapper(Range(1, 1 + h), Range::all());
UMat distHorWrapperT;
MEASURE_KER_TIME( transpose(distHorWrapper, distHorWrapperT); );
distHorT = distHorWrapperT(Range(1, w + 1), Range::all());
UMat distVertWrapperT;
MEASURE_KER_TIME( transpose(distVertWrapper, distVertWrapperT); );
distVertT = distVertWrapperT(Range::all(), Range(1, h + 1));
computeIndexAndTailMat(distHor, distIndexHor, tailHor);
computeIndexAndTailMat(distVertT, distIndexVert, tailVert);
}
void DTFilterOCL::filter(InputArray src_, OutputArray dst_, int dDepth)
{
CV_Assert( src_.channels() <= 4 && (src_.depth() == CV_8U || src_.depth() == CV_32F) );
CV_Assert( src_.cols() == w && src_.rows() == h );
setBuildOptionsFlt(src_);
if (dDepth == -1) dDepth = src_.depth();
if (mode == DTF_NC)
{
UMat src = UMatAligned(h, w, workType);
UMat dst;
MEASURE_KER_TIME( src_.getUMat().convertTo(src, workType) );
if (NC_ocl_implememtation == ALG_PER_ROW)
{
filterNC_perRowAlg(src, dst);
}
else if (NC_ocl_implememtation == ALG_PER_PIXEL)
{
filterNC_PerPixelAlg(src, dst);
}
else if (NC_ocl_implememtation == ALG_INDEX)
{
filterNC_IndexAlg(src, dst);
}
dst.convertTo(dst_, dDepth);
}
else if (mode == DTF_RF)
{
filterRF(src_, dst_, dDepth);
}
else if (mode == DTF_IC)
{
filterIC(src_, dst_, dDepth);
}
else
{
CV_Error(Error::StsBadFlag, "Incorrect DT filter mode");
}
}
void DTFilterOCL::initProgram()
{
//CV_Assert(ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU);
kerProgSrcDT = ocl::ximgproc::dtfilter_dt_oclsrc;
kerProgSrcFilter = ocl::ximgproc::dtfilter_flt_oclsrc;
}
DTFilterOCL::DTFilterOCL()
: a0distHor(distHor), a0distVert(distVert)
{
}
DTFilterOCL::~DTFilterOCL()
{
totalKernelsTime += kernelsTime;
//std::cout << "Kernels time " << kernelsTime / getTickFrequency() << " sec.\n";
//std::cout << "Total kernels time " << totalKernelsTime / getTickFrequency() << " sec.\n";
}
void DTFilterOCL::integrateCols(UMat& src, UMat& isrc)
{
CV_Assert(src.depth() == CV_32F);
int sizeof_float4 = 16;
int reqChuncks4f = (src.cols*src.elemSize() + sizeof_float4 - 1) / sizeof_float4;
int maxChuncks4f = src.step.p[0] / sizeof_float4;
CV_Assert(reqChuncks4f <= maxChuncks4f);
createAligned(isrc, src.rows + 1, src.cols, src.type(), sizeof_float4, USAGE_ALLOCATE_DEVICE_MEMORY);
ocl::Kernel ker = ocl::Kernel("integrate_cols_4f", kerProgSrcFilter, buildOptionsFlt);
CV_Assert(!ker.empty());
ker.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(isrc), (int)src.rows, reqChuncks4f);
size_t globKerSize4f[] = { reqChuncks4f };
bool oclKerStatus;
MEASURE_KER_TIME( oclKerStatus = ker.run(1, globKerSize4f, NULL, true); );
CV_Assert(oclKerStatus);
}
void DTFilterOCL::computeIndexMat(UMat& _dist, UMat& distIndex, bool useIDT)
{
distIndex.create(_dist.rows*numIters, 2*_dist.cols, CV_32SC1);
const char *kernelName = (useIDT) ? "find_conv_bounds_by_idt" : "find_conv_bounds_by_dt";
ocl::Kernel ker = ocl::Kernel(kernelName, kerProgSrcDT, buildOptionsDT);
ker.args(
ocl::KernelArg::ReadOnly(_dist),
ocl::KernelArg::WriteOnlyNoSize(distIndex),
getIterRadius(1), getIterRadius(2), getIterRadius(3)
);
size_t globSize[] = { _dist.rows, _dist.cols };
size_t localSize[] = { 1, getMaxWorkGropSize() };
bool oclKernelStatus;
MEASURE_KER_TIME(oclKernelStatus = ker.run(2, globSize, localSize, true));
CV_Assert(oclKernelStatus);
}
void DTFilterOCL::computeIndexAndTailMat(UMat& dist, UMat& distIndex, UMat& tail)
{
distIndex.create(dist.rows*numIters, 2*dist.cols, CV_32SC1);
tail.create(dist.rows*numIters, 2*dist.cols, CV_32FC1);
ocl::Kernel ker = ocl::Kernel("find_conv_bounds_and_tails", kerProgSrcDT, buildOptionsDT);
CV_Assert(!ker.empty());
ker.args(
ocl::KernelArg::ReadOnly(dist),
ocl::KernelArg::WriteOnlyNoSize(distIndex),
ocl::KernelArg::WriteOnlyNoSize(tail),
getIterRadius(1), getIterRadius(2), getIterRadius(3)
);
size_t globSize[] = {dist.rows, dist.cols};
size_t localSize[] = {1, getMaxWorkGropSize()};
bool oclKerStatus;
MEASURE_KER_TIME( oclKerStatus = ker.run(2, globSize, localSize, true); );
CV_Assert(oclKerStatus);
}
int DTFilterOCL::getMaxWorkGropSize()
{
int maxWorkGropSize = ocl::Device::getDefault().maxWorkGroupSize();
return (maxWorkGropSize <= 0) ? 256 : maxWorkGropSize;
}
int DTFilterOCL::getCacheLineSize()
{
int lineSize = ocl::Device::getDefault().globalMemCacheLineSize();
return (lineSize <= 0) ? 128 : lineSize;
}
void DTFilterOCL::filterRF(InputArray src_, OutputArray dst_, int dDepth)
{
UMat res = UMatAligned(h, w, workType, getCacheLineSize());
UMat resT = UMatAligned(w, h, workType, getCacheLineSize());
UMat adistHor, adistVert;
MEASURE_KER_TIME(src_.getUMat().convertTo(res, workType));
if (singleFilterCall || numIters == 1)
{
adistHor = a0distHor;
adistVert = a0distVert;
}
else
{
a0distHor.copyTo(adistHor);
a0distVert.copyTo(adistVert);
}
if (true)
{
filterRF_blockAlg(res, resT, adistHor, adistVert);
}
else
{
filterRF_naiveAlg(res, resT, adistHor, adistVert);
}
res.convertTo(dst_, dDepth);
}
void DTFilterOCL::filterRF_blockAlg(UMat& res, UMat& resT, UMat& adistHor, UMat& adistVert)
{
resT.create(res.cols, res.rows, res.type());
UMat weights = UMatAligned(res.rows, res.cols, CV_32FC1, getCacheLineSize(), USAGE_ALLOCATE_DEVICE_MEMORY);
UMat weightsT = UMatAligned(res.cols, res.rows, CV_32FC1, getCacheLineSize(), USAGE_ALLOCATE_DEVICE_MEMORY);
for (int iter = 1; iter <= numIters; iter++)
{
transpose(res, resT);
filterRF_iter_vert_pass(resT, adistHor, weightsT);
transpose(resT, res);
filterRF_iter_vert_pass(res, adistVert, weights);
if (iter < numIters)
{
cv::pow(adistHor, 2, adistHor);
cv::pow(adistVert, 2, adistVert);
}
}
}
void DTFilterOCL::filterRF_iter_vert_pass(UMat& res, UMat& adist, UMat& weights)
{
int blockSize = (int)std::sqrt((float)res.rows);
int blocksCount = (res.rows + blockSize-1)/blockSize;
bool sync = true;
for (int passId = 0; passId <= 1; passId++)
{
{
char *knames[] = {"filter_RF_block_init_fwd", "filter_RF_block_init_bwd"};
ocl::Kernel kerInit(knames[passId], kerProgSrcFilter, buildOptionsFlt);
CV_Assert(!kerInit.empty());
kerInit.args(
ocl::KernelArg::ReadWrite(res),
ocl::KernelArg::ReadWriteNoSize(adist),
ocl::KernelArg::WriteOnlyNoSize(weights),
blockSize);
size_t globSize[] = { blocksCount, res.cols };
size_t localSize[] = { 1, getMaxWorkGropSize() };
CV_Assert(kerInit.run(2, globSize, localSize, sync));
//imwrite(String(knames[passId]) + ".png", res.t());
}
{
const char *knames[] = {"filter_RF_block_fill_borders_fwd", "filter_RF_block_fill_borders_bwd"};
ocl::Kernel kerFillBorders(knames[passId], kerProgSrcFilter, buildOptionsFlt);
CV_Assert(!kerFillBorders.empty());
kerFillBorders.args(
ocl::KernelArg::ReadWrite(res),
ocl::KernelArg::ReadOnlyNoSize(weights),
blockSize);
size_t globSize[] = { res.cols };
CV_Assert(kerFillBorders.run(1, globSize, NULL, sync));
//imwrite(String(knames[passId]) + ".png", res.t());
}
{
const char *knames[] = {"filter_RF_block_fill_fwd", "filter_RF_block_fill_bwd"};
ocl::Kernel kerFill(knames[passId], kerProgSrcFilter, buildOptionsFlt);
CV_Assert(!kerFill.empty());
kerFill.args(
ocl::KernelArg::ReadWrite(res),
ocl::KernelArg::ReadOnlyNoSize(weights),
blockSize);
size_t globSize[] = { res.rows - blockSize, res.cols };
size_t localSize[] = { 1, getMaxWorkGropSize() };
CV_Assert(kerFill.run(2, globSize, localSize, sync));
//imwrite(String(knames[passId]) + ".png", res.t());
}
}
}
void DTFilterOCL::filterRF_naiveAlg(UMat& res, UMat& resT, UMat& adistHor, UMat& adistVert)
{
ocl::Kernel kerHor("filter_RF_vert", kerProgSrcFilter, buildOptionsFlt);
ocl::Kernel kerVert("filter_RF_vert", kerProgSrcFilter, buildOptionsFlt);
CV_Assert(!kerHor.empty() && !kerVert.empty());
size_t globSizeHor[] = { h };
size_t globSizeVert[] = { w };
bool syncQueue = true;
for (int iter = 1; iter <= numIters; iter++)
{
bool statusHorPass, statusVertPass;
int write_a0 = (iter < numIters);
MEASURE_KER_TIME(transpose(res, resT));
kerHor.args(ocl::KernelArg::ReadWrite(resT), ocl::KernelArg::ReadWriteNoSize(adistHor), write_a0);
MEASURE_KER_TIME(statusHorPass = kerHor.run(1, globSizeHor, NULL, syncQueue));
MEASURE_KER_TIME(transpose(resT, res));
kerVert.args(ocl::KernelArg::ReadWrite(res), ocl::KernelArg::ReadWriteNoSize(adistVert), write_a0);
MEASURE_KER_TIME(statusVertPass = kerVert.run(1, globSizeVert, NULL, syncQueue));
CV_Assert(statusHorPass && statusVertPass);
}
}
void DTFilterOCL::filterNC_IndexAlg(UMat& src, UMat& dst)
{
CV_Assert(src.rows == h && src.cols == w);
UMat srcT = UMatAligned(w, h, workType);
UMat isrc, isrcT;
ocl::Kernel kerFiltHor = ocl::Kernel("filter_NC_hor_by_bounds", kerProgSrcFilter, buildOptionsFlt);
ocl::Kernel kerFiltVert = ocl::Kernel("filter_NC_hor_by_bounds", kerProgSrcFilter, buildOptionsFlt);
size_t globSizeHor[] = {h, w};
size_t globSizeVert[] = {w, h};
size_t localSize[] = {1, getMaxWorkGropSize()};
bool sync = true;
MEASURE_KER_TIME( transpose(src, srcT) );
for (int iter = 0; iter < numIters; iter++)
{
bool kerStatus = true;
MEASURE_KER_TIME( integrateCols(srcT, isrcT) );
MEASURE_KER_TIME( transpose(isrcT, isrc) );
kerFiltHor.args(
ocl::KernelArg::ReadOnlyNoSize(isrc),
ocl::KernelArg::ReadOnlyNoSize(distIndexHor),
ocl::KernelArg::WriteOnly(src),
iter
);
MEASURE_KER_TIME( kerStatus &= kerFiltHor.run(2, globSizeHor, localSize, sync) );
MEASURE_KER_TIME( integrateCols(src, isrc) );
MEASURE_KER_TIME( transpose(isrc, isrcT) );
kerFiltVert.args(
ocl::KernelArg::ReadOnlyNoSize(isrcT),
ocl::KernelArg::ReadOnlyNoSize(distIndexVert),
ocl::KernelArg::WriteOnly(srcT),
iter
);
MEASURE_KER_TIME( kerStatus &= kerFiltVert.run(2, globSizeVert, localSize, sync) );
CV_Assert(kerStatus);
}
MEASURE_KER_TIME( transpose(srcT, dst) );
}
void DTFilterOCL::filterNC_perRowAlg(UMat& src, UMat& dst)
{
dst.create(h, w, workType);
UMat& res = dst;
UMat srcT = UMat(w, h, workType);
UMat resT = UMat(w, h, workType);
MEASURE_KER_TIME(transpose(src, srcT));
ocl::Kernel kerHor("filter_NC_vert", kerProgSrcFilter, buildOptionsFlt);
ocl::Kernel kerVert("filter_NC_vert", kerProgSrcFilter, buildOptionsFlt);
CV_Assert(!kerHor.empty() && !kerVert.empty());
size_t globSizeHor[] = { h };
size_t globSizeVert[] = { w };
bool syncQueue = true;
for (int iter = 1; iter <= numIters; iter++)
{
bool statusHorPass, statusVertPass;
float radius = getIterRadius(iter);
kerHor.args(
ocl::KernelArg::ReadOnly(srcT),
ocl::KernelArg::ReadOnlyNoSize(idistHor),
ocl::KernelArg::WriteOnlyNoSize(resT),
radius
);
MEASURE_KER_TIME(statusHorPass = kerHor.run(1, globSizeHor, NULL, syncQueue));
MEASURE_KER_TIME(transpose(resT, src));
kerVert.args(
ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::ReadOnlyNoSize(idistVert),
ocl::KernelArg::WriteOnlyNoSize(res),
radius
);
MEASURE_KER_TIME(statusVertPass = kerVert.run(1, globSizeVert, NULL, syncQueue));
if (iter < numIters)
MEASURE_KER_TIME(transpose(res, srcT));
CV_Assert(statusHorPass && statusVertPass);
}
}
void DTFilterOCL::filterNC_PerPixelAlg(UMat& src, UMat& dst)
{
dst = src;
UMat res(h, w, workType);
UMat srcT(w, h, workType), resT(w, h, workType);
ocl::Kernel kerHor("filter_NC_hor", kerProgSrcFilter);
ocl::Kernel kerVert("filter_NC_hor", kerProgSrcFilter);
CV_Assert(!kerHor.empty() && !kerVert.empty());
size_t globSizeHor[] = { h, w };
size_t globSizeVert[] = { w, h };
size_t localSize[] = { 1, getMaxWorkGropSize() };
bool sync = true;
for (int iter = 1; iter <= numIters; iter++)
{
bool kerStatus = true;
float radius = getIterRadius(iter);
kerHor.args(
ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::ReadOnlyNoSize(idistHor),
ocl::KernelArg::WriteOnlyNoSize(res),
radius
);
MEASURE_KER_TIME(kerStatus &= kerHor.run(2, globSizeHor, localSize, sync));
MEASURE_KER_TIME(transpose(res, srcT));
kerVert.args(
ocl::KernelArg::ReadOnly(srcT),
ocl::KernelArg::ReadOnlyNoSize(idistVert),
ocl::KernelArg::WriteOnlyNoSize(resT),
radius
);
MEASURE_KER_TIME(kerStatus &= kerVert.run(2, globSizeVert, localSize, sync));
MEASURE_KER_TIME(transpose(resT, src));
CV_Assert(kerStatus);
}
}
void DTFilterOCL::filterIC(InputArray src_, OutputArray dst_, int dDepth)
{
UMat srcWrapper(h + 1, w + 2, workType);
UMat srcTWrapper(w + 1, h + 2, workType);
UMat src = srcWrapper(Range(1, 1 + h), Range(1, 1 + w));
UMat srcT = srcTWrapper(Range(1, 1 + w), Range(1, 1 + h));
UMat isrc = UMatAligned(h, w + 1, workType);
UMat isrcT = UMatAligned(w + 1, h, workType);
UMat res = UMatAligned(h, w, workType);
UMat resT = UMatAligned(w, h, workType);
MEASURE_KER_TIME( src_.getUMat().convertTo(src, workType) );
ocl::Kernel kerHor = ocl::Kernel("filter_IC_hor", kerProgSrcFilter, buildOptionsFlt);
ocl::Kernel kerVert = ocl::Kernel("filter_IC_hor", kerProgSrcFilter, buildOptionsFlt);
CV_Assert(!kerHor.empty() && !kerVert.empty());
size_t globSizeHor[] = {h, w};
size_t globSizeVert[] = {w, h};
size_t localSize[] = {1, getMaxWorkGropSize()};
bool sync = true;
MEASURE_KER_TIME( transpose(src, resT) );
for (int iter = 0; iter < numIters; iter++)
{
bool oclKerStatus = true;
float curIterRadius = getIterRadius(iter);
integrateColsIC(resT, distHorT, isrcT);
MEASURE_KER_TIME( transpose(isrcT, isrc); );
if (iter != 0) MEASURE_KER_TIME( transpose(resT, src); );
duplicateVerticalBorders(srcWrapper);
kerHor.args(
ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::ReadOnlyNoSize(isrc),
ocl::KernelArg::ReadOnlyNoSize(distIndexHor),
ocl::KernelArg::ReadOnlyNoSize(tailHor),
ocl::KernelArg::ReadOnlyNoSize(distHor),
ocl::KernelArg::WriteOnlyNoSize(res),
curIterRadius, iter
);
MEASURE_KER_TIME( oclKerStatus &= kerHor.run(2, globSizeHor, localSize, sync); );
integrateColsIC(res, distVert, isrc);
MEASURE_KER_TIME( transpose(isrc, isrcT); );
MEASURE_KER_TIME( transpose(res, srcT); );
duplicateVerticalBorders(srcTWrapper);
kerVert.args(
ocl::KernelArg::ReadOnly(srcT),
ocl::KernelArg::ReadOnlyNoSize(isrcT),
ocl::KernelArg::ReadOnlyNoSize(distIndexVert),
ocl::KernelArg::ReadOnlyNoSize(tailVert),
ocl::KernelArg::ReadOnlyNoSize(distVertT),
ocl::KernelArg::WriteOnlyNoSize(resT),
curIterRadius, iter
);
MEASURE_KER_TIME( oclKerStatus &= kerVert.run(2, globSizeVert, localSize, sync); );
CV_Assert(oclKerStatus);
}
if (dDepth == resT.depth())
{
transpose(resT, dst_);
}
else
{
transpose(resT, res);
res.convertTo(dst_, dDepth);
}
}
void DTFilterOCL::setBuildOptionsFlt(InputArray src_)
{
int cn = src_.channels();
//int depth = src_.depth();
workType = CV_MAKE_TYPE(CV_32F, cn);
buildOptionsFlt = format(
"-D cn=%d "
"-D SrcVec=%s "
"-D NUM_ITERS=%d "
,
cn,
ocl::typeToStr(workType),
numIters
);
//std::cout << "build options flt.: " << buildOptionsFlt.c_str() << "\n";
}
void DTFilterOCL::setBuildOptionsDT(UMat &guide)
{
int gType = guide.type();
int gcn = guide.channels();
int gDepth = guide.depth();
char strBuf[40];
buildOptionsDT = format(
"-D NUM_ITERS=%d "
"-D cn=%d "
"-D GuideType=%s "
"-D GuideVec=%s "
,
numIters,
gcn,
ocl::typeToStr(gDepth),
ocl::typeToStr(gType)
);
if (gDepth != CV_32F)
{
buildOptionsDT += format("-D convert_guide=%s ", ocl::convertTypeStr(gDepth, CV_32F, gcn, strBuf));
}
//cout << "buildOptions:" << buildOptionsDT.c_str() << "\n";
}
void DTFilterOCL::computeDT(UMat& guide, UMat& distHorOuter, UMat& distVertOuter)
{
ocl::Kernel kerHor = ocl::Kernel("compute_dt_hor", kerProgSrcDT, buildOptionsDT);
ocl::Kernel kerVert = ocl::Kernel("compute_dt_vert", kerProgSrcDT, buildOptionsDT);
CV_Assert(!kerHor.empty() && !kerVert.empty());
distHorOuter.create(h, w + 1, CV_32FC1);
distVertOuter.create(h + 1, w, CV_32FC1);
UMat distHor = distHorOuter(Range::all(), Range(1, w + 1));
UMat distVert = distVertOuter(Range(1, h + 1), Range::all());
float maxRadius = 1.1f*getIterRadius(1);
float sigmaRatio = (float)(sigmaSpatial/sigmaColor);
kerHor.args(
ocl::KernelArg::ReadOnly(guide),
ocl::KernelArg::WriteOnlyNoSize(distHor),
sigmaRatio,
maxRadius
);
kerVert.args(
ocl::KernelArg::ReadOnly(guide),
ocl::KernelArg::WriteOnlyNoSize(distVert),
sigmaRatio,
maxRadius
);
size_t globSizeHor[] = {h, w + 1};
size_t globSizeVert[] = {h + 1, w};
size_t localSize[] = {1, getMaxWorkGropSize()};
bool sync = true;
bool oclKernelStatus = true;
MEASURE_KER_TIME( oclKernelStatus &= kerHor.run(2, globSizeHor, localSize, sync); );
MEASURE_KER_TIME( oclKernelStatus &= kerVert.run(2, globSizeVert, localSize, sync); );
CV_Assert(oclKernelStatus);
}
void DTFilterOCL::computeIDT(UMat& guide, UMat& guideT)
{
ocl::Kernel kerHor("compute_idt_vert", kerProgSrcDT, buildOptionsDT);
ocl::Kernel kerVert("compute_idt_vert", kerProgSrcDT, buildOptionsDT);
CV_Assert(!kerHor.empty() && !kerVert.empty());
UMat idistHorWrap(w + 2, h, CV_32FC1);
UMat idistVertWrap(h + 2, w, CV_32FC1);
idistHor = idistHorWrap(Range(1, 1 + w), Range::all());
idistVert = idistVertWrap(Range(1, 1 + h), Range::all());
float sigmaRatio = (float)(sigmaSpatial / sigmaColor);
kerHor.args(ocl::KernelArg::ReadOnly(guideT),
ocl::KernelArg::WriteOnlyNoSize(idistHor),
sigmaRatio);
kerVert.args(ocl::KernelArg::ReadOnly(guide),
ocl::KernelArg::WriteOnlyNoSize(idistVert),
sigmaRatio);
size_t globalSizeHor[] = { guide.rows };
size_t globalSizeVert[] = { guide.cols };
bool sync = true;
MEASURE_KER_TIME(kerHor.run(1, globalSizeHor, NULL, sync));
MEASURE_KER_TIME(kerVert.run(1, globalSizeVert, NULL, sync));
if (NC_ocl_implememtation != ALG_PER_ROW)
{
MEASURE_KER_TIME(transpose(idistHorWrap, idistHorWrap));
MEASURE_KER_TIME(transpose(idistVertWrap, idistVertWrap));
idistHor = idistHorWrap(Range::all(), Range(1, 1 + w));
idistVert = idistVertWrap(Range::all(), Range(1, 1 + h));
}
else
{
idistHor = idistHorWrap(Range(1, 1 + w), Range::all());
idistVert = idistVertWrap(Range(1, 1 + h), Range::all());
}
}
void DTFilterOCL::duplicateVerticalBorders(UMat& srcOuter)
{
Range rangeRows(0, srcOuter.rows);
{
UMat leftBorderSrc = srcOuter(rangeRows, Range(1, 2));
UMat leftBorderDst = srcOuter(rangeRows, Range(0, 1));
leftBorderSrc.copyTo(leftBorderDst);
}
{
UMat rightBorderSrc = srcOuter(rangeRows, Range(srcOuter.cols - 2, srcOuter.cols - 1));
UMat rightBorderDst = srcOuter(rangeRows, Range(srcOuter.cols - 1, srcOuter.cols));
rightBorderSrc.copyTo(rightBorderDst);
}
}
void DTFilterOCL::integrateColsIC(UMat& src, UMat& dist, UMat& isrc)
{
CV_Assert(src.size() == dist.size() && src.depth() == CV_32F);
isrc.create(src.rows + 1, src.cols, workType);
ocl::Kernel ker("integrate_cols_with_dist", kerProgSrcFilter, buildOptionsFlt);
CV_Assert(!ker.empty());
ker.args(
ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::ReadOnlyNoSize(dist),
ocl::KernelArg::WriteOnlyNoSize(isrc)
);
size_t globSize[] = {src.cols};
size_t localSize[] = {getMaxWorkGropSize()};
bool oclKernelStatus;
MEASURE_KER_TIME( oclKernelStatus = ker.run(1, globSize, localSize, true) );
CV_Assert(oclKernelStatus);
}
cv::UMat DTFilterOCL::UMatAligned(int rows, int cols, int type, int align, int usageFlags)
{
int depth = CV_MAT_DEPTH(type);
int cn = CV_MAT_CN(type);
int elemSize1 = CV_ELEM_SIZE1(type);
int elemSize = CV_ELEM_SIZE(type);
int lineSize = ((cols*elemSize + align-1) / align)*align;
CV_Assert(lineSize % elemSize1 == 0);
UMat dst(rows, lineSize/elemSize1, CV_MAKE_TYPE(depth, 1), usageFlags);
return dst(Range::all(), Range(0, cols*cn)).reshape(cn);
}
void DTFilterOCL::createAligned(UMat& src, int rows, int cols, int type, int align, int usageFlags)
{
if (src.empty() || src.rows != rows || src.cols != cols || src.type() != type)
{
if (CV_ELEM_SIZE(type)*cols % align == 0)
{
src.create(rows, cols, type, (UMatUsageFlags)usageFlags);
}
else
{
src = UMatAligned(rows, cols, type, align, usageFlags);
}
}
}
void DTFilterOCL::setSingleFilterCall(bool flag)
{
this->singleFilterCall = flag;
}
}
\ No newline at end of file
#pragma once
#include "precomp.hpp"
namespace cv
{
class DTFilterOCL : public DTFilter
{
public:
static Ptr<DTFilterOCL> create(InputArray guide, double sigmaSpatial, double sigmaColor, int mode = DTF_NC, int numIters = 3);
void filter(InputArray src, OutputArray dst, int dDepth = -1);
void setSingleFilterCall(bool flag);
~DTFilterOCL();
protected: /*Members declarations*/
UMat idistHor, idistVert;
UMat distHor, distVert;
UMat distHorT, distVertT;
UMat &a0distHor, &a0distVert; //synonyms of distHor distVert
UMat distIndexHor, distIndexVert;
UMat tailVert, tailHor;
int mode, numIters;
float sigmaSpatial, sigmaColor;
bool singleFilterCall;
int h, w;
int workType;
double meanDist;
static int64 totalKernelsTime;
int64 kernelsTime;
ocl::ProgramSource kerProgSrcDT;
ocl::ProgramSource kerProgSrcFilter;
cv::String buildOptionsFlt;
cv::String buildOptionsDT;
int NC_ocl_implememtation;
enum NCOclImplementation
{
ALG_PER_ROW,
ALG_PER_PIXEL,
ALG_INDEX,
};
protected: /*Functions declarations*/
DTFilterOCL();
void init(InputArray guide, double sigmaSpatial, double sigmaColor, int mode = DTF_NC, int numIters = 3);
inline double getIterSigmaH(int iterNum)
{
return sigmaSpatial * std::pow(2.0, numIters - iterNum) / sqrt(std::pow(4.0, numIters) - 1);
}
inline float getIterRadius(int iterNum)
{
return (float)(3.0*getIterSigmaH(iterNum));
}
inline float getIterAlpha(int iterNum)
{
return (float)std::exp(-std::sqrt(2.0 / 3.0) / getIterSigmaH(iterNum));
}
int getMaxWorkGropSize();
int getCacheLineSize();
void initProgram();
void setBuildOptionsFlt(InputArray src_);
void setBuildOptionsDT(UMat& guide);
void initDT_NC(UMat& guide);
void initDT_RF(UMat& guide);
void initDT_IC(UMat& guide);
void computeDT(UMat& guide, UMat& distHorOuter, UMat& distVertOuter);
void computeIDT(UMat& guide, UMat& guideT);
void filterNC_IndexAlg(UMat& src, UMat& dst);
void filterNC_perRowAlg(UMat& src, UMat& dst);
void filterNC_PerPixelAlg(UMat& src, UMat& dst);
void filterIC(InputArray src_, OutputArray dst_, int dDepth);
void filterRF(InputArray src_, OutputArray dst_, int dDepth);
void filterRF_naiveAlg(UMat& res, UMat& resT, UMat& adistHor, UMat& adistVert);
void filterRF_blockAlg(UMat& res, UMat& resT, UMat& adistHor, UMat& adistVert);
void filterRF_iter_vert_pass(UMat& res, UMat& adist, UMat& weights);
void computeIndexMat(UMat& idist, UMat& distIndex, bool useIDT = true);
void integrateCols(UMat& src, UMat& isrc);
void computeIndexAndTailMat(UMat& dist, UMat& distIndex, UMat& tail);
void duplicateVerticalBorders(UMat& srcOuter);
void integrateColsIC(UMat& src, UMat& dist, UMat& isrc);
static UMat UMatAligned(int rows, int cols, int type, int align = 16, int useageFlags = USAGE_DEFAULT);
static void createAligned(UMat& src, int rows, int cols, int type, int align = 16, int usageFlags = USAGE_DEFAULT);
};
}
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
#if defined(_MSC_VER) //for correct syntax highlighting
#include "OpenCLKernel.hpp"
#define cn 3
#define GuideType uchar
#define GuideVec uchar3
#define convert_guide convert_float3
#define NUM_ITERS 3
#endif
#if !defined(convert_guide)
#define convert_guide(a) (a)
#endif
#if cn==3
#define ELEM_SIZE (3*sizeof(GuideType))
#define loadGuide(addr) vload3(0, (__global const GuideType *)(addr))
#else
#define ELEM_SIZE sizeof(GuideVec)
#define loadGuide(addr) ( *(__global const GuideVec*)(addr) )
#endif
#define storeDist(val, addr) *(__global float*)(addr) = val
#if cn == 1
#define SUM(a) (a)
#elif cn == 2
#define SUM(a) (a.x + a.y)
#elif cn == 3
#define SUM(a) (a.x + a.y + a.z)
#elif cn == 4
#define SUM(a) (a.x + a.y + a.z + a.w)
#else
#error "cn should be <= 4"
#endif
#define NORM(a, b) SUM(fabs(convert_guide(a) - convert_guide(b)))
#define DT(a, b, sigmaRatios) (1.0f + (sigmaRatios)*NORM(a, b))
#define getFloat(addr, index) ( *(__global const float*)(addr + (index)*sizeof(float)) )
#define storeFloat(val, addr, index) ( *(__global float*)(addr + (index)*sizeof(float)) = val )
#define PtrAdd(addr, bytesNum, Type) ((__global Type *)((__global uchar*)(addr) + bytesNum))
#define PtrAddConst(addr, bytesNum, Type) ((__global const Type *)((__global const uchar*)(addr) + bytesNum))
__kernel void find_conv_bounds_by_idt(__global const uchar *idist, int idist_step, int idist_offset, int rows, int cols,
__global uchar *bounds, int bounds_step, int bounds_offset,
float radius1, float radius2, float radius3)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < rows) || !(j >= 0 && j < cols))
return;
idist += mad24(i, idist_step, idist_offset);
__global int *bound = bounds + mad24((NUM_ITERS-1)*rows + i, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset));
float center_idt_val = getFloat(idist, j);
float radius[] = {radius1, radius2, radius3};
int low_bound = j;
int high_bound = j;
float search_idt_val;
for (int iter = NUM_ITERS - 1; iter >= 0; iter--)
{
search_idt_val = center_idt_val - radius[iter];
while (search_idt_val < getFloat(idist, low_bound-1))
low_bound--;
bound[0] = low_bound;
search_idt_val = center_idt_val + radius[iter];
while (getFloat(idist, high_bound + 1) < search_idt_val)
high_bound++;
bound[1] = high_bound;
bound = PtrAdd(bound, -rows*bounds_step, int);
}
}
__kernel void find_conv_bounds_by_dt(__global const uchar *dist, int dist_step, int dist_offset, int rows, int cols,
__global uchar *bounds, int bounds_step, int bounds_offset,
float radius1, float radius2, float radius3)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < rows) || !(j >= 0 && j < cols))
return;
dist += mad24(i, dist_step, dist_offset);
int rowsOffset = mad24(NUM_ITERS - 1, rows, i);
__global int *bound = PtrAdd(bounds, mad24(rowsOffset, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset)), int);
float radius[] = {radius1, radius2, radius3};
int low_bound = j;
int high_bound = j;
float val, cur_radius, low_dt_val = 0.0f, high_dt_val = 0.0f;
for (int iter = NUM_ITERS - 1; iter >= 0; iter--)
{
cur_radius = radius[iter];
while (cur_radius > (val = low_dt_val + getFloat(dist, low_bound - 1)) )
{
low_dt_val = val;
low_bound--;
}
bound[0] = low_bound;
while (cur_radius > (val = high_dt_val + getFloat(dist, high_bound)) )
{
high_dt_val = val;
high_bound++;
}
bound[1] = high_bound;
bound = PtrAdd(bound, -rows*bounds_step, int);
}
}
__kernel void find_conv_bounds_and_tails(__global const uchar *dist, int dist_step, int dist_offset, int rows, int cols,
__global uchar *bounds, int bounds_step, int bounds_offset,
__global float *tailmat, int tail_step, int tail_offset,
float radius1, float radius2, float radius3)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < rows) || !(j >= 0 && j < cols))
return;
dist += mad24(i, dist_step, dist_offset);
int rowsOffset = mad24(NUM_ITERS - 1, rows, i);
__global int *bound = PtrAdd(bounds, mad24(rowsOffset, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset)), int);
__global float *tail = PtrAdd(tailmat, mad24(rowsOffset, tail_step, mad24(j, 2*(int)sizeof(float), tail_offset)), float);
float radius[] = {radius1, radius2, radius3};
int low_bound = j;
int high_bound = j;
float val, cur_radius, low_dt_val = 0.0f, high_dt_val = 0.0f;
for (int iter = NUM_ITERS - 1; iter >= 0; iter--)
{
cur_radius = radius[iter];
while (cur_radius > (val = low_dt_val + getFloat(dist, low_bound - 1)) )
{
low_dt_val = val;
low_bound--;
}
bound[0] = low_bound;
tail[0] = (cur_radius - low_dt_val);
while (cur_radius > (val = high_dt_val + getFloat(dist, high_bound)) )
{
high_dt_val = val;
high_bound++;
}
bound[1] = high_bound;
tail[1] = (cur_radius - high_dt_val);
bound = PtrAdd(bound, -rows*bounds_step, int);
tail = PtrAdd(tail, -rows*tail_step, float);
}
}
__kernel void compute_dt_hor(__global const uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar *dst, int dst_step, int dst_offset,
float sigma_ratio, float max_radius)
{
int i = get_global_id(0);
int j = get_global_id(1) - 1;
if (!(i >= 0 && i < src_rows && j >= -1 && j < src_cols))
return;
src += mad24(i, src_step, mad24(j, (int)ELEM_SIZE, src_offset));
dst += mad24(i, dst_step, mad24(j, (int)sizeof(float) , dst_offset));
float dist;
if (j == -1 || j == src_cols - 1)
dist = max_radius;
else
dist = DT(loadGuide(src), loadGuide(src + ELEM_SIZE), sigma_ratio);
storeDist(dist, dst);
}
__kernel void compute_dt_vert(__global const uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar *dst, int dst_step, int dst_offset,
float sigma_ratio, float max_radius)
{
int i = get_global_id(0) - 1;
int j = get_global_id(1);
if (!(i >= -1 && i < src_rows && j >= 0 && j < src_cols))
return;
src += mad24(i, src_step, mad24(j, (int)ELEM_SIZE, src_offset));
dst += mad24(i, dst_step, mad24(j, (int)sizeof(float) , dst_offset));
float dist;
if (i == -1 || i == src_rows - 1)
dist = max_radius;
else
dist = DT(loadGuide(src), loadGuide(src + src_step), sigma_ratio);
storeDist(dist, dst);
}
__kernel void compute_idt_vert(__global const uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar *dst, int dst_step, int dst_offset,
float sigma_ratio)
{
int j = get_global_id(0);
if (!(j >= 0 && j < src_cols))
return;
int i;
float idist = 0;
__global const uchar *src_col = src + mad24(j, (int)ELEM_SIZE, src_offset);
__global uchar *dst_col = dst + mad24(j, (int)sizeof(float), dst_offset);
storeFloat(-FLT_MAX, dst_col + (-1)*dst_step, 0);
storeFloat(0.0f, dst_col + 0*dst_step, 0);
for (i = 1; i < src_rows; i++, src_col += src_step)
{
idist += DT(loadGuide(src_col), loadGuide(src_col + src_step), sigma_ratio);
storeFloat(idist, dst_col + i*dst_step, 0);
}
storeFloat(FLT_MAX, dst_col + i*dst_step, 0);
}
__kernel void compute_a0DT_vert(__global const uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar *dst, int dst_step, int dst_offset,
float sigma_ratio, float alpha)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < src_rows-1 && j >= 0 && j < src_cols))
return;
src += mad24(i, src_step, mad24(j, (int)ELEM_SIZE, src_offset));
dst += mad24(i, dst_step, mad24(j, (int)sizeof(float) , dst_offset));
float dist = DT(loadGuide(src), loadGuide(src + src_step), sigma_ratio);
storeDist(native_powr(alpha, dist), dst);
}
\ No newline at end of file
#if defined(_MSC_VER) //for correct syntax highlighting
#include "OpenCLKernel.hpp"
#define cn 3
#define SrcVec float3
#define NUM_ITERS 3
#endif
#define getFloatn(n, addr, index) vload##n(index, (__global const float*)(addr))
#define storeFloatn(n, val, addr, index) vstore##n(val, index, (__global float*)(addr))
#define PtrAdd(addr, bytesNum, Type) ((__global Type *)((__global uchar*)(addr) + bytesNum))
#define PtrAddConst(addr, bytesNum, Type) ((__global const Type *)((__global const uchar*)(addr) + bytesNum))
#if cn==3
#define ELEM_SIZE (int)(3*sizeof(float))
#define getPix(addr, index) vload3(index, (__global const float*)(addr))
#define storePix(val, addr, index) vstore3(val, index, (__global float*)(addr))
#else
#define ELEM_SIZE (int)sizeof(SrcVec)
#define getPix(addr, index) ( *(__global const SrcVec*)(addr + (index)*ELEM_SIZE) )
#define storePix(val, addr, index) ( *(__global SrcVec*)(addr + (index)*ELEM_SIZE) = val )
#endif
#define getFloat(addr, index) ( *(__global const float*)(addr + (index)*sizeof(float)) )
#define storeFloat(val, addr, index) ( *(__global float*)(addr + (index)*sizeof(float)) = val )
#define getInt(addr, index) ( *(__global const int*)(addr + (index)*sizeof(int)) )
#define storeInt(val, addr, index) ( *(__global int*)(addr + (index)*sizeof(int)) = val )
#define getFloat4(addr, index) getFloatn(4, addr, index)
#define storeFloat4(val, addr, index) storeFloatn(4, val, addr, index)
#define NC_USE_INTEGRAL_SRC
#undef NC_USE_INTEGRAL_SRC
__kernel void integrate_cols_4f(__global uchar *src, int src_step, int src_offset,
__global uchar *isrc, int isrc_step, int isrc_offset,
int rows, int col_chunks)
{
int j = get_global_id(0);
if ( !(j >= 0 && j < col_chunks) )
return;
src += mad24(j, (int)sizeof(float4), src_offset);
isrc += mad24(j, (int)sizeof(float4), isrc_offset);
float4 sum = 0;
storeFloat4(0, isrc, 0);
isrc += isrc_step;
for (int i = 0; i < rows; i++, src += src_step, isrc += isrc_step)
{
sum += getFloat4(src, 0);
storeFloat4(sum, isrc, 0);
}
}
__kernel void integrate_cols_with_dist(__global const uchar *src, int src_step, int src_offset,
int src_rows, int src_cols,
__global const uchar *dist, int dist_step, int dist_offset,
__global uchar *isrc, int isrc_step, int isrc_offset)
{
int j = get_global_id(0);
if ( !(j >= 0 && j < src_cols) )
return;
src += mad24(j, ELEM_SIZE, src_offset);
isrc += mad24(j, ELEM_SIZE, isrc_offset);
dist += mad24(j, (int)sizeof(float), dist_offset);
SrcVec sum = 0;
storePix(0, isrc, 0);
isrc += isrc_step;
for (int i = 0; i < src_rows; i++, src += src_step, isrc += isrc_step, dist += dist_step)
{
sum += 0.5f * (getPix(src, 0) + getPix(src + src_step, 0)) * getFloat(dist, 0);
storePix(sum, isrc, 0);
}
}
__kernel void filter_IC_hor(__global const uchar *src, int src_step, int src_offset, int rows, int cols,
__global const uchar *isrc, int isrc_step, int isrc_offset,
__global const int *bounds, int bounds_step, int bounds_offset,
__global const float *tail, int tail_step, int tail_offset,
__global float *dist, int dist_step, int dist_offset,
__global uchar *dst, int dst_step, int dst_offset,
float radius, int iterNum
)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < rows) || !(j >= 0 && j < cols))
return;
dst += mad24(i, dst_step, dst_offset);
isrc += mad24(i, isrc_step, isrc_offset);
src += mad24(i, src_step, src_offset);
int ioffset = mad24(iterNum*rows + i, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset));
bounds = PtrAddConst(bounds, ioffset, int);
int il = bounds[0];
int ir = bounds[1];
int toffset = mad24(iterNum*rows + i, tail_step, mad24(j, 2*(int)sizeof(float), tail_offset));
tail = PtrAddConst(tail, toffset, float);
float tl = tail[0];
float tr = tail[1];
dist = PtrAdd(dist, mad24(i, dist_step, dist_offset), float);
float dl = tl / dist[il-1];
float dr = tr / dist[ir];
SrcVec res;
res = getPix(isrc, ir) - getPix(isrc, il); //center part
res += 0.5f*tl * ( dl*getPix(src, il - 1) + (2.0f - dl)*getPix(src, il) ); //left tail
res += 0.5f*tr * ( dr*getPix(src, ir + 1) + (2.0f - dr)*getPix(src, ir) ); //right tail
res /= radius;
storePix(res, dst, j);
}
__kernel void filter_NC_hor_by_bounds(__global const uchar *isrc, int isrc_step, int isrc_offset,
__global const uchar *bounds, int bounds_step, int bounds_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols,
int iterNum)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < rows) || !(j >= 0 && j < cols))
return;
dst += mad24(i, dst_step, dst_offset);
isrc += mad24(i, isrc_step, isrc_offset);
int ioffset = mad24(iterNum*rows + i, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset));
__global const int *index = PtrAddConst(bounds, ioffset, int);
int li = index[0];
int hi = index[1];
SrcVec res = (getPix(isrc, hi+1) - getPix(isrc, li)) / (float)(hi - li + 1);
storePix(res, dst, j);
}
__kernel void filter_NC_hor(__global uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global const uchar *idist, int idist_step, int idist_offset,
__global uchar *dst, int dst_step, int dst_offset,
float radius)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < src_rows) || !(j >= 0 && j < src_cols))
return;
src += mad24(i, src_step, src_offset);
idist += mad24(i, idist_step, idist_offset);
dst += mad24(i, dst_step, dst_offset);
int low_bound = j;
int high_bound = j;
float cur_idt_val = getFloat(idist, j);
float low_idt_val = cur_idt_val - radius;
float high_idt_val = cur_idt_val + radius;
SrcVec sum = getPix(src, j);
while (low_bound > 0 && low_idt_val < getFloat(idist, low_bound-1))
{
low_bound--;
sum += getPix(src, low_bound);
}
while (getFloat(idist, high_bound + 1) < high_idt_val)
{
high_bound++;
sum += getPix(src, high_bound);
}
storePix(sum / (float)(high_bound - low_bound + 1), dst, j);
}
__kernel void filter_NC_vert(__global const uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global const uchar *idist, int idist_step, int idist_offset,
__global uchar *dst, int dst_step, int dst_offset,
float radius)
{
int j = get_global_id(0);
if (!(j >= 0 && j < src_cols))
return;
__global const uchar *src_col = src + mad24(j, (int)ELEM_SIZE, src_offset);
__global uchar *dst_col = dst + mad24(j, (int)ELEM_SIZE, dst_offset);
__global const uchar *idist_col = idist + mad24(j, (int)sizeof(float), idist_offset);
int low_bound = 0, high_bound = 0;
SrcVec sum = getPix(src_col, 0);
SrcVec res;
for (int i = 0; i < src_rows; i++)
{
float cur_idt_value = getFloat(idist_col + i*idist_step, 0);
float low_idt_value = cur_idt_value - radius;
float high_idt_value = cur_idt_value + radius;
while (getFloat(idist_col + low_bound*idist_step, 0) < low_idt_value)
{
sum -= getPix(src_col + low_bound*src_step, 0);
low_bound++;
}
while (getFloat(idist_col + (high_bound + 1)*idist_step, 0) < high_idt_value)
{
high_bound++;
sum += getPix(src_col + high_bound*src_step, 0);
}
res = sum / (float)(high_bound - low_bound + 1);
storePix(res, dst_col + i*dst_step, 0);
}
}
__kernel void filter_RF_vert(__global uchar *res, int res_step, int res_offset, int res_rows, int res_cols,
__global uchar *adist, int adist_step, int adist_offset,
int write_new_a_dist)
{
int j = get_global_id(0);
if (!(j >= 0 && j < res_cols))
return;
res += mad24(j, (int)ELEM_SIZE, res_offset);
adist += mad24(j, (int)sizeof(float), adist_offset);
SrcVec cur_val;
float ad;
int i;
cur_val = getPix(res + 0*res_step, 0);
for (i = 1; i < res_rows; i++)
{
ad = getFloat(adist + (i-1)*adist_step, 0);
cur_val = (1.0f - ad)*getPix(res + i*res_step, 0) + ad*cur_val;
storePix(cur_val, res + i*res_step, 0);
}
for (i = res_rows - 2; i >= 0; i--)
{
ad = getFloat(adist + i*adist_step, 0);
cur_val = (1.0f - ad)*getPix(res + i*res_step, 0) + ad*cur_val;
storePix(cur_val, res + i*res_step, 0);
if (write_new_a_dist)
storeFloat(ad*ad, adist + i*adist_step, 0);
}
}
#define genMinMaxPtrs(ptr, h) __global const uchar *ptr##_min = (__global const uchar *)ptr;\
__global const uchar *ptr##_max = (__global const uchar *)ptr + mad24(h, ptr##_step, ptr##_offset);
#define checkPtr(ptr, bound) (bound##_min <= (__global const uchar *)(ptr) && (__global const uchar *)(ptr) < bound##_max)
__kernel void filter_RF_block_init_fwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *adist, int adist_step, int adist_offset,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int bid = get_global_id(0);
int j = get_global_id(1);
if (j < 0 || j >= cols) return;
int startRow = max(1, bid*blockSize); //skip first row
int endRow = min(rows, (bid + 1)*blockSize);
res += mad24(j, ELEM_SIZE, mad24(startRow, res_step, res_offset));;
adist += mad24(j, (int) sizeof(float), mad24(startRow-1, adist_step, adist_offset));
weights += mad24(j, (int) sizeof(float), mad24(startRow, weights_step, weights_offset));
SrcVec cur_val = (startRow != 1) ? 0 : getPix(res - res_step, 0);
float weight0 = 1.0f;
for (int i = startRow; i < endRow; i++)
{
float ad = getFloat(adist, 0);
cur_val = (1.0f - ad)*getPix(res, 0) + ad*cur_val;
storePix(cur_val, res, 0);
weight0 *= ad;
storeFloat(weight0, weights, 0);
res += res_step;
adist += adist_step;
weights += weights_step;
}
}
__kernel void filter_RF_block_fill_borders_fwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int j = get_global_id(0);
if (j < 0 || j >= cols) return;
int startRow = 2*blockSize - 1;
res += mad24(j, ELEM_SIZE, mad24(startRow, res_step, res_offset));
weights += mad24(j, (int) sizeof(float), mad24(startRow, weights_step, weights_offset));
int res_step_block = blockSize*res_step;
int weights_step_block = blockSize*weights_step;
SrcVec prev_pix = getPix(res - res_step_block, 0);
for (int i = startRow; i < rows; i += blockSize)
{
prev_pix = getPix(res, 0) + getFloat(weights, 0)*prev_pix;
storePix(prev_pix, res, 0);
res += res_step_block;
weights += weights_step_block;
}
}
__kernel void filter_RF_block_fill_fwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int i = get_global_id(0) + blockSize;
int j = get_global_id(1);
if (j < 0 || j >= cols || i < 0 || i >= rows) return;
if (i % blockSize == blockSize - 1) return; //to avoid rewriting bound pixels
int bid = i / blockSize;
int ref_row_id = bid*blockSize - 1;
__global uchar *res_ref_row = res + mad24(ref_row_id, res_step, res_offset);
__global uchar *res_dst_row = res + mad24(i, res_step, res_offset);
__global float *weights_row = PtrAdd(weights, mad24(i, weights_step, weights_offset), float);
storePix(getPix(res_dst_row, j) + weights_row[j]*getPix(res_ref_row, j), res_dst_row, j);
}
__kernel void filter_RF_block_init_bwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *adist, int adist_step, int adist_offset,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int bid = get_global_id(0);
int j = get_global_id(1);
if (j < 0 || j >= cols) return;
int startRow = rows-1 - max(1, bid*blockSize);
int endRow = max(-1, rows-1 - (bid+1)*blockSize);
res += mad24(j, ELEM_SIZE, mad24(startRow, res_step, res_offset));;
adist += mad24(j, (int) sizeof(float), mad24(startRow, adist_step, adist_offset));
weights += mad24(j, (int) sizeof(float), mad24(startRow, weights_step, weights_offset));
SrcVec cur_val = (startRow != rows-2) ? 0 : getPix(res + res_step, 0);
float weight0 = 1.0f;
for (int i = startRow; i > endRow; i--)
{
float ad = getFloat(adist, 0);
cur_val = (1.0f - ad)*getPix(res, 0) + ad*cur_val;
storePix(cur_val, res, 0);
weight0 *= ad;
storeFloat(weight0, weights, 0);
res -= res_step;
adist -= adist_step;
weights -= weights_step;
}
}
__kernel void filter_RF_block_fill_borders_bwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int j = get_global_id(0);
if (j < 0 || j >= cols) return;
int startRow = rows-1 - (2*blockSize - 1);
res += mad24(j, ELEM_SIZE, mad24(startRow, res_step, res_offset));
weights += mad24(j, (int) sizeof(float), mad24(startRow, weights_step, weights_offset));
int res_step_block = blockSize*res_step;
int weights_step_block = blockSize*weights_step;
SrcVec prev_pix = getPix(res + res_step_block, 0);
for (int i = startRow; i >= 0; i -= blockSize)
{
prev_pix = getPix(res, 0) + getFloat(weights, 0)*prev_pix;
storePix(prev_pix, res, 0);
res -= blockSize*res_step;
weights -= blockSize*weights_step;
}
}
__kernel void filter_RF_block_fill_bwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int i = get_global_id(0) + blockSize;
int j = get_global_id(1);
if (j < 0 || j >= cols || i < 0 || i >= rows) return;
if (i % blockSize == blockSize - 1) return; //to avoid rewriting bound pixels
int bid = i / blockSize;
int ref_row_id = rows-1 - (bid*blockSize - 1);
int dst_row_id = rows-1 - i;
__global uchar *res_ref_row = res + mad24(ref_row_id, res_step, res_offset);
__global uchar *res_dst_row = res + mad24(dst_row_id, res_step, res_offset);
__global float *weights_row = PtrAdd(weights, mad24(dst_row_id, weights_step, weights_offset), float);
storePix(getPix(res_dst_row, j) + weights_row[j]*getPix(res_ref_row, j), res_dst_row, j);
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment