/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. // // // License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors // Niko Li, newlife20080214@gmail.com // Jia Haipeng, jiahaipeng95@gmail.com // Zero Lin, Zero.Lin@amd.com // Zhang Ying, zhangying913@gmail.com // Yao Wang, bitwangyaoyao@gmail.com // Harris Gasparakis, harris.gasparakis@amd.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // * Redistribution's of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and // any express or implied warranties, including, but not limited to, the implied // warranties of merchantability and fitness for a particular purpose are disclaimed. // In no event shall the Intel Corporation or contributors be liable for any direct, // indirect, incidental, special, exemplary, or consequential damages // (including, but not limited to, procurement of substitute goods or services; // loss of use, data, or profits; or business interruption) however caused // and on any theory of liability, whether in contract, strict liability, // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. // //M*/ #include "precomp.hpp" #include "opencl_kernels.hpp" using namespace cv; using namespace cv::ocl; namespace { inline void normalizeAnchor(int &anchor, int ksize) { if (anchor < 0) anchor = ksize >> 1; CV_Assert(0 <= anchor && anchor < ksize); } inline void normalizeAnchor(Point &anchor, const Size &ksize) { normalizeAnchor(anchor.x, ksize.width); normalizeAnchor(anchor.y, ksize.height); } inline void normalizeROI(Rect &roi, const Size &ksize, const Point &/*anchor*/, const Size &src_size) { if (roi == Rect(0, 0, -1, -1)) roi = Rect(0, 0, src_size.width, src_size.height); CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height); } } //////////////////////////////////////////////////////////////////////////////////////////////////// // Filter2D namespace { class Filter2DEngine_GPU : public FilterEngine_GPU { public: Filter2DEngine_GPU(const Ptr<BaseFilter_GPU> &filter2D_) : filter2D(filter2D_) {} virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) { Size src_size = src.size(); // Delete those two clause below which exist before, However, the result is also correct // dst.create(src_size, src.type()); // dst = Scalar(0.0); normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size); oclMat srcROI = src(roi); oclMat dstROI = dst(roi); (*filter2D)(srcROI, dstROI); } Ptr<BaseFilter_GPU> filter2D; }; } Ptr<FilterEngine_GPU> cv::ocl::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D) { return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D)); } //////////////////////////////////////////////////////////////////////////////////////////////////// // Box Filter namespace { typedef void (*FilterBox_t)(const oclMat & , oclMat & , Size &, const Point, const int); class GPUBoxFilter : public BaseFilter_GPU { public: GPUBoxFilter(const Size &ksize_, const Point &anchor_, const int borderType_, FilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_, borderType_), func(func_) {} virtual void operator()(const oclMat &src, oclMat &dst) { func(src, dst, ksize, anchor, borderType); } FilterBox_t func; }; } //////////////////////////////////////////////////////////////////////////////////////////////////// // Morphology Filter namespace { typedef void (*GPUMorfFilter_t)(const oclMat & , oclMat & , oclMat & , Size &, const Point, bool rectKernel); class MorphFilter_GPU : public BaseFilter_GPU { public: MorphFilter_GPU(const Size &ksize_, const Point &anchor_, const Mat &kernel_, GPUMorfFilter_t func_) : BaseFilter_GPU(ksize_, anchor_, BORDER_CONSTANT), kernel(kernel_), func(func_), rectKernel(false) {} virtual void operator()(const oclMat &src, oclMat &dst) { func(src, dst, kernel, ksize, anchor, rectKernel) ; } oclMat kernel; GPUMorfFilter_t func; bool rectKernel; }; } /* **We should be able to support any data types here. **Extend this if necessary later. **Note that the kernel need to be further refined. */ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, const Point anchor, bool rectKernel) { //Normalize the result by default //float alpha = ksize.height * ksize.width; CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); CV_Assert((src.oclchannels() == dst.oclchannels())); int srcStep = src.step / src.elemSize(); int dstStep = dst.step / dst.elemSize(); int srcOffset = src.offset / src.elemSize(); int dstOffset = dst.offset / dst.elemSize(); int srcOffset_x = srcOffset % srcStep; int srcOffset_y = srcOffset / srcStep; Context *clCxt = src.clCxt; string kernelName; #ifdef ANDROID size_t localThreads[3] = {16, 8, 1}; #else size_t localThreads[3] = {16, 16, 1}; #endif size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1}; if (src.type() == CV_8UC1) { kernelName = "morph_C1_D0"; globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } else { kernelName = "morph"; CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } char s[64]; switch (src.type()) { case CV_8UC1: sprintf(s, "-D VAL=255"); break; case CV_8UC3: case CV_8UC4: sprintf(s, "-D VAL=255 -D GENTYPE=uchar4"); break; case CV_32FC1: sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float"); break; case CV_32FC3: case CV_32FC4: sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float4"); break; default: CV_Error(CV_StsUnsupportedFormat, "unsupported type"); } char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s", anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], s, rectKernel?"-D RECTKERNEL":""); vector< pair<size_t, const void *> > args; args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x)); args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep)); args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset)); openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } //! data type supported: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4 static void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, const Point anchor, bool rectKernel) { //Normalize the result by default //float alpha = ksize.height * ksize.width; CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); CV_Assert((src.oclchannels() == dst.oclchannels())); int srcStep = src.step1() / src.oclchannels(); int dstStep = dst.step1() / dst.oclchannels(); int srcOffset = src.offset / src.elemSize(); int dstOffset = dst.offset / dst.elemSize(); int srcOffset_x = srcOffset % srcStep; int srcOffset_y = srcOffset / srcStep; Context *clCxt = src.clCxt; string kernelName; #ifdef ANDROID size_t localThreads[3] = {16, 10, 1}; #else size_t localThreads[3] = {16, 16, 1}; #endif size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1}; if (src.type() == CV_8UC1) { kernelName = "morph_C1_D0"; globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } else { kernelName = "morph"; CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } char s[64]; switch (src.type()) { case CV_8UC1: sprintf(s, "-D VAL=0"); break; case CV_8UC3: case CV_8UC4: sprintf(s, "-D VAL=0 -D GENTYPE=uchar4"); break; case CV_32FC1: sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float"); break; case CV_32FC3: case CV_32FC4: sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float4"); break; default: CV_Error(CV_StsUnsupportedFormat, "unsupported type"); } char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s", anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], s, rectKernel?"-D RECTKERNEL":""); vector< pair<size_t, const void *> > args; args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x)); args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep)); args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset)); openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &_kernel, const Size &ksize, Point anchor) { CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE); CV_Assert(type == CV_8UC1 || type == CV_8UC3 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC3 || type == CV_32FC4); normalizeAnchor(anchor, ksize); Mat kernel8U; _kernel.convertTo(kernel8U, CV_8U); Mat kernel = kernel8U.reshape(1, 1); bool noZero = true; for(int i = 0; i < kernel.rows * kernel.cols; ++i) if(kernel.at<uchar>(i) != 1) noZero = false; MorphFilter_GPU* mfgpu = new MorphFilter_GPU(ksize, anchor, kernel, op == MORPH_ERODE ? GPUErode : GPUDilate); if(noZero) mfgpu->rectKernel = true; return Ptr<BaseFilter_GPU>(mfgpu); } namespace { class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU { public: MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU> &filter2D_, int iters_) : Filter2DEngine_GPU(filter2D_), iters(iters_) {} virtual void apply(const oclMat &src, oclMat &dst) { Filter2DEngine_GPU::apply(src, dst); for (int i = 1; i < iters; ++i) { Size wholesize; Point ofs; dst.locateROI(wholesize, ofs); int rows = dst.rows, cols = dst.cols; dst.adjustROI(ofs.y, -ofs.y - rows + dst.wholerows, ofs.x, -ofs.x - cols + dst.wholecols); dst.copyTo(morfBuf); dst.adjustROI(-ofs.y, ofs.y + rows - dst.wholerows, -ofs.x, ofs.x + cols - dst.wholecols); morfBuf.adjustROI(-ofs.y, ofs.y + rows - dst.wholerows, -ofs.x, ofs.x + cols - dst.wholecols); Filter2DEngine_GPU::apply(morfBuf, dst); } } int iters; oclMat morfBuf; }; } Ptr<FilterEngine_GPU> cv::ocl::createMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Point &anchor, int iterations) { CV_Assert(iterations > 0); Size ksize = kernel.size(); Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor); return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, iterations)); } namespace { void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue) { if ((borderType != cv::BORDER_CONSTANT) || (borderValue != morphologyDefaultBorderValue())) { CV_Error(CV_StsBadArg, "unsupported border type"); } Mat kernel; Size ksize = _kernel.data ? _kernel.size() : Size(3, 3); normalizeAnchor(anchor, ksize); if (iterations == 0 || _kernel.rows *_kernel.cols == 1) { src.copyTo(dst); return; } dst.create(src.size(), src.type()); if (!_kernel.data) { kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2)); anchor = Point(iterations, iterations); iterations = 1; } else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols) { anchor = Point(anchor.x * iterations, anchor.y * iterations); kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + (iterations - 1) * (ksize.width - 1), ksize.height + (iterations - 1) * (ksize.height - 1)), anchor); iterations = 1; } else kernel = _kernel; Ptr<MorphologyFilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); f->apply(src, dst); } } void cv::ocl::erode(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue) { bool allZero = true; for (int i = 0; i < kernel.rows * kernel.cols; ++i) if (kernel.data[i] != 0) allZero = false; if (allZero) kernel.data[0] = 1; morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue); } void cv::ocl::dilate(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue) { morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue); } void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue) { oclMat temp; switch (op) { case MORPH_ERODE: erode(src, dst, kernel, anchor, iterations, borderType, borderValue); break; case MORPH_DILATE: dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); break; case MORPH_OPEN: erode(src, temp, kernel, anchor, iterations, borderType, borderValue); dilate(temp, dst, kernel, anchor, iterations, borderType, borderValue); break; case CV_MOP_CLOSE: dilate(src, temp, kernel, anchor, iterations, borderType, borderValue); erode(temp, dst, kernel, anchor, iterations, borderType, borderValue); break; case CV_MOP_GRADIENT: erode(src, temp, kernel, anchor, iterations, borderType, borderValue); dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); subtract(dst, temp, dst); break; case CV_MOP_TOPHAT: erode(src, dst, kernel, anchor, iterations, borderType, borderValue); dilate(dst, temp, kernel, anchor, iterations, borderType, borderValue); subtract(src, temp, dst); break; case CV_MOP_BLACKHAT: dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); erode(dst, temp, kernel, anchor, iterations, borderType, borderValue); subtract(temp, src, dst); break; default: CV_Error(CV_StsBadArg, "unknown morphological operation"); } } //////////////////////////////////////////////////////////////////////////////////////////////////// // Linear Filter namespace { typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , const Mat & , const Size &, const Point&, const int); class LinearFilter_GPU : public BaseFilter_GPU { public: LinearFilter_GPU(const Size &ksize_, const Point &anchor_, const Mat &kernel_, GPUFilter2D_t func_, int borderType_) : BaseFilter_GPU(ksize_, anchor_, borderType_), kernel(kernel_), func(func_) {} virtual void operator()(const oclMat &src, oclMat &dst) { func(src, dst, kernel, ksize, anchor, borderType) ; } Mat kernel; GPUFilter2D_t func; }; } // prepare kernel: transpose and make double rows (+align). Returns size of aligned row // Samples: // a b c // Input: d e f // g h i // Output, last two zeros is the alignment: // a d g a d g 0 0 // b e h b e h 0 0 // c f i c f i 0 0 template <typename T> static int _prepareKernelFilter2D(std::vector<T>& data, const Mat &kernel) { Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value); int size_y_aligned = roundUp(kernel.rows * 2, 4); data.clear(); data.resize(size_y_aligned * kernel.cols, 0); for (int x = 0; x < kernel.cols; x++) { for (int y = 0; y < kernel.rows; y++) { data[x * size_y_aligned + y] = _kernel.at<T>(y, x); data[x * size_y_aligned + y + kernel.rows] = _kernel.at<T>(y, x); } } return size_y_aligned; } static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel, const Size &ksize, const Point& anchor, const int borderType) { CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); CV_Assert(src.oclchannels() == dst.oclchannels()); CV_Assert(kernel.cols == ksize.width && kernel.rows == ksize.height); CV_Assert(kernel.channels() == 1); CV_Assert(anchor.x >= 0 && anchor.x < kernel.cols); CV_Assert(anchor.y >= 0 && anchor.y < kernel.rows); bool useDouble = src.depth() == CV_64F; std::vector<float> kernelDataFloat; std::vector<double> kernelDataDouble; int kernel_size_y2_aligned = useDouble ? _prepareKernelFilter2D<double>(kernelDataDouble, kernel) : _prepareKernelFilter2D<float>(kernelDataFloat, kernel); oclMat oclKernelParameter; if (useDouble) { oclKernelParameter.createEx(1, kernelDataDouble.size(), CV_64FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT); openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataDouble.size()*sizeof(double), &kernelDataDouble[0], kernelDataDouble.size()*sizeof(double), kernelDataDouble.size()*sizeof(double), 1, clMemcpyHostToDevice); } else { oclKernelParameter.createEx(1, kernelDataFloat.size(), CV_32FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT); openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataFloat.size()*sizeof(float), &kernelDataFloat[0], kernelDataFloat.size()*sizeof(float), kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice); } size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; do { size_t BLOCK_SIZE = tryWorkItems; while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) BLOCK_SIZE /= 2; #if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices size_t BLOCK_SIZE_Y = 1; #else size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) BLOCK_SIZE_Y *= 2; #endif CV_Assert((size_t)ksize.width <= BLOCK_SIZE); bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; vector<pair<size_t , const void *> > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); cl_uint stepBytes = src.step; args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); int offsetXBytes = src.offset % src.step; int offsetX = offsetXBytes / src.elemSize(); CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); int offsetY = src.offset / src.step; int endX = (offsetX + src.cols); int endY = (offsetY + src.rows); cl_int rect[4] = {offsetX, offsetY, endX, endY}; if (!isIsolatedBorder) { rect[2] = src.wholecols; rect[3] = src.wholerows; } args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); cl_uint _stepBytes = dst.step; args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); int _offsetXBytes = dst.offset % dst.step; int _offsetX = _offsetXBytes / dst.elemSize(); CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); int _offsetY = dst.offset / dst.step; int _endX = (_offsetX + dst.cols); int _endY = (_offsetY + dst.rows); cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) { if (useDouble) args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); else args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); } args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data)); const char* btype = NULL; switch (borderType & ~BORDER_ISOLATED) { case BORDER_CONSTANT: btype = "BORDER_CONSTANT"; break; case BORDER_REPLICATE: btype = "BORDER_REPLICATE"; break; case BORDER_REFLECT: btype = "BORDER_REFLECT"; break; case BORDER_WRAP: CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); return; case BORDER_REFLECT101: btype = "BORDER_REFLECT_101"; break; } int requiredTop = anchor.y; int requiredLeft = BLOCK_SIZE; // not this: anchor.x; int requiredBottom = ksize.height - 1 - anchor.y; int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; int h = isIsolatedBorder ? src.rows : src.wholerows; int w = isIsolatedBorder ? src.cols : src.wholecols; bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; char build_options[1024]; sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d " "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d " "-D %s -D %s -D %s", (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, src.depth(), src.oclchannels(), useDouble ? 1 : 0, anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned, btype, extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); size_t lt[3] = {BLOCK_SIZE, 1, 1}; size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options); size_t kernelWorkGroupSize; openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); if (lt[0] > kernelWorkGroupSize) { clReleaseKernel(kernel); CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); tryWorkItems = kernelWorkGroupSize; continue; } openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here } while (false); } Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize, const Point &anchor, int borderType) { Point norm_archor = anchor; normalizeAnchor(norm_archor, ksize); return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, norm_archor, kernel, GPUFilter2D, borderType)); } Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor, int borderType) { Size ksize = kernel.size(); // TODO remove duplicated parameter Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor, borderType); return createFilter2D_GPU(linearFilter); } void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, double delta, int borderType) { CV_Assert(delta == 0); if (ddepth < 0) ddepth = src.depth(); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst.type(), kernel, anchor, borderType); f->apply(src, dst); } const int optimizedSepFilterLocalSize = 16; static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst, const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT) { size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1}; size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1}; unsigned int src_pitch = src.step; unsigned int dst_pitch = dst.step; int src_offset_x = (src.offset % src.step) / src.elemSize(); int src_offset_y = src.offset / src.step; std::vector<std::pair<size_t , const void *> > args; args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y )); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset )); args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows )); string option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1], row_kernel.rows / 2, col_kernel.rows / 2 ); option += " -D KERNEL_MATRIX_X="; for(int i=0; i<row_kernel.rows; i++) option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &row_kernel.at<float>(i) ) ); option += "0x0"; option += " -D KERNEL_MATRIX_Y="; for(int i=0; i<col_kernel.rows; i++) option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &col_kernel.at<float>(i) ) ); option += "0x0"; switch(src.type()) { case CV_8UC1: option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float"; break; case CV_32FC1: option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float"; break; case CV_8UC2: option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2"; break; case CV_32FC2: option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2"; break; case CV_8UC3: option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3"; break; case CV_32FC3: option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3"; break; case CV_8UC4: option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4"; break; case CV_32FC4: option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4"; break; default: CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); break; } switch(dst.type()) { case CV_8UC1: option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat"; break; case CV_8UC2: option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat"; break; case CV_8UC3: option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat"; break; case CV_8UC4: option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat"; break; case CV_32FC1: option += " -D DSTTYPE=float -D CONVERT_DSTTYPE="; break; case CV_32FC2: option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE="; break; case CV_32FC3: option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE="; break; case CV_32FC4: option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE="; break; default: CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); break; } switch(bordertype) { case cv::BORDER_CONSTANT: option += " -D BORDER_CONSTANT"; break; case cv::BORDER_REPLICATE: option += " -D BORDER_REPLICATE"; break; case cv::BORDER_REFLECT: option += " -D BORDER_REFLECT"; break; case cv::BORDER_REFLECT101: option += " -D BORDER_REFLECT_101"; break; case cv::BORDER_WRAP: option += " -D BORDER_WRAP"; break; default: CV_Error(CV_StsBadFlag, "BORDER type is not supported!"); break; } openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args, -1, -1, option.c_str() ); } //////////////////////////////////////////////////////////////////////////////////////////////////// // SeparableFilter namespace { class SeparableFilterEngine_GPU : public FilterEngine_GPU { public: SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter_, const Ptr<BaseColumnFilter_GPU> &columnFilter_) : rowFilter(rowFilter_), columnFilter(columnFilter_) { ksize = Size(rowFilter->ksize, columnFilter->ksize); anchor = Point(rowFilter->anchor, columnFilter->anchor); } virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) { Size src_size = src.size(); int cn = src.oclchannels(); dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn)); normalizeROI(roi, ksize, anchor, src_size); srcROI = src(roi); dstROI = dst(roi); (*rowFilter)(srcROI, dstBuf); (*columnFilter)(dstBuf, dstROI); } Ptr<BaseRowFilter_GPU> rowFilter; Ptr<BaseColumnFilter_GPU> columnFilter; Size ksize; Point anchor; oclMat dstBuf; oclMat srcROI; oclMat dstROI; oclMat dstBufROI; }; } Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter, const Ptr<BaseColumnFilter_GPU> &columnFilter) { return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter)); } namespace { class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU { public: SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype ) { bordertype = btype; rowKernel = rowKernel_; columnKernel = columnKernel_; } virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) { normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size()); oclMat srcROI = src(roi); oclMat dstROI = dst(roi); sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype); } Mat rowKernel; Mat columnKernel; int bordertype; }; } static void GPUFilterBox(const oclMat &src, oclMat &dst, Size &ksize, const Point anchor, const int borderType) { //Normalize the result by default float alpha = 1.0f / (ksize.height * ksize.width); CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); CV_Assert(src.oclchannels() == dst.oclchannels()); size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; do { size_t BLOCK_SIZE = tryWorkItems; while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) BLOCK_SIZE /= 2; size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) BLOCK_SIZE_Y *= 2; CV_Assert((size_t)ksize.width <= BLOCK_SIZE); bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; vector<pair<size_t , const void *> > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); cl_uint stepBytes = src.step; args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); int offsetXBytes = src.offset % src.step; int offsetX = offsetXBytes / src.elemSize(); CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); int offsetY = src.offset / src.step; int endX = (offsetX + src.cols); int endY = (offsetY + src.rows); cl_int rect[4] = {offsetX, offsetY, endX, endY}; if (!isIsolatedBorder) { rect[2] = src.wholecols; rect[3] = src.wholerows; } args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); cl_uint _stepBytes = dst.step; args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); int _offsetXBytes = dst.offset % dst.step; int _offsetX = _offsetXBytes / dst.elemSize(); CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); int _offsetY = dst.offset / dst.step; int _endX = (_offsetX + dst.cols); int _endY = (_offsetY + dst.rows); cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); bool useDouble = src.depth() == CV_64F; float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) { if (useDouble) args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); else args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); } double alphaDouble = alpha; // DON'T move into 'if' body if (useDouble) args.push_back( make_pair( sizeof(double), (void *)&alphaDouble)); else args.push_back( make_pair( sizeof(float), (void *)&alpha)); const char* btype = NULL; switch (borderType & ~BORDER_ISOLATED) { case BORDER_CONSTANT: btype = "BORDER_CONSTANT"; break; case BORDER_REPLICATE: btype = "BORDER_REPLICATE"; break; case BORDER_REFLECT: btype = "BORDER_REFLECT"; break; case BORDER_WRAP: CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); return; case BORDER_REFLECT101: btype = "BORDER_REFLECT_101"; break; } int requiredTop = anchor.y; int requiredLeft = BLOCK_SIZE; // not this: anchor.x; int requiredBottom = ksize.height - 1 - anchor.y; int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; int h = isIsolatedBorder ? src.rows : src.wholerows; int w = isIsolatedBorder ? src.cols : src.wholecols; bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well char build_options[1024]; sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s", (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, src.depth(), src.oclchannels(), useDouble ? 1 : 0, anchor.x, anchor.y, ksize.width, ksize.height, btype, extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); size_t lt[3] = {BLOCK_SIZE, 1, 1}; size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options); size_t kernelWorkGroupSize; openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); if (lt[0] > kernelWorkGroupSize) { clReleaseKernel(kernel); CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); tryWorkItems = kernelWorkGroupSize; continue; } openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here } while (false); } Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/, const Size &ksize, Point anchor, int borderType) { normalizeAnchor(anchor, ksize); return Ptr<BaseFilter_GPU>(new GPUBoxFilter(ksize, anchor, borderType, GPUFilterBox)); } Ptr<FilterEngine_GPU> cv::ocl::createBoxFilter_GPU(int srcType, int dstType, const Size &ksize, const Point &anchor, int borderType) { Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor, borderType); return createFilter2D_GPU(boxFilter); } void cv::ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor, int borderType) { int sdepth = src.depth(), cn = src.channels(); if (ddepth < 0) { ddepth = sdepth; } dst.create(src.size(), CV_MAKETYPE(ddepth, cn)); Ptr<FilterEngine_GPU> f = createBoxFilter_GPU(src.type(), dst.type(), ksize, anchor, borderType); f->apply(src, dst); } namespace { typedef void (*gpuFilter1D_t)(const oclMat &src, const oclMat &dst, oclMat kernel, int ksize, int anchor, int bordertype); class GpuLinearRowFilter : public BaseRowFilter_GPU { public: GpuLinearRowFilter(int ksize_, int anchor_, const oclMat &kernel_, gpuFilter1D_t func_, int bordertype_) : BaseRowFilter_GPU(ksize_, anchor_, bordertype_), kernel(kernel_), func(func_) {} virtual void operator()(const oclMat &src, oclMat &dst) { func(src, dst, kernel, ksize, anchor, bordertype); } oclMat kernel; gpuFilter1D_t func; }; } template <typename T> struct index_and_sizeof; template <> struct index_and_sizeof<uchar> { enum { index = 1 }; }; template <> struct index_and_sizeof<char> { enum { index = 2 }; }; template <> struct index_and_sizeof<ushort> { enum { index = 3 }; }; template <> struct index_and_sizeof<short> { enum { index = 4 }; }; template <> struct index_and_sizeof<int> { enum { index = 5 }; }; template <> struct index_and_sizeof<float> { enum { index = 6 }; }; template <typename T> void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel, int ksize, int anchor, int bordertype) { CV_Assert(bordertype <= BORDER_REFLECT_101); CV_Assert(ksize == (anchor << 1) + 1); int channels = src.oclchannels(); #ifdef ANDROID size_t localThreads[3] = { 16, 10, 1 }; #else size_t localThreads[3] = { 16, 16, 1 }; #endif size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }; std::string buildOptions = format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", anchor, (int)localThreads[0], (int)localThreads[1], channels, borderMap[bordertype]); if (src.depth() == CV_8U) { switch (channels) { case 1: globalThreads[0] = (dst.cols + 3) >> 2; break; case 2: globalThreads[0] = (dst.cols + 1) >> 1; break; case 4: globalThreads[0] = dst.cols; break; } } int src_pix_per_row = src.step / src.elemSize(); int src_offset_x = (src.offset % src.step) / src.elemSize(); int src_offset_y = src.offset / src.step; int dst_pix_per_row = dst.step / dst.elemSize(); int ridusy = (dst.rows - src.rows) >> 1; vector<pair<size_t , const void *> > args; args.push_back(make_pair(sizeof(cl_mem), &src.data)); args.push_back(make_pair(sizeof(cl_mem), &dst.data)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); args.push_back(make_pair(sizeof(cl_int), (void *)&src_pix_per_row)); args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_x)); args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_y)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst_pix_per_row)); args.push_back(make_pair(sizeof(cl_int), (void *)&ridusy)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); openCLExecuteKernel(src.clCxt, &filter_sep_row, "row_filter", globalThreads, localThreads, args, channels, src.depth(), buildOptions.c_str()); } Ptr<BaseRowFilter_GPU> cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufType*/, const Mat &rowKernel, int anchor, int bordertype) { static const gpuFilter1D_t gpuFilter1D_callers[6] = { linearRowFilter_gpu<uchar>, linearRowFilter_gpu<char>, linearRowFilter_gpu<ushort>, linearRowFilter_gpu<short>, linearRowFilter_gpu<int>, linearRowFilter_gpu<float> }; Mat temp = rowKernel.reshape(1, 1); oclMat mat_kernel(temp); int ksize = temp.cols; //CV_Assert(ksize < 16); normalizeAnchor(anchor, ksize); return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, mat_kernel, gpuFilter1D_callers[CV_MAT_DEPTH(srcType)], bordertype)); } namespace { class GpuLinearColumnFilter : public BaseColumnFilter_GPU { public: GpuLinearColumnFilter(int ksize_, int anchor_, const oclMat &kernel_, gpuFilter1D_t func_, int bordertype_) : BaseColumnFilter_GPU(ksize_, anchor_, bordertype_), kernel(kernel_), func(func_) {} virtual void operator()(const oclMat &src, oclMat &dst) { func(src, dst, kernel, ksize, anchor, bordertype); } oclMat kernel; gpuFilter1D_t func; }; } template <typename T> void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel, int ksize, int anchor, int bordertype) { Context *clCxt = src.clCxt; int channels = src.oclchannels(); #ifdef ANDROID size_t localThreads[3] = {16, 10, 1}; #else size_t localThreads[3] = {16, 16, 1}; #endif string kernelName = "col_filter"; char btype[30]; switch (bordertype) { case 0: sprintf(btype, "BORDER_CONSTANT"); break; case 1: sprintf(btype, "BORDER_REPLICATE"); break; case 2: sprintf(btype, "BORDER_REFLECT"); break; case 3: sprintf(btype, "BORDER_WRAP"); break; case 4: sprintf(btype, "BORDER_REFLECT_101"); break; } char compile_option[256]; size_t globalThreads[3]; globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2]; if (dst.depth() == CV_8U) { switch (channels) { case 1: globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float", "uchar", "convert_uchar_sat"); break; case 2: globalThreads[0] = ((dst.cols + 1) / 2 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float2", "uchar2", "convert_uchar2_sat"); break; case 3: case 4: globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float4", "uchar4", "convert_uchar4_sat"); break; } } else { globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; switch (dst.type()) { case CV_32SC1: sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float", "int", "convert_int_sat"); break; case CV_32SC3: case CV_32SC4: sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float4", "int4", "convert_int4_sat"); break; case CV_32FC1: sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float", "float", ""); break; case CV_32FC3: case CV_32FC4: sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float4", "float4", ""); break; } } //sanity checks CV_Assert(clCxt == dst.clCxt); CV_Assert(src.cols == dst.cols); CV_Assert(src.oclchannels() == dst.oclchannels()); CV_Assert(ksize == (anchor << 1) + 1); int src_pix_per_row, dst_pix_per_row; int dst_offset_in_pixel; src_pix_per_row = src.step / src.elemSize(); dst_pix_per_row = dst.step / dst.elemSize(); dst_offset_in_pixel = dst.offset / dst.elemSize(); vector<pair<size_t , const void *> > args; args.push_back(make_pair(sizeof(cl_mem), &src.data)); args.push_back(make_pair(sizeof(cl_mem), &dst.data)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); args.push_back(make_pair(sizeof(cl_int), (void *)&src_pix_per_row)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst_pix_per_row)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); openCLExecuteKernel(clCxt, &filter_sep_col, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, int dstType, const Mat &columnKernel, int anchor, int bordertype, double /*delta*/) { static const gpuFilter1D_t gpuFilter1D_callers[6] = { linearColumnFilter_gpu<uchar>, linearColumnFilter_gpu<char>, linearColumnFilter_gpu<ushort>, linearColumnFilter_gpu<short>, linearColumnFilter_gpu<int>, linearColumnFilter_gpu<float> }; Mat temp = columnKernel.reshape(1, 1); oclMat mat_kernel(temp); int ksize = temp.cols; normalizeAnchor(anchor, ksize); return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, mat_kernel, gpuFilter1D_callers[CV_MAT_DEPTH(dstType)], bordertype)); } Ptr<FilterEngine_GPU> cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize ) { int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType); int bdepth = std::max(std::max(sdepth, ddepth), CV_32F); int bufType = CV_MAKETYPE(bdepth, cn); Context* clCxt = Context::getContext(); //if image size is non-degenerate and large enough //and if filter support is reasonable to satisfy larger local memory requirements, //then we can use single pass routine to avoid extra runtime calls overhead if( clCxt && clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && rowKernel.rows <= 21 && columnKernel.rows <= 21 && (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 && imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) && imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) ) { return Ptr<FilterEngine_GPU>(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype)); } else { Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); return createSeparableFilter_GPU(rowFilter, columnFilter); } } void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype) { if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi { if ((bordertype & cv::BORDER_ISOLATED) != 0) { bordertype &= ~cv::BORDER_ISOLATED; if ((bordertype != cv::BORDER_CONSTANT) && (bordertype != cv::BORDER_REPLICATE)) { CV_Error(CV_StsBadArg, "unsupported border type"); } } } if (ddepth < 0) ddepth = src.depth(); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size()); f->apply(src, dst); } Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize ) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1, -1), 0, borderType, imgSize); } //////////////////////////////////////////////////////////////////////////////////////////////////// // Deriv Filter void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize, double scale, double delta, int borderType) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); if (scale != 1) { // usually the smoothing part is the slowest to compute, // so try to scale it instead of the faster differenciating part if (dx == 0) kx *= scale; else ky *= scale; } sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType); } void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale, double delta , int bordertype) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F); if (scale != 1) { // usually the smoothing part is the slowest to compute, // so try to scale it instead of the faster differenciating part if (dx == 0) kx *= scale; else ky *= scale; } sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype); } void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale, double delta, int borderType) { CV_Assert(delta == 0); if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } CV_Assert(ksize == 1 || ksize == 3); double K[2][9] = { {0, 1, 0, 1, -4, 1, 0, 1, 0}, {2, 0, 2, 0, -8, 0, 2, 0, 2} }; Mat kernel(3, 3, CV_64F, (void *)K[ksize == 3 ? 1 : 0]); if (scale != 1) kernel *= scale; filter2D(src, dst, ddepth, kernel, Point(-1, -1), 0, borderType); } //////////////////////////////////////////////////////////////////////////////////////////////////// // Gaussian Filter Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize) { int depth = CV_MAT_DEPTH(type); if (sigma2 <= 0) sigma2 = sigma1; // automatic detection of kernel size from sigma if (ksize.width <= 0 && sigma1 > 0) ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; if (ksize.height <= 0 && sigma2 > 0) ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1); sigma1 = std::max(sigma1, 0.0); sigma2 = std::max(sigma2, 0.0); Mat kx = getGaussianKernel(ksize.width, sigma1, std::max(depth, CV_32F)); Mat ky; if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON) ky = kx; else ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F)); return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize); } void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype) { if (bordertype != BORDER_CONSTANT) { if (src.rows == 1) ksize.height = 1; if (src.cols == 1) ksize.width = 1; } if (ksize.width == 1 && ksize.height == 1) { src.copyTo(dst); return; } if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi { if ((bordertype & cv::BORDER_ISOLATED) != 0) { bordertype &= ~cv::BORDER_ISOLATED; if ((bordertype != cv::BORDER_CONSTANT) && (bordertype != cv::BORDER_REPLICATE)) { CV_Error(CV_StsBadArg, "unsupported border type"); } } } dst.create(src.size(), src.type()); Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size()); f->apply(src, dst); } //////////////////////////////////////////////////////////////////////////////////////////////////// // Adaptive Bilateral Filter void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, double maxSigmaColor, Point anchor, int borderType) { CV_Assert((ksize.width & 1) && (ksize.height & 1)); // ksize must be odd CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3); // source must be 8bit RGB image if( sigmaSpace <= 0 ) sigmaSpace = 1; Mat lut(Size(ksize.width, ksize.height), CV_32FC1); double sigma2 = sigmaSpace * sigmaSpace; int idx = 0; int w = ksize.width / 2; int h = ksize.height / 2; int ABF_GAUSSIAN_ocl = 1; if(ABF_GAUSSIAN_ocl) { for(int y=-h; y<=h; y++) for(int x=-w; x<=w; x++) { lut.at<float>(idx++) = expf( (float)(-0.5 * (x * x + y * y)/sigma2)); } } else { for(int y=-h; y<=h; y++) for(int x=-w; x<=w; x++) { lut.at<float>(idx++) = (float) (sigma2 / (sigma2 + x * x + y * y)); } } oclMat dlut(lut); int depth = src.depth(); int cn = src.oclchannels(); normalizeAnchor(anchor, ksize); const static String kernelName = "adaptiveBilateralFilter"; dst.create(src.size(), src.type()); char btype[30]; switch(borderType) { case BORDER_CONSTANT: sprintf(btype, "BORDER_CONSTANT"); break; case BORDER_REPLICATE: sprintf(btype, "BORDER_REPLICATE"); break; case BORDER_REFLECT: sprintf(btype, "BORDER_REFLECT"); break; case BORDER_WRAP: sprintf(btype, "BORDER_WRAP"); break; case BORDER_REFLECT101: sprintf(btype, "BORDER_REFLECT_101"); break; default: CV_Error(CV_StsBadArg, "This border type is not supported"); break; } //the following constants may be adjusted for performance concerns const static size_t blockSizeX = 64, blockSizeY = 1, EXTRA = ksize.height - 1; //Normalize the result by default const float alpha = ksize.height * ksize.width; const size_t gSize = blockSizeX - ksize.width / 2 * 2; const size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX; const size_t rows_per_thread = 1 + EXTRA; const size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY; size_t globalThreads[3] = { globalSizeX, globalSizeY, 1}; size_t localThreads[3] = { blockSizeX, blockSizeY, 1}; char build_options[250]; //LDATATYPESIZE is sizeof local data store. This is to exemplify effect of LDS on kernel performance sprintf(build_options, "-D VAR_PER_CHANNEL=1 -D CALCVAR=1 -D FIXED_WEIGHT=0 -D EXTRA=%d -D MAX_VAR_VAL=%f -D ABF_GAUSSIAN=%d" " -D THREADS=%d -D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", static_cast<int>(EXTRA), static_cast<float>(maxSigmaColor*maxSigmaColor), static_cast<int>(ABF_GAUSSIAN_ocl), static_cast<int>(blockSizeX), anchor.x, anchor.y, ksize.width, ksize.height, btype); std::vector<pair<size_t , const void *> > args; args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); args.push_back(std::make_pair(sizeof(cl_mem), &dlut.data)); int lut_step = dlut.step1(); args.push_back(std::make_pair(sizeof(cl_int), (void *)&lut_step)); openCLExecuteKernel(Context::getContext(), &filtering_adaptive_bilateral, kernelName, globalThreads, localThreads, args, cn, depth, build_options); }