Commit 324fa268 authored by Erik Karlsson's avatar Erik Karlsson

Refactoring of OpenCL implementation

parent 41ffcc27
...@@ -51,7 +51,8 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h, ...@@ -51,7 +51,8 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
Size src_size = _src.size(); Size src_size = _src.size();
CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) && CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, false)) ocl_fastNlMeansDenoising(_src, _dst, &h, 1,
templateWindowSize, searchWindowSize, false))
Mat src = _src.getMat(); Mat src = _src.getMat();
_dst.create(src_size, src.type()); _dst.create(src_size, src.type());
...@@ -95,7 +96,8 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h, ...@@ -95,7 +96,8 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
Size src_size = _src.size(); Size src_size = _src.size();
CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) && CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, true)) ocl_fastNlMeansDenoising(_src, _dst, &h, 1,
templateWindowSize, searchWindowSize, true))
Mat src = _src.getMat(); Mat src = _src.getMat();
_dst.create(src_size, src.type()); _dst.create(src_size, src.type());
......
...@@ -29,7 +29,7 @@ static int divUp(int a, int b) ...@@ -29,7 +29,7 @@ static int divUp(int a, int b)
} }
template <typename FT, typename ST, typename WT> template <typename FT, typename ST, typename WT>
static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn, static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT *h, int hn, int cn,
int & almostTemplateWindowSizeSqBinShift, bool abs) int & almostTemplateWindowSizeSqBinShift, bool abs)
{ {
const WT maxEstimateSumValue = searchWindowSize * searchWindowSize * const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
...@@ -53,24 +53,32 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow ...@@ -53,24 +53,32 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
int maxDist = abs ? std::numeric_limits<ST>::max() * cn : int maxDist = abs ? std::numeric_limits<ST>::max() * cn :
std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn; std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1); int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
FT den = 1.0f / (h * h * cn); FT den[4];
CV_Assert(hn > 0 && hn <= 4);
for (int i=0; i<hn; i++)
den[i] = 1.0f / (h[i] * h[i] * cn);
almostDist2Weight.create(1, almostMaxDist, CV_32SC1); almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
char buf[40];
ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc, ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
format("-D OP_CALC_WEIGHTS -D FT=%s%s%s", ocl::typeToStr(depth), format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
" -D wlut_t=%s -D convert_wlut_t=%s%s%s",
ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
doubleSupport ? " -D DOUBLE_SUPPORT" : "", abs ? " -D ABS" : "")); doubleSupport ? " -D DOUBLE_SUPPORT" : "", abs ? " -D ABS" : ""));
if (k.empty()) if (k.empty())
return false; return false;
k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist, k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD); almostDist2ActualDistMultiplier, fixedPointMult,
ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
size_t globalsize[1] = { almostMaxDist }; size_t globalsize[1] = { almostMaxDist };
return k.run(1, globalsize, NULL, false); return k.run(1, globalsize, NULL, false);
} }
static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float *h, int hn,
int templateWindowSize, int searchWindowSize, bool abs) int templateWindowSize, int searchWindowSize, bool abs)
{ {
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
...@@ -89,18 +97,22 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, ...@@ -89,18 +97,22 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
char buf[4][40]; char buf[4][40];
String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d" String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
" -D pixel_t=%s -D int_t=%s" " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
" -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s" " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
" -D BLOCK_COLS=%d -D BLOCK_ROWS=%d" " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d" " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
" -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s", " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
templateWindowSize, searchWindowSize, templateWindowSize, searchWindowSize,
ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
depth == CV_8U ? ocl::typeToStr(CV_32S) : "long", ocl::typeToStr(CV_32SC(hn)),
depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) : depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
(sprintf(buf[0], "long%d", cn), buf[0]), format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) : depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
(sprintf(buf[1], "convert_long%d", cn), buf[1]), format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
BLOCK_COLS, BLOCK_ROWS, BLOCK_COLS, BLOCK_ROWS,
ctaSize, templateWindowHalfWize, searchWindowHalfSize, ctaSize, templateWindowHalfWize, searchWindowHalfSize,
ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn, ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
...@@ -115,13 +127,13 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, ...@@ -115,13 +127,13 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
if ((depth == CV_8U && if ((depth == CV_8U &&
!ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight, !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
searchWindowSize, templateWindowSize, searchWindowSize, templateWindowSize,
h, cn, h, hn, cn,
almostTemplateWindowSizeSqBinShift, almostTemplateWindowSizeSqBinShift,
abs)) || abs)) ||
(depth == CV_16U && (depth == CV_16U &&
!ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight, !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
searchWindowSize, templateWindowSize, searchWindowSize, templateWindowSize,
h, cn, h, hn, cn,
almostTemplateWindowSizeSqBinShift, almostTemplateWindowSizeSqBinShift,
abs))) abs)))
return false; return false;
......
...@@ -20,9 +20,9 @@ ...@@ -20,9 +20,9 @@
#ifdef OP_CALC_WEIGHTS #ifdef OP_CALC_WEIGHTS
__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist, __kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist,
FT almostDist2ActualDistMultiplier, int fixedPointMult, FT almostDist2ActualDistMultiplier, int fixedPointMult,
FT den, FT WEIGHT_THRESHOLD) w_t den, FT WEIGHT_THRESHOLD)
{ {
int almostDist = get_global_id(0); int almostDist = get_global_id(0);
...@@ -30,14 +30,13 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost ...@@ -30,14 +30,13 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
{ {
FT dist = almostDist * almostDist2ActualDistMultiplier; FT dist = almostDist * almostDist2ActualDistMultiplier;
#ifdef ABS #ifdef ABS
int weight = convert_int_sat_rte(fixedPointMult * exp(-dist*dist * den)); w_t w = exp((w_t)(-dist*dist) * den);
#else #else
int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); w_t w = exp((w_t)(-dist) * den);
#endif #endif
if (weight < WEIGHT_THRESHOLD * fixedPointMult) wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));
weight = 0; almostDist2Weight[almostDist] =
weight < WEIGHT_THRESHOLD * fixedPointMult ? (wlut_t)0 : weight;
almostDist2Weight[almostDist] = weight;
} }
} }
...@@ -208,14 +207,14 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset ...@@ -208,14 +207,14 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
} }
inline void convolveWindow(__global const uchar * src, int src_step, int src_offset, inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
__local int * dists, __global const int * almostDist2Weight, __local int * dists, __global const wlut_t * almostDist2Weight,
__global uchar * dst, int dst_step, int dst_offset, __global uchar * dst, int dst_step, int dst_offset,
int y, int x, int id, __local weight_t * weights_local, int y, int x, int id, __local weight_t * weights_local,
__local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
{ {
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
weight_t weights = 0; weight_t weights = (weight_t)0;
sum_t weighted_sum = (sum_t)(0); sum_t weighted_sum = (sum_t)0;
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE) for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
{ {
...@@ -223,10 +222,10 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off ...@@ -223,10 +222,10 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index)); sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
int weight = almostDist2Weight[almostAvgDist]; weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);
weights += (weight_t)weight; weights += weight;
weighted_sum += (sum_t)(weight) * src_value; weighted_sum += (sum_t)weight * src_value;
} }
weights_local[id] = weights; weights_local[id] = weights;
...@@ -251,13 +250,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off ...@@ -251,13 +250,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
weighted_sum_local[2] + weighted_sum_local[3]; weighted_sum_local[2] + weighted_sum_local[3];
weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]; weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
*(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)(weights_local_0)); *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);
} }
} }
__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset, __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global const int * almostDist2Weight, __global uchar * buffer, __global const wlut_t * almostDist2Weight, __global uchar * buffer,
int almostTemplateWindowSizeSqBinShift) int almostTemplateWindowSizeSqBinShift)
{ {
int block_x = get_group_id(0), nblocks_x = get_num_groups(0); int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment