Commit 0fdb95e1 authored by Erik Karlsson

Refactoring and addition of CV_8UC3 to ocl_fastNlMeansDenoising

parent 8e7aff44
...@@ -70,11 +70,11 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow ...@@ -70,11 +70,11 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
int templateWindowSize, int searchWindowSize) int templateWindowSize, int searchWindowSize)
{ {
int type = _src.type(), cn = CV_MAT_CN(type); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT; int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
Size size = _src.size(); Size size = _src.size();
if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 ) if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC3 )
return false; return false;
int templateWindowHalfWize = templateWindowSize / 2; int templateWindowHalfWize = templateWindowSize / 2;
...@@ -86,13 +86,15 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, ...@@ -86,13 +86,15 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
char cvt[2][40]; char cvt[2][40];
String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d" String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
" -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d" " -D sample_t=%s -D pixel_t=%s -D int_t=%s"
" -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d" " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
" -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s", " -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s",
templateWindowSize, searchWindowSize, ocl::typeToStr(type), templateWindowSize, searchWindowSize,
ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize, ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
templateWindowHalfWize, searchWindowHalfSize, BLOCK_COLS, BLOCK_ROWS,
ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn, ctaSize, templateWindowHalfWize, searchWindowHalfSize,
ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), type == CV_8UC3 ? 4 : cn,
ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1])); ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts); ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
...@@ -107,10 +109,22 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, ...@@ -107,10 +109,22 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
UMat srcex; UMat srcex;
int borderSize = searchWindowHalfSize + templateWindowHalfWize; int borderSize = searchWindowHalfSize + templateWindowHalfWize;
copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT); if (type == CV_8UC3) {
Mat src_rgb = _src.getMat(), src_rgba(size, CV_8UC4);
int from_to[] = { 0,0, 1,1, 2,2 };
mixChannels(&src_rgb, 1, &src_rgba, 1, from_to, 3);
copyMakeBorder(src_rgba, srcex,
borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
}
else
copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
_dst.create(size, type); _dst.create(size, type);
UMat dst = _dst.getUMat(); UMat dst;
if (type == CV_8UC3)
dst.create(size, CV_8UC4);
else
dst = _dst.getUMat();
int searchWindowSizeSq = searchWindowSize * searchWindowSize; int searchWindowSizeSq = searchWindowSize * searchWindowSize;
Size upColSumSize(size.width, searchWindowSizeSq * nblocksy); Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
...@@ -123,7 +137,15 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, ...@@ -123,7 +137,15 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 }; size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
return k.run(2, globalsize, localsize, false); if (!k.run(2, globalsize, localsize, false)) return false;
if (type == CV_8UC3) {
Mat dst_rgba = dst.getMat(ACCESS_READ), dst_rgb = _dst.getMat();
int from_to[] = { 0,0, 1,1, 2,2 };
mixChannels(&dst_rgba, 1, &dst_rgb, 1, from_to, 3);
}
return true;
} }
static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst, static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
......
...@@ -29,8 +29,11 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost ...@@ -29,8 +29,11 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
if (almostDist < almostMaxDist) if (almostDist < almostMaxDist)
{ {
FT dist = almostDist * almostDist2ActualDistMultiplier; FT dist = almostDist * almostDist2ActualDistMultiplier;
#ifdef ABS
int weight = convert_int_sat_rte(fixedPointMult * exp(-dist*dist * den));
#else
int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den));
#endif
if (weight < WEIGHT_THRESHOLD * fixedPointMult) if (weight < WEIGHT_THRESHOLD * fixedPointMult)
weight = 0; weight = 0;
...@@ -44,21 +47,33 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost ...@@ -44,21 +47,33 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
#define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE) #define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE)
// Distance between two pixels, summed over channels: the absolute
// difference when ABS is defined, the squared difference otherwise.
// For cn == 3 or cn == 4 only the x, y and z components are summed.
inline int calcDist(uchar_t a, uchar_t b) inline int calcDist(pixel_t a, pixel_t b)
{ {
#ifdef ABS
int_t retval = convert_int_t(abs_diff(a, b));
#else
int_t diff = convert_int_t(a) - convert_int_t(b); int_t diff = convert_int_t(a) - convert_int_t(b);
int_t retval = diff * diff; int_t retval = diff * diff;
#endif
#if cn == 1 #if cn == 1
return retval; return retval;
#elif cn == 2 #elif cn == 2
return retval.x + retval.y; return retval.x + retval.y;
#elif cn == 3 || cn == 4 /* A is ignored */
return retval.x + retval.y + retval.z;
#else #else
#error "cn should be either 1 or 2" #error "cn should be either 1, 2, 3 or 4"
#endif #endif
} }
inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_value, uchar_t up_value_t) #ifdef ABS
// Variant compiled when ABS is defined: returns the calcDist of the
// (down_value, down_value_t) pair minus the calcDist of the
// (up_value, up_value_t) pair.
inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
{
return calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);
}
#else
inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
{ {
int_t A = convert_int_t(down_value) - convert_int_t(down_value_t); int_t A = convert_int_t(down_value) - convert_int_t(down_value_t);
int_t B = convert_int_t(up_value) - convert_int_t(up_value_t); int_t B = convert_int_t(up_value) - convert_int_t(up_value_t);
...@@ -68,14 +83,17 @@ inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_v ...@@ -68,14 +83,17 @@ inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_v
return retval; return retval;
#elif cn == 2 #elif cn == 2
return retval.x + retval.y; return retval.x + retval.y;
#elif cn == 3 || cn == 4 /* A is ignored */
return retval.x + retval.y + retval.z;
#else #else
#error "cn should be either 1 or 2" #error "cn should be either 1, 2, 3 or 4"
#endif #endif
} }
#endif
#define COND if (x == 0 && y == 0) #define COND if (x == 0 && y == 0)
inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset, inline void calcFirstElementInRow(__global const sample_t * src, int src_step, int src_offset,
__local int * dists, int y, int x, int id, __local int * dists, int y, int x, int id,
__global int * col_dists, __global int * up_col_dists) __global int * col_dists, __global int * up_col_dists)
{ {
...@@ -87,9 +105,9 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int ...@@ -87,9 +105,9 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
{ {
int dist = 0, value; int dist = 0, value;
__global const uchar_t * src_template = (__global const uchar_t *)(src + __global const pixel_t * src_template = (__global const pixel_t *)(src +
mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
__global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
__global int * col_dists_current = col_dists + i * TEMPLATE_SIZE; __global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;
#pragma unroll #pragma unroll
...@@ -107,8 +125,8 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int ...@@ -107,8 +125,8 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
dist += value; dist += value;
} }
src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); src_current = (__global const pixel_t *)((__global const sample_t *)src_current + src_step);
src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); src_template = (__global const pixel_t *)((__global const sample_t *)src_template + src_step);
} }
#pragma unroll #pragma unroll
...@@ -120,7 +138,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int ...@@ -120,7 +138,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
} }
} }
inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset, inline void calcElementInFirstRow(__global const sample_t * src, int src_step, int src_offset,
__local int * dists, int y, int x0, int x, int id, int first, __local int * dists, int y, int x0, int x, int id, int first,
__global int * col_dists, __global int * up_col_dists) __global int * col_dists, __global int * up_col_dists)
{ {
...@@ -130,8 +148,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int ...@@ -130,8 +148,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE) for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
{ {
__global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
__global const uchar_t * src_template = (__global const uchar_t *)(src + __global const pixel_t * src_template = (__global const pixel_t *)(src +
mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
__global int * col_dists_current = col_dists + TEMPLATE_SIZE * i; __global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;
...@@ -142,8 +160,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int ...@@ -142,8 +160,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
{ {
col_dist += calcDist(src_current[0], src_template[0]); col_dist += calcDist(src_current[0], src_template[0]);
src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); src_current = (__global const pixel_t *)((__global const sample_t *)src_current + src_step);
src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); src_template = (__global const pixel_t *)((__global const sample_t *)src_template + src_step);
} }
dists[i] += col_dist - col_dists_current[first]; dists[i] += col_dist - col_dists_current[first];
...@@ -152,7 +170,7 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int ...@@ -152,7 +170,7 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
} }
} }
inline void calcElement(__global const uchar * src, int src_step, int src_offset, inline void calcElement(__global const sample_t * src, int src_step, int src_offset,
__local int * dists, int y, int x0, int x, int id, int first, __local int * dists, int y, int x0, int x, int id, int first,
__global int * col_dists, __global int * up_col_dists) __global int * col_dists, __global int * up_col_dists)
{ {
...@@ -160,8 +178,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset ...@@ -160,8 +178,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
int sy_up = y - TEMPLATE_SIZE2 - 1; int sy_up = y - TEMPLATE_SIZE2 - 1;
int sy_down = y + TEMPLATE_SIZE2; int sy_down = y + TEMPLATE_SIZE2;
uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset))); pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset))); pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
sx -= SEARCH_SIZE2; sx -= SEARCH_SIZE2;
sy_up -= SEARCH_SIZE2; sy_up -= SEARCH_SIZE2;
...@@ -171,8 +189,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset ...@@ -171,8 +189,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
{ {
int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE; int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset))); pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset))); pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
__global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first); __global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
__global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i); __global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
...@@ -185,9 +203,9 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset ...@@ -185,9 +203,9 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
} }
} }
inline void convolveWindow(__global const uchar * src, int src_step, int src_offset, inline void convolveWindow(__global const sample_t * src, int src_step, int src_offset,
__local int * dists, __global const int * almostDist2Weight, __local int * dists, __global const int * almostDist2Weight,
__global uchar * dst, int dst_step, int dst_offset, __global sample_t * dst, int dst_step, int dst_offset,
int y, int x, int id, __local int * weights_local, int y, int x, int id, __local int * weights_local,
__local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
{ {
...@@ -197,7 +215,7 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off ...@@ -197,7 +215,7 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE) for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
{ {
int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset)); int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index)); int_t src_value = convert_int_t(*(__global const pixel_t *)(src + src_index));
int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
int weight = almostDist2Weight[almostAvgDist]; int weight = almostDist2Weight[almostAvgDist];
...@@ -228,13 +246,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off ...@@ -228,13 +246,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
weighted_sum_local[2] + weighted_sum_local[3]; weighted_sum_local[2] + weighted_sum_local[3];
int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]; int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
*(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local_0 / (int_t)(weights_local_0)); *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (int_t)(weights_local_0));
} }
} }
__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset, __kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step, int src_offset,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global sample_t * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global const int * almostDist2Weight, __global uchar * buffer, __global const int * almostDist2Weight, __global sample_t * buffer,
int almostTemplateWindowSizeSqBinShift) int almostTemplateWindowSizeSqBinShift)
{ {
int block_x = get_group_id(0), nblocks_x = get_num_groups(0); int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment