Commit 9db28f33 authored by Ilya Lavrenov

more fixes

parent 891dbeab
...@@ -137,6 +137,8 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker( ...@@ -137,6 +137,8 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
double dist = almost_dist * almost_dist2actual_dist_multiplier; double dist = almost_dist * almost_dist2actual_dist_multiplier;
int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T)))); int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
printf("%d ", weight);
if (weight < WEIGHT_THRESHOLD * fixed_point_mult_) if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
weight = 0; weight = 0;
......
...@@ -111,15 +111,17 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, ...@@ -111,15 +111,17 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
_dst.create(size, type); _dst.create(size, type);
UMat dst = _dst.getUMat(); UMat dst = _dst.getUMat();
Size upColSumSize(size.width, searchWindowSize * searchWindowSize * nblocksy); int searchWindowSizeSq = searchWindowSize * searchWindowSize;
Size colSumSize(nblocksx * templateWindowSize, searchWindowSize * searchWindowSize * nblocksy); Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
UMat buffer(upColSumSize + colSumSize, CV_32SC(cn)); UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));
srcex = srcex(Rect(Point(borderSize, borderSize), size));
k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst), k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
ocl::KernelArg::PtrReadOnly(almostDist2Weight), nblocksy, nblocksx, ocl::KernelArg::PtrReadOnly(almostDist2Weight),
ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
size_t globalsize[2] = { nblocksx, nblocksy }, localsize[2] = { CTA_SIZE, 1 }; size_t globalsize[2] = { nblocksx * BLOCK_COLS, nblocksy * BLOCK_ROWS }, localsize[2] = { CTA_SIZE, 1 };
return k.run(2, globalsize, localsize, false); return k.run(2, globalsize, localsize, false);
} }
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
#pragma OPENCL_EXTENSION cl_amd_printf:enable
#ifdef OP_CALC_WEIGHTS #ifdef OP_CALC_WEIGHTS
__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist, __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist,
...@@ -18,6 +20,8 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost ...@@ -18,6 +20,8 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
float dist = almostDist * almostDist2ActualDistMultiplier; float dist = almostDist * almostDist2ActualDistMultiplier;
int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den));
// printf("%d ", weight);
if (weight < WEIGHT_THRESHOLD * fixedPointMult) if (weight < WEIGHT_THRESHOLD * fixedPointMult)
weight = 0; weight = 0;
...@@ -31,7 +35,7 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost ...@@ -31,7 +35,7 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
inline int_t calcDist(uchar_t a, uchar_t b) inline int_t calcDist(uchar_t a, uchar_t b)
{ {
int_t diff = convert_int_t(a) -convert_int_t(b); int_t diff = convert_int_t(a) - convert_int_t(b);
return diff * diff; return diff * diff;
} }
...@@ -39,16 +43,14 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int ...@@ -39,16 +43,14 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
__local int_t * dists, int y, int x, int id, __local int_t * dists, int y, int x, int id,
__global int_t * col_dists, __global int_t * up_col_dists) __global int_t * col_dists, __global int_t * up_col_dists)
{ {
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2 - TEMPLATE_SIZE2;
for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
{ {
int_t dist = (int_t)(0), value; int_t dist = (int_t)(0), value;
sx += i % SEARCH_SIZE; __global const uchar_t * src_template = (__global const uchar_t *)(src +
sy += i / SEARCH_SIZE; mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
__global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset)));
__global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
__global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; __global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE;
...@@ -57,7 +59,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int ...@@ -57,7 +59,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
col_dists_current[j] = (int_t)(0); col_dists_current[j] = (int_t)(0);
#pragma unroll #pragma unroll
for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty) for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)
{ {
#pragma unroll #pragma unroll
for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx) for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx)
...@@ -68,78 +70,86 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int ...@@ -68,78 +70,86 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
dist += value; dist += value;
} }
src_current += src_step; src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
src_template += src_step; src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
} }
dists[i] = dist; dists[i] = dist;
up_col_dists[i] = col_dists[TEMPLATE_SIZE - 1]; up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1];
} }
} }
#define COND if (i == 252 && x0 == 20)
inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset, inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset,
__local int_t * dists, int y, int x, int id, int first, __local int_t * dists, int y, int x0, int x, int id, int first,
__global int_t * col_dists, __global int_t * up_col_dists) __global int_t * col_dists, __global int_t * up_col_dists)
{ {
x += TEMPLATE_SIZE2; x += TEMPLATE_SIZE2;
y -= TEMPLATE_SIZE2;
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
{ {
sx += i % SEARCH_SIZE;
sy += i / SEARCH_SIZE;
__global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
__global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset))); __global const uchar_t * src_template = (__global const uchar_t *)(src +
mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
__global int_t * col_dists_current = col_dists + TEMPLATE_SIZE * i; __global int_t * col_dists_current = col_dists + TEMPLATE_SIZE * i;
int_t value; int_t col_dist = (int_t)(0);
dists[id] -= col_dists_current[first];
col_dists_current[first] = (int_t)(0);
#pragma unroll #pragma unroll
for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty) for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)
{ {
value = calcDist(src_current[0], src_template[0]); col_dist += calcDist(src_current[0], src_template[0]);
col_dists_current[first] += value; // COND printf("%d\n", calcDist(src_current[0], src_template[0]));
src_current += src_step; src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
src_template += src_step; src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
} }
dists[id] += col_dists_current[first]; dists[i] += col_dist - col_dists_current[first];
up_col_dists[id] = col_dists_current[first]; col_dists_current[first] = col_dist;
up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist;
// COND printf("res = %d\n", col_dist);
} }
} }
inline void calcElement(__global const uchar * src, int src_step, int src_offset, inline void calcElement(__global const uchar * src, int src_step, int src_offset,
__local int_t * dists, int y, int x, int id, int first, __local int_t * dists, int y, int x0, int x, int id, int first,
__global int_t * col_dists, __global int_t * up_col_dists) __global int_t * col_dists, __global int_t * up_col_dists)
{ {
int sx_up = x + TEMPLATE_SIZE2, sy_up = y - TEMPLATE_SIZE2 - 1; int sx = x + TEMPLATE_SIZE2;
int sx_down = x + TEMPLATE_SIZE2, sy_down = y + TEMPLATE_SIZE2; int sy_up = y - TEMPLATE_SIZE2 - 1 /*- TEMPLATE_SIZE*/;
int sy_down = y + TEMPLATE_SIZE2;
uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset))); uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset))); uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) sx -= SEARCH_SIZE2;
sy_up -= SEARCH_SIZE2;
sy_down -= SEARCH_SIZE2;
for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
{ {
int wx = i % SEARCH_SIZE; int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
int wy = i / SEARCH_SIZE;
sx_up += wx, sx_down += wx; uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
sy_up += wy, sy_down += wy; uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset))); __global int_t * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset))); __global int_t * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
__global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; // COND printf("\nres = %d\n", up_col_dists_current[0]);
__global int_t * up_col_dists_current = up_col_dists + i; // COND printf("up = %d, down = %d\n", calcDist(up_value, up_value_t), calcDist(down_value, down_value_t));
int_t col_dist = up_col_dists_current[0] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);
dists[i] += col_dist - col_dists_current[0];
col_dists_current[0] = col_dist;
up_col_dists_current[0] = col_dist;
dists[i] -= col_dists_current[first]; // COND printf("res = %d\n", up_col_dists_current[0]);
col_dists_current[first] = up_col_dists_current[id] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); // if (up_col_dists_current[0] < 0) printf("%d %d -- %d\n", i, x0, up_col_dists_current[0]);
dists[i] += col_dists_current[first];
up_col_dists_current[id] = col_dists_current[first];
} }
} }
...@@ -147,27 +157,28 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off ...@@ -147,27 +157,28 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
__local int * dists, __global const int * almostDist2Weight, __local int * dists, __global const int * almostDist2Weight,
__global uchar * dst, int dst_step, int dst_offset, __global uchar * dst, int dst_step, int dst_offset,
int y, int x, int id, __local int * weights_local, int y, int x, int id, __local int * weights_local,
__local int * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
{ {
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0; int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0;
int_t weighted_sum = (int_t)(0); int_t weighted_sum = (int_t)(0);
for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += id) for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
{ {
int src_index = mad24(sy + i / SEARCH_SIZE, src_step, (i % SEARCH_SIZE + sx) * cn + src_offset); int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
__global const uchar_t * src_search = (__global const uchar_t *)(src + src_index); int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index));
int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
int weight = almostDist2Weight[almostAvgDist]; int weight = almostDist2Weight[almostAvgDist];
weights += weight; weights += weight;
weighted_sum += (int_t)(weight) * convert_int_t(src_search[0]); weighted_sum += (int_t)(weight) * src_value;
} }
if (id >= CTA_SIZE2) if (id >= CTA_SIZE2)
{ {
weights_local[id - CTA_SIZE2] = weights; int id2 = id - CTA_SIZE2;
weighted_sum_local[id - CTA_SIZE2] = weighted_sum; weights_local[id2] = weights;
weighted_sum_local[id2] = weighted_sum;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -191,9 +202,9 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off ...@@ -191,9 +202,9 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
if (id == 0) if (id == 0)
{ {
int dst_index = mad24(y, dst_step, dst_offset + x * cn); int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset));
int_t weights_local_0 = (int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]); int_t weights_local_0 = (int_t)(1);//(int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]);
int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + weighted_sum_local[2] + weighted_sum_local[3]; int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + weighted_sum_local[2] + weighted_sum_local[3];
*(__global uchar_t *)(dst + dst_index) = convert_uchar_t((weighted_sum_local_0 + weights_local_0 >> 1) / weights_local_0); *(__global uchar_t *)(dst + dst_index) = convert_uchar_t((weighted_sum_local_0 + weights_local_0 >> 1) / weights_local_0);
...@@ -202,48 +213,50 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off ...@@ -202,48 +213,50 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset, __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global const int * almostDist2Weight, int nblocksy, int nblocksx, __global const int * almostDist2Weight, __global uchar * buffer,
__global uchar * buffer, int almostTemplateWindowSizeSqBinShift) int almostTemplateWindowSizeSqBinShift)
{ {
int block_x = get_global_id(0); int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
int block_y = get_global_id(1); int block_y = get_group_id(1);
int id = get_local_id(0), first; int id = get_local_id(0), first;
__local int_t dists[SEARCH_SIZE_SQ], weighted_sum[CTA_SIZE2]; __local int_t dists[SEARCH_SIZE_SQ], weighted_sum[CTA_SIZE2];
__local int weights[CTA_SIZE2]; __local int weights[CTA_SIZE2];
int block_data_start = mad24(block_y, nblocksx, block_x) * SEARCH_SIZE_SQ * (TEMPLATE_SIZE + BLOCK_COLS); int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
__global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t)); int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
__global int_t * up_col_dists = (__global int_t *)(buffer + sizeof(int_t) * (block_data_start + SEARCH_SIZE_SQ * TEMPLATE_SIZE));
if (block_x < nblocksx && block_y < nblocksy) // for each group we need SEARCH_SIZE_SQ * TEMPLATE_SIZE integer buffer for storing part column sum for current element
{ // and SEARCH_SIZE_SQ * BLOCK_COLS integer buffer for storing last column sum for each element of search window of up row
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE);
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); __global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t));
__global int_t * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;
for (int y = y0; y < y1; ++y) for (int y = y0; y < y1; ++y)
for (int x = x0; x < x1; ++x) for (int x = x0; x < x1; ++x)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (x == x0)
{ {
if (x == x0) calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists);
{ first = 0;
calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); }
first = 0; else
} {
if (y == y0)
calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);
else else
{ {
if (y == y0) calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);
calcElementInFirstRow(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists); first = (first + 1) % TEMPLATE_SIZE;
else
{
calcElement(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists);
first = (first + 1) % TEMPLATE_SIZE;
}
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);
} }
} }
}
barrier(CLK_LOCAL_MEM_FENCE);
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);
}
} }
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment