Commit 9db28f33 authored by Ilya Lavrenov's avatar Ilya Lavrenov

more fixes

parent 891dbeab
......@@ -137,6 +137,8 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
double dist = almost_dist * almost_dist2actual_dist_multiplier;
int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
printf("%d ", weight);
if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
weight = 0;
......
......@@ -111,15 +111,17 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
_dst.create(size, type);
UMat dst = _dst.getUMat();
Size upColSumSize(size.width, searchWindowSize * searchWindowSize * nblocksy);
Size colSumSize(nblocksx * templateWindowSize, searchWindowSize * searchWindowSize * nblocksy);
int searchWindowSizeSq = searchWindowSize * searchWindowSize;
Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));
srcex = srcex(Rect(Point(borderSize, borderSize), size));
k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
ocl::KernelArg::PtrReadOnly(almostDist2Weight), nblocksy, nblocksx,
ocl::KernelArg::PtrReadOnly(almostDist2Weight),
ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
size_t globalsize[2] = { nblocksx, nblocksy }, localsize[2] = { CTA_SIZE, 1 };
size_t globalsize[2] = { nblocksx * BLOCK_COLS, nblocksy * BLOCK_ROWS }, localsize[2] = { CTA_SIZE, 1 };
return k.run(2, globalsize, localsize, false);
}
......
......@@ -5,6 +5,8 @@
// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#pragma OPENCL_EXTENSION cl_amd_printf:enable
#ifdef OP_CALC_WEIGHTS
__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist,
......@@ -18,6 +20,8 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
float dist = almostDist * almostDist2ActualDistMultiplier;
int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den));
// printf("%d ", weight);
if (weight < WEIGHT_THRESHOLD * fixedPointMult)
weight = 0;
......@@ -31,7 +35,7 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
inline int_t calcDist(uchar_t a, uchar_t b)
{
int_t diff = convert_int_t(a) -convert_int_t(b);
int_t diff = convert_int_t(a) - convert_int_t(b);
return diff * diff;
}
......@@ -39,16 +43,14 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
__local int_t * dists, int y, int x, int id,
__global int_t * col_dists, __global int_t * up_col_dists)
{
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2 - TEMPLATE_SIZE2;
for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
{
int_t dist = (int_t)(0), value;
sx += i % SEARCH_SIZE;
sy += i / SEARCH_SIZE;
__global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset)));
__global const uchar_t * src_template = (__global const uchar_t *)(src +
mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
__global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
__global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE;
......@@ -57,7 +59,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
col_dists_current[j] = (int_t)(0);
#pragma unroll
for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty)
for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)
{
#pragma unroll
for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx)
......@@ -68,78 +70,86 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
dist += value;
}
src_current += src_step;
src_template += src_step;
src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
}
dists[i] = dist;
up_col_dists[i] = col_dists[TEMPLATE_SIZE - 1];
up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1];
}
}
#define COND if (i == 252 && x0 == 20)
inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset,
__local int_t * dists, int y, int x, int id, int first,
__local int_t * dists, int y, int x0, int x, int id, int first,
__global int_t * col_dists, __global int_t * up_col_dists)
{
x += TEMPLATE_SIZE2;
y -= TEMPLATE_SIZE2;
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
{
sx += i % SEARCH_SIZE;
sy += i / SEARCH_SIZE;
__global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
__global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset)));
__global const uchar_t * src_template = (__global const uchar_t *)(src +
mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
__global int_t * col_dists_current = col_dists + TEMPLATE_SIZE * i;
int_t value;
dists[id] -= col_dists_current[first];
col_dists_current[first] = (int_t)(0);
int_t col_dist = (int_t)(0);
#pragma unroll
for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty)
for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)
{
value = calcDist(src_current[0], src_template[0]);
col_dists_current[first] += value;
col_dist += calcDist(src_current[0], src_template[0]);
// COND printf("%d\n", calcDist(src_current[0], src_template[0]));
src_current += src_step;
src_template += src_step;
src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
}
dists[id] += col_dists_current[first];
up_col_dists[id] = col_dists_current[first];
dists[i] += col_dist - col_dists_current[first];
col_dists_current[first] = col_dist;
up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist;
// COND printf("res = %d\n", col_dist);
}
}
inline void calcElement(__global const uchar * src, int src_step, int src_offset,
__local int_t * dists, int y, int x, int id, int first,
__local int_t * dists, int y, int x0, int x, int id, int first,
__global int_t * col_dists, __global int_t * up_col_dists)
{
int sx_up = x + TEMPLATE_SIZE2, sy_up = y - TEMPLATE_SIZE2 - 1;
int sx_down = x + TEMPLATE_SIZE2, sy_down = y + TEMPLATE_SIZE2;
int sx = x + TEMPLATE_SIZE2;
int sy_up = y - TEMPLATE_SIZE2 - 1 /*- TEMPLATE_SIZE*/;
int sy_down = y + TEMPLATE_SIZE2;
uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset)));
uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset)));
uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
sx -= SEARCH_SIZE2;
sy_up -= SEARCH_SIZE2;
sy_down -= SEARCH_SIZE2;
for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
{
int wx = i % SEARCH_SIZE;
int wy = i / SEARCH_SIZE;
int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
sx_up += wx, sx_down += wx;
sy_up += wy, sy_down += wy;
uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset)));
uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset)));
__global int_t * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
__global int_t * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
__global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE;
__global int_t * up_col_dists_current = up_col_dists + i;
// COND printf("\nres = %d\n", up_col_dists_current[0]);
// COND printf("up = %d, down = %d\n", calcDist(up_value, up_value_t), calcDist(down_value, down_value_t));
int_t col_dist = up_col_dists_current[0] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);
dists[i] += col_dist - col_dists_current[0];
col_dists_current[0] = col_dist;
up_col_dists_current[0] = col_dist;
dists[i] -= col_dists_current[first];
col_dists_current[first] = up_col_dists_current[id] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);
dists[i] += col_dists_current[first];
up_col_dists_current[id] = col_dists_current[first];
// COND printf("res = %d\n", up_col_dists_current[0]);
// if (up_col_dists_current[0] < 0) printf("%d %d -- %d\n", i, x0, up_col_dists_current[0]);
}
}
......@@ -147,27 +157,28 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
__local int * dists, __global const int * almostDist2Weight,
__global uchar * dst, int dst_step, int dst_offset,
int y, int x, int id, __local int * weights_local,
__local int * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
__local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
{
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0;
int_t weighted_sum = (int_t)(0);
for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += id)
for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE)
{
int src_index = mad24(sy + i / SEARCH_SIZE, src_step, (i % SEARCH_SIZE + sx) * cn + src_offset);
__global const uchar_t * src_search = (__global const uchar_t *)(src + src_index);
int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index));
int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
int weight = almostDist2Weight[almostAvgDist];
weights += weight;
weighted_sum += (int_t)(weight) * convert_int_t(src_search[0]);
weighted_sum += (int_t)(weight) * src_value;
}
if (id >= CTA_SIZE2)
{
weights_local[id - CTA_SIZE2] = weights;
weighted_sum_local[id - CTA_SIZE2] = weighted_sum;
int id2 = id - CTA_SIZE2;
weights_local[id2] = weights;
weighted_sum_local[id2] = weighted_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -191,9 +202,9 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
if (id == 0)
{
int dst_index = mad24(y, dst_step, dst_offset + x * cn);
int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset));
int_t weights_local_0 = (int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]);
int_t weights_local_0 = (int_t)(1);//(int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]);
int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + weighted_sum_local[2] + weighted_sum_local[3];
*(__global uchar_t *)(dst + dst_index) = convert_uchar_t((weighted_sum_local_0 + weights_local_0 >> 1) / weights_local_0);
......@@ -202,48 +213,50 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global const int * almostDist2Weight, int nblocksy, int nblocksx,
__global uchar * buffer, int almostTemplateWindowSizeSqBinShift)
__global const int * almostDist2Weight, __global uchar * buffer,
int almostTemplateWindowSizeSqBinShift)
{
int block_x = get_global_id(0);
int block_y = get_global_id(1);
int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
int block_y = get_group_id(1);
int id = get_local_id(0), first;
__local int_t dists[SEARCH_SIZE_SQ], weighted_sum[CTA_SIZE2];
__local int weights[CTA_SIZE2];
int block_data_start = mad24(block_y, nblocksx, block_x) * SEARCH_SIZE_SQ * (TEMPLATE_SIZE + BLOCK_COLS);
__global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t));
__global int_t * up_col_dists = (__global int_t *)(buffer + sizeof(int_t) * (block_data_start + SEARCH_SIZE_SQ * TEMPLATE_SIZE));
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
if (block_x < nblocksx && block_y < nblocksy)
{
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
// for each group we need SEARCH_SIZE_SQ * TEMPLATE_SIZE integer buffer for storing part column sum for current element
// and SEARCH_SIZE_SQ * BLOCK_COLS integer buffer for storing last column sum for each element of search window of up row
int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE);
__global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t));
__global int_t * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;
for (int y = y0; y < y1; ++y)
for (int x = x0; x < x1; ++x)
for (int y = y0; y < y1; ++y)
for (int x = x0; x < x1; ++x)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (x == x0)
{
if (x == x0)
{
calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists);
first = 0;
}
calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists);
first = 0;
}
else
{
if (y == y0)
calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);
else
{
if (y == y0)
calcElementInFirstRow(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists);
else
{
calcElement(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists);
first = (first + 1) % TEMPLATE_SIZE;
}
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);
calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);
first = (first + 1) % TEMPLATE_SIZE;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);
}
}
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment