Commit 60f9ba0c authored by Ilya Lavrenov's avatar Ilya Lavrenov

added ROI support to ocl::CLAHE

parent 4a81be7d
...@@ -1314,6 +1314,8 @@ namespace cv ...@@ -1314,6 +1314,8 @@ namespace cv
args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale )); args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
String kernelName = "calcLut"; String kernelName = "calcLut";
size_t localThreads[3] = { 32, 8, 1 }; size_t localThreads[3] = { 32, 8, 1 };
...@@ -1333,7 +1335,7 @@ namespace cv ...@@ -1333,7 +1335,7 @@ namespace cv
} }
static void transform(const oclMat &src, oclMat &dst, const oclMat &lut, static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
const int tilesX, const int tilesY, const cv::Size tileSize) const int tilesX, const int tilesY, const Size & tileSize)
{ {
cl_int2 tile_size; cl_int2 tile_size;
tile_size.s[0] = tileSize.width; tile_size.s[0] = tileSize.width;
...@@ -1351,6 +1353,9 @@ namespace cv ...@@ -1351,6 +1353,9 @@ namespace cv
args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size )); args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.offset ));
size_t localThreads[3] = { 32, 8, 1 }; size_t localThreads[3] = { 32, 8, 1 };
size_t globalThreads[3] = { src.cols, src.rows, 1 }; size_t globalThreads[3] = { src.cols, src.rows, 1 };
...@@ -1419,9 +1424,10 @@ namespace cv ...@@ -1419,9 +1424,10 @@ namespace cv
} }
else else
{ {
cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar()); ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0,
tilesX_ - (src.cols % tilesX_), BORDER_REFLECT_101, Scalar::all(0));
tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_); tileSize = Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
srcForLut = srcExt_; srcForLut = srcExt_;
} }
......
...@@ -53,12 +53,8 @@ int calc_lut(__local int* smem, int val, int tid) ...@@ -53,12 +53,8 @@ int calc_lut(__local int* smem, int val, int tid)
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) if (tid == 0)
{
for (int i = 1; i < 256; ++i) for (int i = 1; i < 256; ++i)
{
smem[i] += smem[i - 1]; smem[i] += smem[i - 1];
}
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
return smem[tid]; return smem[tid];
...@@ -71,69 +67,51 @@ void reduce(volatile __local int* smem, int val, int tid) ...@@ -71,69 +67,51 @@ void reduce(volatile __local int* smem, int val, int tid)
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) if (tid < 128)
{
smem[tid] = val += smem[tid + 128]; smem[tid] = val += smem[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64) if (tid < 64)
{
smem[tid] = val += smem[tid + 64]; smem[tid] = val += smem[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32) if (tid < 32)
{
smem[tid] += smem[tid + 32]; smem[tid] += smem[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) if (tid < 16)
{
smem[tid] += smem[tid + 16]; smem[tid] += smem[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) if (tid < 8)
{
smem[tid] += smem[tid + 8]; smem[tid] += smem[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4) if (tid < 4)
{
smem[tid] += smem[tid + 4]; smem[tid] += smem[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2) if (tid < 2)
{
smem[tid] += smem[tid + 2]; smem[tid] += smem[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1) if (tid < 1)
{
smem[256] = smem[tid] + smem[tid + 1]; smem[256] = smem[tid] + smem[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
#else #else
void reduce(__local volatile int* smem, int val, int tid) void reduce(__local volatile int* smem, int val, int tid)
{ {
smem[tid] = val; smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) if (tid < 128)
{
smem[tid] = val += smem[tid + 128]; smem[tid] = val += smem[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64) if (tid < 64)
{
smem[tid] = val += smem[tid + 64]; smem[tid] = val += smem[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32) if (tid < 32)
...@@ -141,12 +119,17 @@ void reduce(__local volatile int* smem, int val, int tid) ...@@ -141,12 +119,17 @@ void reduce(__local volatile int* smem, int val, int tid)
smem[tid] += smem[tid + 32]; smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32 #if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
if (tid < 16)
{
#endif #endif
smem[tid] += smem[tid + 16]; smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16 #if WAVE_SIZE < 16
} barrier(CLK_LOCAL_MEM_FENCE); }
if (tid < 8) { barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif #endif
smem[tid] += smem[tid + 8]; smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4]; smem[tid] += smem[tid + 4];
...@@ -159,7 +142,8 @@ void reduce(__local volatile int* smem, int val, int tid) ...@@ -159,7 +142,8 @@ void reduce(__local volatile int* smem, int val, int tid)
__kernel void calcLut(__global __const uchar * src, __global uchar * lut, __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
const int srcStep, const int dstStep, const int srcStep, const int dstStep,
const int2 tileSize, const int tilesX, const int2 tileSize, const int tilesX,
const int clipLimit, const float lutScale) const int clipLimit, const float lutScale,
const int src_offset, const int dst_offset)
{ {
__local int smem[512]; __local int smem[512];
...@@ -173,25 +157,21 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, ...@@ -173,25 +157,21 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
{ {
__global const uchar* srcPtr = src + mad24( ty * tileSize.y + i, __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
srcStep, tx * tileSize.x );
for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
{ {
const int data = srcPtr[j]; const int data = srcPtr[j];
atomic_inc(&smem[data]); atomic_inc(&smem[data]);
} }
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int tHistVal = smem[tid]; int tHistVal = smem[tid];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (clipLimit > 0) if (clipLimit > 0)
{ {
// clip histogram bar // clip histogram bar
int clipped = 0; int clipped = 0;
if (tHistVal > clipLimit) if (tHistVal > clipLimit)
{ {
...@@ -200,7 +180,6 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, ...@@ -200,7 +180,6 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
} }
// find number of overall clipped samples // find number of overall clipped samples
reduce(smem, clipped, tid); reduce(smem, clipped, tid);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU #ifdef CPU
...@@ -229,7 +208,7 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, ...@@ -229,7 +208,7 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
const int lutVal = calc_lut(smem, tHistVal, tid); const int lutVal = calc_lut(smem, tHistVal, tid);
uint ires = (uint)convert_int_rte(lutScale * lutVal); uint ires = (uint)convert_int_rte(lutScale * lutVal);
lut[(ty * tilesX + tx) * dstStep + tid] = lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
convert_uchar(clamp(ires, (uint)0, (uint)255)); convert_uchar(clamp(ires, (uint)0, (uint)255));
} }
...@@ -239,7 +218,8 @@ __kernel void transform(__global __const uchar * src, ...@@ -239,7 +218,8 @@ __kernel void transform(__global __const uchar * src,
const int srcStep, const int dstStep, const int lutStep, const int srcStep, const int dstStep, const int lutStep,
const int cols, const int rows, const int cols, const int rows,
const int2 tileSize, const int2 tileSize,
const int tilesX, const int tilesY) const int tilesX, const int tilesY,
const int src_offset, const int dst_offset, int lut_offset)
{ {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
...@@ -261,15 +241,15 @@ __kernel void transform(__global __const uchar * src, ...@@ -261,15 +241,15 @@ __kernel void transform(__global __const uchar * src,
tx1 = max(tx1, 0); tx1 = max(tx1, 0);
tx2 = min(tx2, tilesX - 1); tx2 = min(tx2, tilesX - 1);
const int srcVal = src[mad24(y, srcStep, x)]; const int srcVal = src[mad24(y, srcStep, x + src_offset)];
float res = 0; float res = 0;
res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya)); res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya)); res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya)); res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya)); res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
uint ires = (uint)convert_int_rte(res); uint ires = (uint)convert_int_rte(res);
dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment