Commit bd1a1cc0 authored by Andrey Pavlenko, committed by OpenCV Buildbot

Merge pull request #1633 from ilya-lavrenov:ocl_imgproc

parents 599d51ec 4f68f35a
......@@ -52,25 +52,24 @@ using namespace cv::ocl;
void cv::ocl::columnSum(const oclMat &src, oclMat &dst)
{
CV_Assert(src.type() == CV_32FC1);
dst.create(src.size(), src.type());
Context *clCxt = src.clCxt;
const std::string kernelName = "columnSum";
int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
std::vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset));
size_t globalThreads[3] = {dst.cols, 1, 1};
size_t localThreads[3] = {256, 1, 1};
openCLExecuteKernel(clCxt, &imgproc_columnsum, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
openCLExecuteKernel(src.clCxt, &imgproc_columnsum, "columnSum", globalThreads, localThreads, args, src.oclchannels(), src.depth());
}
This diff is collapsed.
......@@ -53,12 +53,8 @@ int calc_lut(__local int* smem, int val, int tid)
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0)
{
for (int i = 1; i < 256; ++i)
{
smem[i] += smem[i - 1];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
return smem[tid];
......@@ -71,69 +67,51 @@ void reduce(volatile __local int* smem, int val, int tid)
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
{
smem[tid] = val += smem[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
{
smem[tid] = val += smem[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
{
smem[tid] += smem[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem[tid] += smem[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem[tid] += smem[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem[tid] += smem[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem[tid] += smem[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem[256] = smem[tid] + smem[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#else
void reduce(__local volatile int* smem, int val, int tid)
{
smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
{
smem[tid] = val += smem[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
{
smem[tid] = val += smem[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
......@@ -141,12 +119,17 @@ void reduce(__local volatile int* smem, int val, int tid)
smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
if (tid < 16)
{
#endif
smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) {
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
......@@ -159,7 +142,8 @@ void reduce(__local volatile int* smem, int val, int tid)
__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
const int srcStep, const int dstStep,
const int2 tileSize, const int tilesX,
const int clipLimit, const float lutScale)
const int clipLimit, const float lutScale,
const int src_offset, const int dst_offset)
{
__local int smem[512];
......@@ -173,25 +157,21 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
{
__global const uchar* srcPtr = src + mad24( ty * tileSize.y + i,
srcStep, tx * tileSize.x );
__global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
{
const int data = srcPtr[j];
atomic_inc(&smem[data]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int tHistVal = smem[tid];
barrier(CLK_LOCAL_MEM_FENCE);
if (clipLimit > 0)
{
// clip histogram bar
int clipped = 0;
if (tHistVal > clipLimit)
{
......@@ -200,7 +180,6 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
}
// find number of overall clipped samples
reduce(smem, clipped, tid);
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
......@@ -229,7 +208,7 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
const int lutVal = calc_lut(smem, tHistVal, tid);
uint ires = (uint)convert_int_rte(lutScale * lutVal);
lut[(ty * tilesX + tx) * dstStep + tid] =
lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
convert_uchar(clamp(ires, (uint)0, (uint)255));
}
......@@ -239,7 +218,8 @@ __kernel void transform(__global __const uchar * src,
const int srcStep, const int dstStep, const int lutStep,
const int cols, const int rows,
const int2 tileSize,
const int tilesX, const int tilesY)
const int tilesX, const int tilesY,
const int src_offset, const int dst_offset, int lut_offset)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
......@@ -261,15 +241,15 @@ __kernel void transform(__global __const uchar * src,
tx1 = max(tx1, 0);
tx2 = min(tx2, tilesX - 1);
const int srcVal = src[mad24(y, srcStep, x)];
const int srcVal = src[mad24(y, srcStep, x + src_offset)];
float res = 0;
res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya));
res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya));
res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya));
res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya));
res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
uint ires = (uint)convert_int_rte(res);
dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
}
......@@ -43,38 +43,28 @@
//
//M*/
// Request the cl_amd_printf extension.
#pragma OPENCL EXTENSION cl_amd_printf : enable
// Select the vendor-specific double-precision (fp64) extension:
// cl_amd_fp64 when __ATI__ is defined, cl_khr_fp64 when __NVIDIA__ is defined.
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
////////////////////////////////////////////////////////////////////
///////////////////////// columnSum ////////////////////////////////
////////////////////////////////////////////////////////////////////
/// CV_32FC1
// Writes the running sum of each column of src into dst: after row y has been
// processed, dst holds src[row 0] + ... + src[row y] for that column.
//
// One work-item handles one column (x = get_global_id(0)); work-items with
// x >= cols do nothing.
//
//   cols, rows            - number of columns / rows to process
//   src_step, dst_step    - distance between consecutive rows, added directly
//                           to the float index (i.e. in elements, not bytes)
//   src_offset, dst_offset - index of the first element in src / dst
__kernel void columnSum_C1_D5(__global float * src, __global float * dst,
                              int cols, int rows, int src_step, int dst_step, int src_offset, int dst_offset)
{
    const int x = get_global_id(0);

    if (x < cols)
    {
        // Start at the first row of this column, honouring the buffer offsets.
        int srcIdx = x + src_offset;
        int dstIdx = x + dst_offset;

        float sum = 0;
        for (int y = 0; y < rows; ++y)
        {
            sum += src[srcIdx];
            dst[dstIdx] = sum;

            // Advance both indices to the same column in the next row.
            srcIdx += src_step;
            dstIdx += dst_step;
        }
    }
}
......@@ -48,9 +48,12 @@
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/************************************** convolve **************************************/
__kernel void convolve_D5 (__global float *src, __global float *temp1, __global float *dst,
int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight)
__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
int src_offset, int dst_offset, int koffset)
{
__local float smem[16 + 2 * 8][16 + 2 * 8];
......@@ -65,7 +68,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
// 0 | 0 0 | 0
// -----------
// 0 | 0 0 | 0
smem[y][x] = src[min(max(gy - 8, 0), rows - 1)*(src_step >> 2) + min(max(gx - 8, 0), cols - 1)];
smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
// 0 | 0 x | x
// -----------
......@@ -73,7 +76,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
// 0 | 0 0 | 0
// -----------
// 0 | 0 0 | 0
smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];
smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
// 0 | 0 0 | 0
// -----------
......@@ -81,7 +84,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
// x | x 0 | 0
// -----------
// x | x 0 | 0
smem[y + 16][x] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(max(gx - 8, 0), cols - 1)];
smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
// 0 | 0 0 | 0
// -----------
......@@ -89,21 +92,18 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
// 0 | 0 x | x
// -----------
// 0 | 0 x | x
smem[y + 16][x + 16] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];
smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
barrier(CLK_LOCAL_MEM_FENCE);
if (gx < cols && gy < rows)
{
float res = 0;
float res = 0;
for (int i = 0; i < kHeight; ++i)
{
for (int j = 0; j < kWidth; ++j)
{
res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * (k_step>>2) + j];
}
}
dst[gy*(dst_step >> 2)+gx] = res;
}
res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset];
dst[gy * dst_step + gx + dst_offset] = res;
}
}
......@@ -34,6 +34,13 @@
//
//
// Enable a double-precision (fp64) extension only when DOUBLE_SUPPORT is defined.
// cl_khr_fp64 is checked first; cl_amd_fp64 is used only if the former is not defined.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
#ifdef BORDER_CONSTANT
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
......
This diff is collapsed.
......@@ -62,8 +62,7 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
int, // border type, or iteration
bool) // roi or not
{
int type, borderType;
int ksize;
int type, borderType, ksize;
bool useRoi;
Mat src, dst_whole, src_roi, dst_roi;
......@@ -92,8 +91,12 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
// Checks that gdst_whole and gdst_roi match dst_whole and dst_roi respectively.
// threshold is forwarded unchanged to EXPECT_MAT_NEAR as the allowed
// difference (defaults to 0.0).
void Near(double threshold = 0.0)
{
    // Download each result into a host Mat explicitly, then compare once.
    Mat roi, whole;
    gdst_whole.download(whole);
    gdst_roi.download(roi);

    EXPECT_MAT_NEAR(dst_whole, whole, threshold);
    EXPECT_MAT_NEAR(dst_roi, roi, threshold);
}
};
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment