Commit 97dfd650 authored by Andrey Pavlenko's avatar Andrey Pavlenko Committed by OpenCV Buildbot

Merge pull request #1626 from ilya-lavrenov:ocl_filters

parents 5e24376a da3b7c79
...@@ -666,3 +666,17 @@ Performs linear blending of two images. ...@@ -666,3 +666,17 @@ Performs linear blending of two images.
:param weights2: Weights for second image. Must have tha same size as ``img2`` . Supports only ``CV_32F`` type. :param weights2: Weights for second image. Must have tha same size as ``img2`` . Supports only ``CV_32F`` type.
:param result: Destination image. :param result: Destination image.
ocl::medianFilter
--------------------
Blurs an image using the median filter.
.. ocv:function:: void ocl::medianFilter(const oclMat &src, oclMat &dst, int m)
:param src: input ```1-``` or ```4```-channel image; the image depth should be ```CV_8U```, ```CV_32F```.
:param dst: destination array of the same size and type as ```src```.
:param m: aperture linear size; it must be odd and greater than ```1```. Currently only ```3```, ```5``` are supported.
The function smoothes an image using the median filter with the \texttt{m} \times \texttt{m} aperture. Each channel of a multi-channel image is processed independently. In-place operation is supported.
...@@ -839,11 +839,8 @@ namespace cv ...@@ -839,11 +839,8 @@ namespace cv
//! Applies a generic geometrical transformation to an image. //! Applies a generic geometrical transformation to an image.
// Supports INTER_NEAREST, INTER_LINEAR. // Supports INTER_NEAREST, INTER_LINEAR.
// Map1 supports CV_16SC2, CV_32FC2 types. // Map1 supports CV_16SC2, CV_32FC2 types.
// Src supports CV_8UC1, CV_8UC2, CV_8UC4. // Src supports CV_8UC1, CV_8UC2, CV_8UC4.
CV_EXPORTS void remap(const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int bordertype, const Scalar &value = Scalar()); CV_EXPORTS void remap(const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int bordertype, const Scalar &value = Scalar());
//! copies 2D array to a larger destination array and pads borders with user-specifiable constant //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
...@@ -851,7 +848,7 @@ namespace cv ...@@ -851,7 +848,7 @@ namespace cv
CV_EXPORTS void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar()); CV_EXPORTS void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar());
//! Smoothes image using median filter //! Smoothes image using median filter
// The source 1- or 4-channel image. When m is 3 or 5, the image depth should be CV 8U or CV 32F. // The source 1- or 4-channel image. m should be 3 or 5, the image depth should be CV_8U or CV_32F.
CV_EXPORTS void medianFilter(const oclMat &src, oclMat &dst, int m); CV_EXPORTS void medianFilter(const oclMat &src, oclMat &dst, int m);
//! warps the image using affine transformation //! warps the image using affine transformation
......
...@@ -197,8 +197,8 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, ...@@ -197,8 +197,8 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
(src.rows == dst.rows)); (src.rows == dst.rows));
CV_Assert((src.oclchannels() == dst.oclchannels())); CV_Assert((src.oclchannels() == dst.oclchannels()));
int srcStep = src.step1() / src.oclchannels(); int srcStep = src.step / src.elemSize();
int dstStep = dst.step1() / dst.oclchannels(); int dstStep = dst.step / dst.elemSize();
int srcOffset = src.offset / src.elemSize(); int srcOffset = src.offset / src.elemSize();
int dstOffset = dst.offset / dst.elemSize(); int dstOffset = dst.offset / dst.elemSize();
...@@ -247,6 +247,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, ...@@ -247,6 +247,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s", sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s",
anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
s, rectKernel?"-D RECTKERNEL":""); s, rectKernel?"-D RECTKERNEL":"");
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
...@@ -260,6 +261,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, ...@@ -260,6 +261,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset)); args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset));
openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
} }
...@@ -351,7 +353,7 @@ Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat ...@@ -351,7 +353,7 @@ Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat
}; };
CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE); CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
CV_Assert(type == CV_8UC1 || type == CV_8UC3 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC1 || type == CV_32FC4); CV_Assert(type == CV_8UC1 || type == CV_8UC3 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC3 || type == CV_32FC4);
oclMat gpu_krnl; oclMat gpu_krnl;
normalizeKernel(kernel, gpu_krnl); normalizeKernel(kernel, gpu_krnl);
...@@ -361,9 +363,11 @@ Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat ...@@ -361,9 +363,11 @@ Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat
for(int i = 0; i < kernel.rows * kernel.cols; ++i) for(int i = 0; i < kernel.rows * kernel.cols; ++i)
if(kernel.data[i] != 1) if(kernel.data[i] != 1)
noZero = false; noZero = false;
MorphFilter_GPU* mfgpu=new MorphFilter_GPU(ksize, anchor, gpu_krnl, GPUMorfFilter_callers[op][CV_MAT_CN(type)]);
MorphFilter_GPU* mfgpu = new MorphFilter_GPU(ksize, anchor, gpu_krnl, GPUMorfFilter_callers[op][CV_MAT_CN(type)]);
if(noZero) if(noZero)
mfgpu->rectKernel = true; mfgpu->rectKernel = true;
return Ptr<BaseFilter_GPU>(mfgpu); return Ptr<BaseFilter_GPU>(mfgpu);
} }
...@@ -445,9 +449,7 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a ...@@ -445,9 +449,7 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a
iterations = 1; iterations = 1;
} }
else else
{
kernel = _kernel; kernel = _kernel;
}
Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations);
...@@ -462,14 +464,10 @@ void cv::ocl::erode(const oclMat &src, oclMat &dst, const Mat &kernel, Point anc ...@@ -462,14 +464,10 @@ void cv::ocl::erode(const oclMat &src, oclMat &dst, const Mat &kernel, Point anc
for (int i = 0; i < kernel.rows * kernel.cols; ++i) for (int i = 0; i < kernel.rows * kernel.cols; ++i)
if (kernel.data[i] != 0) if (kernel.data[i] != 0)
{
allZero = false; allZero = false;
}
if (allZero) if (allZero)
{
kernel.data[0] = 1; kernel.data[0] = 1;
}
morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue); morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue);
} }
...@@ -558,7 +556,7 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel ...@@ -558,7 +556,7 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
int filterWidth = ksize.width; int filterWidth = ksize.width;
bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4; // CV_32FC4 is not tuned up with filter2d_3x3 kernel bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4 && src.type() != CV_32FC3; // CV_32FC4 is not tuned up with filter2d_3x3 kernel
string kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D"; string kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D";
...@@ -649,9 +647,7 @@ Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const ...@@ -649,9 +647,7 @@ Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const
Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor, Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor,
int borderType) int borderType)
{ {
Size ksize = kernel.size(); Size ksize = kernel.size();
Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor, borderType); Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor, borderType);
return createFilter2D_GPU(linearFilter); return createFilter2D_GPU(linearFilter);
...@@ -659,11 +655,8 @@ Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType, ...@@ -659,11 +655,8 @@ Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType,
void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, int borderType) void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, int borderType)
{ {
if (ddepth < 0) if (ddepth < 0)
{
ddepth = src.depth(); ddepth = src.depth();
}
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
...@@ -1444,9 +1437,7 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do ...@@ -1444,9 +1437,7 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
int depth = CV_MAT_DEPTH(type); int depth = CV_MAT_DEPTH(type);
if (sigma2 <= 0) if (sigma2 <= 0)
{
sigma2 = sigma1; sigma2 = sigma1;
}
// automatic detection of kernel size from sigma // automatic detection of kernel size from sigma
if (ksize.width <= 0 && sigma1 > 0) if (ksize.width <= 0 && sigma1 > 0)
......
...@@ -408,20 +408,11 @@ namespace cv ...@@ -408,20 +408,11 @@ namespace cv
void medianFilter(const oclMat &src, oclMat &dst, int m) void medianFilter(const oclMat &src, oclMat &dst, int m)
{ {
CV_Assert( m % 2 == 1 && m > 1 ); CV_Assert( m % 2 == 1 && m > 1 );
CV_Assert( m <= 5 || src.depth() == CV_8U ); CV_Assert( (src.depth() == CV_8U || src.depth() == CV_32F) && (src.channels() == 1 || src.channels() == 4));
CV_Assert( src.cols <= dst.cols && src.rows <= dst.rows ); dst.create(src.size(), src.type());
if (src.data == dst.data)
{
oclMat src1;
src.copyTo(src1);
return medianFilter(src1, dst, m);
}
int srcStep = src.step1() / src.oclchannels(); int srcStep = src.step / src.elemSize(), dstStep = dst.step / dst.elemSize();
int dstStep = dst.step1() / dst.oclchannels(); int srcOffset = src.offset / src.elemSize(), dstOffset = dst.offset / dst.elemSize();
int srcOffset = src.offset / src.oclchannels() / src.elemSize1();
int dstOffset = dst.offset / dst.oclchannels() / dst.elemSize1();
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
...@@ -1518,6 +1509,7 @@ namespace cv ...@@ -1518,6 +1509,7 @@ namespace cv
float *color_weight = &_color_weight[0]; float *color_weight = &_color_weight[0];
float *space_weight = &_space_weight[0]; float *space_weight = &_space_weight[0];
int *space_ofs = &_space_ofs[0]; int *space_ofs = &_space_ofs[0];
int dst_step_in_pixel = dst.step / dst.elemSize(); int dst_step_in_pixel = dst.step / dst.elemSize();
int dst_offset_in_pixel = dst.offset / dst.elemSize(); int dst_offset_in_pixel = dst.offset / dst.elemSize();
int temp_step_in_pixel = temp.step / temp.elemSize(); int temp_step_in_pixel = temp.step / temp.elemSize();
...@@ -1548,7 +1540,7 @@ namespace cv ...@@ -1548,7 +1540,7 @@ namespace cv
if ((dst.type() == CV_8UC1) && ((dst.offset & 3) == 0) && ((dst.cols & 3) == 0)) if ((dst.type() == CV_8UC1) && ((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
{ {
kernelName = "bilateral2"; kernelName = "bilateral2";
globalThreads[0] = dst.cols / 4; globalThreads[0] = dst.cols >> 2;
} }
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
...@@ -1566,15 +1558,17 @@ namespace cv ...@@ -1566,15 +1558,17 @@ namespace cv
args.push_back( make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data ));
openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, dst.oclchannels(), dst.depth()); openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, dst.oclchannels(), dst.depth());
} }
void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType) void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType)
{ {
dst.create( src.size(), src.type() ); dst.create( src.size(), src.type() );
if ( src.depth() == CV_8U ) if ( src.depth() == CV_8U )
oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType ); oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType );
else else
CV_Error( CV_StsUnsupportedFormat, "Bilateral filtering is only implemented for 8uimages" ); CV_Error( CV_StsUnsupportedFormat, "Bilateral filtering is only implemented for CV_8U images" );
} }
} }
......
...@@ -169,6 +169,7 @@ __kernel void filter2D( ...@@ -169,6 +169,7 @@ __kernel void filter2D(
int globalRow = groupStartRow + localRow; int globalRow = groupStartRow + localRow;
const int src_offset = mad24(src_offset_y, src_step, src_offset_x); const int src_offset = mad24(src_offset_y, src_step, src_offset_x);
const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x); const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x);
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
{ {
...@@ -208,6 +209,7 @@ __kernel void filter2D( ...@@ -208,6 +209,7 @@ __kernel void filter2D(
} }
} }
#endif #endif
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if(globalRow < rows && globalCol < cols) if(globalRow < rows && globalCol < cols)
{ {
...@@ -231,6 +233,7 @@ __kernel void filter2D( ...@@ -231,6 +233,7 @@ __kernel void filter2D(
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro for define elements number per thread///////////////////////////// /////////////////////////////Macro for define elements number per thread/////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
#define ANX 1 #define ANX 1
#define ANY 1 #define ANY 1
...@@ -249,6 +252,7 @@ __kernel void filter2D( ...@@ -249,6 +252,7 @@ __kernel void filter2D(
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// /////////////////////////////////////////8uC1////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_3x3( __kernel void filter2D_3x3(
__global T_IMG *src, __global T_IMG *src,
__global T_IMG *dst, __global T_IMG *dst,
...@@ -359,6 +363,7 @@ __kernel void filter2D_3x3( ...@@ -359,6 +363,7 @@ __kernel void filter2D_3x3(
} }
} }
} }
if(dst_rows_index < dst_rows_end) if(dst_rows_index < dst_rows_end)
{ {
T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum); T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
......
...@@ -45,6 +45,7 @@ ...@@ -45,6 +45,7 @@
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii //BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2) #define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
#ifndef GENTYPE #ifndef GENTYPE
__kernel void morph_C1_D0(__global const uchar * restrict src, __kernel void morph_C1_D0(__global const uchar * restrict src,
__global uchar *dst, __global uchar *dst,
int src_offset_x, int src_offset_y, int src_offset_x, int src_offset_y,
...@@ -150,7 +151,9 @@ __kernel void morph_C1_D0(__global const uchar * restrict src, ...@@ -150,7 +151,9 @@ __kernel void morph_C1_D0(__global const uchar * restrict src,
} }
} }
} }
#else #else
__kernel void morph(__global const GENTYPE * restrict src, __kernel void morph(__global const GENTYPE * restrict src,
__global GENTYPE *dst, __global GENTYPE *dst,
int src_offset_x, int src_offset_y, int src_offset_x, int src_offset_y,
...@@ -221,4 +224,5 @@ __kernel void morph(__global const GENTYPE * restrict src, ...@@ -221,4 +224,5 @@ __kernel void morph(__global const GENTYPE * restrict src,
dst[out_addr] = res; dst[out_addr] = res;
} }
} }
#endif #endif
...@@ -47,25 +47,27 @@ __kernel void bilateral_C1_D0(__global uchar *dst, ...@@ -47,25 +47,27 @@ __kernel void bilateral_C1_D0(__global uchar *dst,
__constant float *space_weight, __constant float *space_weight,
__constant int *space_ofs) __constant int *space_ofs)
{ {
int gidx = get_global_id(0); int x = get_global_id(0);
int gidy = get_global_id(1); int y = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
if (y < dst_rows && x < dst_cols)
{ {
int src_addr = mad24(gidy+radius,src_step,gidx+radius); int src_index = mad24(y + radius, src_step, x + radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset); int dst_index = mad24(y, dst_step, x + dst_offset);
float sum = 0.f, wsum = 0.f; float sum = 0.f, wsum = 0.f;
int val0 = (int)src[src_addr]; int val0 = (int)src[src_index];
for(int k = 0; k < maxk; k++ ) for(int k = 0; k < maxk; k++ )
{ {
int val = (int)src[src_addr + space_ofs[k]]; int val = (int)src[src_index + space_ofs[k]];
float w = space_weight[k]*color_weight[abs(val - val0)]; float w = space_weight[k] * color_weight[abs(val - val0)];
sum += (float)(val)*w; sum += (float)(val) * w;
wsum += w; wsum += w;
} }
dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f); dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
} }
} }
__kernel void bilateral2_C1_D0(__global uchar *dst, __kernel void bilateral2_C1_D0(__global uchar *dst,
__global const uchar *src, __global const uchar *src,
const int dst_rows, const int dst_rows,
...@@ -81,25 +83,28 @@ __kernel void bilateral2_C1_D0(__global uchar *dst, ...@@ -81,25 +83,28 @@ __kernel void bilateral2_C1_D0(__global uchar *dst,
__constant float *space_weight, __constant float *space_weight,
__constant int *space_ofs) __constant int *space_ofs)
{ {
int gidx = get_global_id(0)<<2; int x = get_global_id(0) << 2;
int gidy = get_global_id(1); int y = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
if (y < dst_rows && x < dst_cols)
{ {
int src_addr = mad24(gidy+radius,src_step,gidx+radius); int src_index = mad24(y + radius, src_step, x + radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset); int dst_index = mad24(y, dst_step, x + dst_offset);
float4 sum = (float4)(0.f), wsum = (float4)(0.f); float4 sum = (float4)(0.f), wsum = (float4)(0.f);
int4 val0 = convert_int4(vload4(0,src+src_addr)); int4 val0 = convert_int4(vload4(0,src + src_index));
for(int k = 0; k < maxk; k++ ) for(int k = 0; k < maxk; k++ )
{ {
int4 val = convert_int4(vload4(0,src+src_addr + space_ofs[k])); int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k]));
float4 w = (float4)(space_weight[k])*(float4)(color_weight[abs(val.x - val0.x)],color_weight[abs(val.y - val0.y)],color_weight[abs(val.z - val0.z)],color_weight[abs(val.w - val0.w)]); float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - val0.x)], color_weight[abs(val.y - val0.y)],
sum += convert_float4(val)*w; color_weight[abs(val.z - val0.z)], color_weight[abs(val.w - val0.w)]);
sum += convert_float4(val) * w;
wsum += w; wsum += w;
} }
*(__global uchar4*)(dst+dst_addr) = convert_uchar4_rtz(sum/wsum+0.5f); *(__global uchar4*)(dst+dst_index) = convert_uchar4_rtz(sum/wsum+0.5f);
} }
} }
__kernel void bilateral_C4_D0(__global uchar4 *dst, __kernel void bilateral_C4_D0(__global uchar4 *dst,
__global const uchar4 *src, __global const uchar4 *src,
const int dst_rows, const int dst_rows,
...@@ -115,24 +120,26 @@ __kernel void bilateral_C4_D0(__global uchar4 *dst, ...@@ -115,24 +120,26 @@ __kernel void bilateral_C4_D0(__global uchar4 *dst,
__constant float *space_weight, __constant float *space_weight,
__constant int *space_ofs) __constant int *space_ofs)
{ {
int gidx = get_global_id(0); int x = get_global_id(0);
int gidy = get_global_id(1); int y = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
if (y < dst_rows && x < dst_cols)
{ {
int src_addr = mad24(gidy+radius,src_step,gidx+radius); int src_index = mad24(y + radius, src_step, x + radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset); int dst_index = mad24(y, dst_step, x + dst_offset);
float4 sum = (float4)0.f; float4 sum = (float4)0.f;
float wsum = 0.f; float wsum = 0.f;
int4 val0 = convert_int4(src[src_addr]); int4 val0 = convert_int4(src[src_index]);
for(int k = 0; k < maxk; k++ ) for(int k = 0; k < maxk; k++ )
{ {
int4 val = convert_int4(src[src_addr + space_ofs[k]]); int4 val = convert_int4(src[src_index + space_ofs[k]]);
float w = space_weight[k]*color_weight[abs(val.x - val0.x)+abs(val.y - val0.y)+abs(val.z - val0.z)]; float w = space_weight[k] * color_weight[abs(val.x - val0.x) + abs(val.y - val0.y) + abs(val.z - val0.z)];
sum += convert_float4(val)*(float4)w; sum += convert_float4(val) * (float4)w;
wsum += w; wsum += w;
} }
wsum=1.f/wsum;
dst[dst_addr] = convert_uchar4_rtz(sum*(float4)wsum+(float4)0.5f); wsum = 1.f / wsum;
dst[dst_index] = convert_uchar4_rtz(sum * (float4)wsum + (float4)0.5f);
} }
} }
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment