Commit 6ae9870d authored by Roman Donchenko, committed by OpenCV Buildbot

Merge pull request #1254 from pengx17:2.4_filter2d_

parents bcba3fc6 124ede61
......@@ -691,7 +691,7 @@ namespace cv
//! returns 2D filter with the specified kernel
// supports CV_8UC1 and CV_8UC4 types
CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
//! returns the non-separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel,
......
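// Sketch of typical caller-side usage of the new signature (hypothetical names, mirroring the
// Filter2D test at the bottom of this diff):
//     cv::Mat host_src = cv::imread("input.png");              // hypothetical input image
//     cv::Mat kernel = cv::Mat::ones(3, 3, CV_32FC1) / 9.0f;   // simple 3x3 box kernel
//     cv::ocl::oclMat gsrc(host_src), gdst;
//     cv::ocl::filter2D(gsrc, gdst, -1, kernel, cv::Point(-1, -1), cv::BORDER_REFLECT_101);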
......@@ -572,7 +572,7 @@ void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &ke
namespace
{
typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , oclMat & , Size &, const Point, const int);
typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , const oclMat & , const Size &, const Point&, const int);
class LinearFilter_GPU : public BaseFilter_GPU
{
......@@ -591,21 +591,22 @@ public:
};
}
static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
Size &ksize, const Point anchor, const int borderType)
static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel,
const Size &ksize, const Point& anchor, const int borderType)
{
CV_Assert(src.clCxt == dst.clCxt);
CV_Assert((src.cols == dst.cols) &&
(src.rows == dst.rows));
CV_Assert((src.oclchannels() == dst.oclchannels()));
CV_Assert((borderType != 0));
CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
CV_Assert(ksize.width == ksize.height);
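// The checks above mean the kernel must be square, odd-sized and centre-anchored:
// e.g. for a 5x5 kernel only anchor == (-1, -1) or anchor == (2, 2) is accepted.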
Context *clCxt = src.clCxt;
int cn = src.oclchannels();
int depth = src.depth();
string kernelName = "filter2D";
int filterWidth = ksize.width;
bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4; // CV_32FC4 is not tuned up with filter2d_3x3 kernel
string kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D";
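// 3x3 kernels (except on CV_32FC4 inputs) are routed to the specialised "filter2D_3x3"
// OpenCL kernel; all other sizes fall back to the generic "filter2D" kernel.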
size_t src_offset_x = (src.offset % src.step) / src.elemSize();
size_t src_offset_y = src.offset / src.step;
......@@ -613,55 +614,81 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
size_t dst_offset_x = (dst.offset % dst.step) / dst.elemSize();
size_t dst_offset_y = dst.offset / dst.step;
int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
{4, 4, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1},
{4, 4, 4, 4, 1, 1, 4}
};
int paddingPixels = filterWidth & (-2);
size_t localThreads[3] = {ksize_3x3 ? 256 : 16, ksize_3x3 ? 1 : 16, 1};
size_t globalThreads[3] = {src.wholecols, src.wholerows, 1};
int vector_length = vector_lengths[cn - 1][depth];
int offset_cols = (dst_offset_x) & (vector_length - 1);
int cols = dst.cols + offset_cols;
int rows = divUp(dst.rows, vector_length);
int cn = src.oclchannels();
int src_step = (int)(src.step/src.elemSize());
int dst_step = (int)(dst.step/src.elemSize());
int localWidth = localThreads[0] + paddingPixels;
int localHeight = localThreads[1] + paddingPixels;
size_t localThreads[3] = {256, 1, 1};
size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
divUp(rows, localThreads[1]) *localThreads[1], 1
};
// 260 = divup((localThreads[0] + filterWidth * 2), 4) * 4
// 6 = (ROWS_PER_GROUP_WHICH_IS_4 + filterWidth * 2)
size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize();
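// Local-memory sizing: the generic path caches a (16 + paddingPixels) x (16 + paddingPixels)
// tile of src per 16x16 work-group (e.g. a 20x20 tile for a 5x5 kernel), while the 3x3 path
// uses a fixed 260x6-element stripe for its 256x1 work-groups (see LOCAL_MEM_STEP in the .cl).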
int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
{4, 4, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1},
{4, 4, 4, 4, 1, 1, 4}
};
int cols = dst.cols + ((dst_offset_x) & (vector_lengths[cn - 1][src.depth()] - 1));
vector< pair<size_t, const void *> > args;
args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.step));
args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back(make_pair(sizeof(cl_int), (void *)&src_step));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step));
args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
args.push_back(make_pair(localMemSize, (void *)NULL));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_x));
args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_y));
args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_x));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_y));
args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
const int buffer_size = 100;
char opt_buffer [buffer_size] = "";
sprintf(opt_buffer, "-DANCHOR=%d -DANX=%d -DANY=%d", ksize.width, anchor.x, anchor.y);
openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth, opt_buffer);
char btype[30];
switch (borderType)
{
case 0:
sprintf(btype, "BORDER_CONSTANT");
break;
case 1:
sprintf(btype, "BORDER_REPLICATE");
break;
case 2:
sprintf(btype, "BORDER_REFLECT");
break;
case 3:
CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
return;
case 4:
sprintf(btype, "BORDER_REFLECT_101");
break;
}
int type = src.depth();
char build_options[150];
sprintf(build_options, "-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d", btype, cn, type, cn, ksize.width);
openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
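// Example of the generated build options for an 8UC1 source, 5x5 kernel and BORDER_REPLICATE:
//     "-D BORDER_REPLICATE -D IMG_C_1_0 -D CN=1 -D FILTER_SIZE=5"
// which selects the pixel-type macros and border handling in the OpenCL source (filtering_laplacian).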
}
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
Point anchor, int borderType)
const Point &anchor, int borderType)
{
static const GPUFilter2D_t GPUFilter2D_callers[] = {0, GPUFilter2D, 0, GPUFilter2D, GPUFilter2D};
CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType);
oclMat gpu_krnl;
int nDivisor;
normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, false);
normalizeAnchor(anchor, ksize);
Point norm_anchor = anchor;
normalizeKernel(kernel, gpu_krnl, CV_32FC1);
normalizeAnchor(norm_anchor, ksize);
return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)],
borderType));
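// The kernel is now uploaded as CV_32FC1 (no CV_32S/nDivisor fixed-point scaling), and the
// caller table is indexed by channel count, so 1-, 3- and 4-channel images all dispatch to
// GPUFilter2D.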
......
......@@ -15,7 +15,9 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Pang Erping, erping@multicorewareinc.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
......@@ -42,292 +44,228 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#define BORDER_REFLECT_101
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
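//For example, with BORDER_REFLECT_101 and l_edge == 0: ADDR_L(-1, 0, w) -> 1 and
//ADDR_L(-2, 0, w) -> 2, while ADDR_R(w, w, w) -> w-2, i.e. reflection around the edge pixel
//without repeating it. The rewritten macros also handle a non-zero left/top edge.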
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#ifdef IMG_C_1_0
#define T_IMG uchar
#define T_IMGx4 uchar4
#define T_IMG_C1 uchar
#define CONVERT_TYPE convert_uchar_sat
#define CONVERT_TYPEx4 convert_uchar4_sat
#endif
#ifdef IMG_C_4_0
#define T_IMG uchar4
#define T_IMGx4 uchar16
#define T_IMG_C1 uchar
#define CONVERT_TYPE convert_uchar4_sat
#define CONVERT_TYPEx4 convert_uchar16_sat
#endif
#ifdef IMG_C_1_5
#define T_IMG float
#define T_IMGx4 float4
#define T_IMG_C1 float
#define CONVERT_TYPE convert_float
#define CONVERT_TYPEx4 convert_float4
#endif
#ifdef IMG_C_4_5
#define T_IMG float4
#define T_IMGx4 float16
#define T_IMG_C1 float
#define CONVERT_TYPE convert_float4
#define CONVERT_TYPEx4 convert_float16
#endif
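//The host passes -D IMG_C_<channels>_<depth> (depth 0 = CV_8U, 5 = CV_32F), so IMG_C_4_0
//selects uchar4 pixels and IMG_C_1_5 selects float pixels, together with the matching
//saturating/converting casts.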
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro for define elements number per thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//#define ANCHOR 3
//#define ANX 1
//#define ANY 1
#define ROWS_PER_GROUP 4
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
#ifndef CN
#define CN 1
#endif
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2
#if CN == 1
#define T_SUM float
#define T_SUMx4 float4
#define CONVERT_TYPE_SUM convert_float
#define CONVERT_TYPE_SUMx4 convert_float4
#define SUM_ZERO (0.0f)
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
#define VLOAD4 vload4
#define SX x
#define SY y
#define SZ z
#define SW w
#elif CN == 4
#define T_SUM float4
#define T_SUMx4 float16
#define CONVERT_TYPE_SUM convert_float4
#define CONVERT_TYPE_SUMx4 convert_float16
#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
#define VLOAD4 vload16
#define SX s0123
#define SY s4567
#define SZ s89ab
#define SW scdef
#endif
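//Accumulation is always done in floating point: with CN == 1 each pixel accumulates into a
//float (float4 for the 4-pixel-wide 3x3 path), with CN == 4 into float4 (float16 respectively).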
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
#ifndef FILTER_SIZE
#define FILTER_SIZE 3
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
#define LOCAL_GROUP_SIZE 16
#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
#define FILTER_RADIUS (FILTER_SIZE >> 1)
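//With the 16x16 work-group used by the generic kernel, LOCAL_WIDTH == LOCAL_HEIGHT ==
//FILTER_SIZE - 1 + 16 (e.g. 20 for a 5x5 kernel), matching the local-memory tile size
//computed on the host side.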
__kernel void filter2D(
__global T_IMG *src,
__global T_IMG *dst,
int src_step,
int dst_step,
__constant float *mat_kernel,
__local T_IMG *local_data,
int wholerows,
int wholecols,
int src_offset_x,
int src_offset_y,
int dst_offset_x,
int dst_offset_y,
int cols,
int rows,
int operate_cols
)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
int lX = get_local_id(0);
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if((gY << 2) < rows)
int groupStartCol = get_group_id(0) * get_local_size(0);
int groupStartRow = get_group_id(1) * get_local_size(1);
int localCol = get_local_id(0);
int localRow = get_local_id(1);
int globalCol = groupStartCol + localCol;
int globalRow = groupStartRow + localRow;
const int src_offset = mad24(src_offset_y, src_step, src_offset_x);
const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x);
#ifdef BORDER_CONSTANT
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
{
for(int i = 0; i < ROWS_FETCH; ++i)
int curRow = groupStartRow + i;
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0))
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
int curCol = groupStartCol + j;
if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y||
curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
uchar data = *(src + selected_row * src_step + selected_cols);
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
data = *(src + selected_row * src_step + selected_cols);
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
uchar data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
local_data[(i) * LOCAL_WIDTH + j] = 0;
}
else
{
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
#else
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
{
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
int curRow = groupStartRow + i;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index));
curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y);
int4 sum = (int4)(0);
uchar4 data;
curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS);
for(int i = 0; i < ANCHOR; i++)
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0))
{
#pragma unroll
for(int j = 0; j < ANCHOR; j++)
int curCol = groupStartCol + j;
curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x);
curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS);
if(curRow < wholerows && curCol < wholecols)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
}
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset];
}
}
if(dst_rows_index < dst_rows_end)
{
sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
__global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
int lX = get_local_id(0);
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local float local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
#endif
barrier(CLK_LOCAL_MEM_FENCE);
if(globalRow < rows && globalCol < cols)
{
for(int i = 0; i < ROWS_FETCH; ++i)
T_SUM sum = (T_SUM)SUM_ZERO;
int filterIdx = 0;
for(int i = 0; i < FILTER_SIZE; i++)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
int offset = (i + localRow) * LOCAL_WIDTH;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
for(int j = 0; j < FILTER_SIZE; j++)
{
sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++];
}
}
dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum);
}
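//Each work-item convolves exactly one output pixel against the FILTER_SIZE x FILTER_SIZE
//window of the cached tile; mat_kernel is read row-major via filterIdx++ above.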
barrier(CLK_LOCAL_MEM_FENCE);
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
}
float4 dst_data = *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
/// following is specific for 3x3 kernels
float4 sum = (float4)(0);
float4 data;
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro for define elements number per thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ANX 1
#define ANY 1
for(int i = 0; i < ANCHOR; i++)
{
#pragma unroll
for(int j = 0; j < ANCHOR; j++)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
#define ROWS_PER_GROUP 4
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data);
}
}
}
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
if(dst_rows_index < dst_rows_end)
{
sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
}
}
}
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
__kernel void filter2D_3x3(
__global T_IMG *src,
__global T_IMG *dst,
int src_step,
int dst_step,
__constant float *mat_kernel,
__local T_IMG *local_data,
int wholerows,
int wholecols,
int src_offset_x,
int src_offset_y,
int dst_offset_x,
int dst_offset_y,
int cols,
int rows,
int operate_cols
)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
......@@ -341,9 +279,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
if((gY << 2) < rows)
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
......@@ -353,19 +289,19 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = src[mad24(selected_row, src_step, selected_cols)];
con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
......@@ -374,17 +310,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
local_data[i * LOCAL_MEM_STEP + lX] =data;
local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
data = src[mad24(selected_row, src_step, selected_cols)];
local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
}
#endif
}
......@@ -401,131 +337,40 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);
uchar16 dst_data;
dst_data = *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;
int16 sum = (int16)(0);
uchar16 data;
T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
T_IMGx4 data;
for(int i = 0; i < ANCHOR; i++)
for(int i = 0; i < FILTER_SIZE; i++)
{
#pragma unroll
for(int j = 0; j < ANCHOR; j++)
for(int j = 0; j < FILTER_SIZE; j++)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
}
}
}
if(dst_rows_index < dst_rows_end)
{
uchar16 sum1 = convert_uchar16_sat(sum);
sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
sum1.s0123 : dst_data.s0123;
sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
sum1.s4567 : dst_data.s4567;
sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
sum1.s89ab : dst_data.s89ab;
sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
sum1.scdef : dst_data.scdef;
*((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
tmp_dst.SX : dst_data.SX;
tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
tmp_dst.SY : dst_data.SY;
tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
tmp_dst.SZ : dst_data.SZ;
tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
tmp_dst.SW : dst_data.SW;
*(__global T_IMGx4 *)dst = tmp_dst;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWS_FETCH_C4 (1 + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
#define LOCAL_MEM_STEP_C4 260 //divup((get_local_size(0) + anX * 2), 4) * 4)
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
__global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
int lX = get_local_id(0);
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + gY - ANY;
__local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4];
if((gY < rows) && (gX < (operate_cols + ANX + ANX)))
{
for(int i = 0; i < ROWS_FETCH_C4; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
}
#endif
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if((gY < rows) && (gX < operate_cols))
{
int dst_cols_index = dst_offset_x + gX;
int dst_rows_index = dst_offset_y + gY;
float4 sum = (float4)(0);
for(int i = 0; i < ANCHOR; i++)
{
for(int j = 0; j < ANCHOR; j++)
{
int local_cols = lX + j;
sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]);
}
}
*((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
}
}
......@@ -324,6 +324,35 @@ TEST_P(GaussianBlur, Mat)
////////////////////////////////////////////////////////////////////////////////////////////////////
// Filter2D
struct Filter2D : FilterTestBase
{
int type;
cv::Size ksize;
int bordertype;
Point anchor;
virtual void SetUp()
{
type = GET_PARAM(0);
ksize = GET_PARAM(1);
bordertype = GET_PARAM(3);
Init(type);
anchor = Point(-1,-1);
}
};
TEST_P(Filter2D, Mat)
{
cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::filter2D(mat1_roi, dst_roi, -1, kernel, anchor, 0.0, bordertype);
cv::ocl::filter2D(gmat1, gdst, -1, kernel, anchor, bordertype);
Near(1);
}
}
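// The OCL output is compared against the CPU cv::filter2D reference on random ROIs, with a
// tolerance of 1 (Near(1)).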
INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)),
......@@ -331,7 +360,7 @@ INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine(
INSTANTIATE_TEST_CASE_P(Filter, Laplacian, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(Size(3, 3)),
Values(Size(0, 0)), //not use
......@@ -365,4 +394,10 @@ INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
INSTANTIATE_TEST_CASE_P(Filter, Filter2D, testing::Combine(
Values(CV_8UC1, CV_32FC1, CV_32FC4),
Values(Size(3, 3), Size(15, 15), Size(25, 25)),
Values(Size(0, 0)), //not use
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REFLECT101, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT)));
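// Kernel sizes 3, 15 and 25 exercise both the specialised filter2D_3x3 path and the generic
// filter2D path.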
#endif // HAVE_OPENCL