Commit 396921dd authored by Li Peng's avatar Li Peng

5x5 gaussian blur optimization

Add new 5x5 gaussian blur kernel for CV_8UC1 format,
it is 50% ~ 70% faster than current ocl kernel in the perf test.
Signed-off-by: 's avatarLi Peng <peng.li@intel.com>
parent c48d7f86
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#define DIG(a) a,
__constant float kx[] = { KERNEL_MATRIX_X };
__constant float ky[] = { KERNEL_MATRIX_Y };
#define OP(y, x) (convert_float4(arr[y * 5 + x]) * ky[y] * kx[x])
#define FILL_ARR(s1, s2, n, e1, e2) \
arr[5 * n + 0] = row_s ? (uchar4)(s1, s2, line[n].s23) : (uchar4)(line[n].s0123); \
arr[5 * n + 1] = row_s ? (uchar4)(s2, line[n].s234) : (uchar4)(line[n].s1234); \
arr[5 * n + 2] = (uchar4)(line[n].s2345); \
arr[5 * n + 3] = row_e ? (uchar4)(line[n].s345, e1) : (uchar4)(line[n].s3456); \
arr[5 * n + 4] = row_e ? (uchar4)(line[n].s45, e1, e2) : (uchar4)(line[n].s4567);
__kernel void gaussianBlur5x5_8UC1_cols4(__global const uchar* src, int src_step,
__global uint* dst, int dst_step, int rows, int cols)
{
int x = get_global_id(0) * 4;
int y = get_global_id(1);
if (x >= cols || y >= rows) return;
uchar8 line[5];
int offset, src_index;
src_index = x + (y - 2) * src_step - 2;
offset = max(0, src_index + 2 * src_step);
line[2] = vload8(0, src + offset);
if (offset == 0) line[2] = (uchar8)(0, 0, line[2].s0123, line[2].s45);
#if defined BORDER_CONSTANT || defined BORDER_REPLICATE
uchar8 tmp;
#ifdef BORDER_CONSTANT
tmp = (uchar8)0;
#elif defined BORDER_REPLICATE
tmp = line[2];
#endif
line[0] = line[1] = tmp;
if (y > 1)
{
offset = max(0, src_index);
line[0] = vload8(0, src + offset);
if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
}
if (y > 0)
{
offset = max(0, src_index + src_step);
line[1] = vload8(0, src + offset);
if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
}
line[3] = (y == (rows - 1)) ? tmp : vload8(0, src + src_index + 3 * src_step);
line[4] = (y >= (rows - 2)) ? tmp : vload8(0, src + src_index + 4 * src_step);
#elif BORDER_REFLECT
int t;
t = (y <= 1) ? (abs(y - 1) - y + 2) : 0;
offset = max(0, src_index + t * src_step);
line[0] = vload8(0, src + offset);
if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
if (y == 0)
line[1] = line[2];
else
{
offset = max(0, src_index + 1 * src_step);
line[1] = vload8(0, src + offset);
if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[0].s45);
}
line[3] = (y == (rows - 1)) ? line[2] : vload8(0, src + src_index + 3 * src_step);
t = (y >= (rows - 2)) ? (abs(y - (rows - 1)) - (y - (rows - 2)) + 2) : 4;
line[4] = vload8(0, src + src_index + t * src_step);
#elif BORDER_REFLECT_101
if (y == 1)
line[0] = line[2];
else
{
offset = (y == 0) ? (src_index + 4 * src_step) : max(0, src_index);
line[0] = vload8(0, src + offset);
if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
}
offset = (y == 0) ? (src_index + 3 * src_step) : max(0, src_index + 1 * src_step);
line[1] = vload8(0, src + offset);
if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
line[3] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 3) * src_step);
if (y == (rows - 2))
line[4] = line[2];
else
{
line[4] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 4) * src_step);
}
#endif
bool row_s = (x == 0);
bool row_e = ((x + 4) == cols);
uchar4 arr[25];
uchar s, e;
#ifdef BORDER_CONSTANT
s = e = 0;
FILL_ARR(s, s, 0, e, e);
FILL_ARR(s, s, 1, e, e);
FILL_ARR(s, s, 2, e, e);
FILL_ARR(s, s, 3, e, e);
FILL_ARR(s, s, 4, e, e);
#elif defined BORDER_REPLICATE
s = line[0].s2;
e = line[0].s5;
FILL_ARR(s, s, 0, e, e);
s = line[1].s2;
e = line[1].s5;
FILL_ARR(s, s, 1, e, e);
s = line[2].s2;
e = line[2].s5;
FILL_ARR(s, s, 2, e, e);
s = line[3].s2;
e = line[3].s5;
FILL_ARR(s, s, 3, e, e);
s = line[4].s2;
e = line[4].s5;
FILL_ARR(s, s, 4, e, e);
#elif BORDER_REFLECT
uchar s1, s2;
uchar e1, e2;
s1 = line[0].s3;
s2 = line[0].s2;
e1 = line[0].s5;
e2 = line[0].s4;
FILL_ARR(s1, s2, 0, e1, e2);
s1 = line[1].s3;
s2 = line[1].s2;
e1 = line[1].s5;
e2 = line[1].s4;
FILL_ARR(s1, s2, 1, e1, e2);
s1 = line[2].s3;
s2 = line[2].s2;
e1 = line[2].s5;
e2 = line[2].s4;
FILL_ARR(s1, s2, 2, e1, e2);
s1 = line[3].s3;
s2 = line[3].s2;
e1 = line[3].s5;
e2 = line[3].s4;
FILL_ARR(s1, s2, 3, e1, e2);
s1 = line[4].s3;
s2 = line[4].s2;
e1 = line[4].s5;
e2 = line[4].s4;
FILL_ARR(s1, s2, 4, e1, e2);
#elif BORDER_REFLECT_101
s = line[0].s4;
e = line[0].s3;
FILL_ARR(s, e, 0, s, e);
s = line[1].s4;
e = line[1].s3;
FILL_ARR(s, e, 1, s, e);
s = line[2].s4;
e = line[2].s3;
FILL_ARR(s, e, 2, s, e);
s = line[3].s4;
e = line[3].s3;
FILL_ARR(s, e, 3, s, e);
s = line[4].s4;
e = line[4].s3;
FILL_ARR(s, e, 4, s, e);
#endif
float4 sum;
sum = OP(0, 0) + OP(0, 1) + OP(0, 2) + OP(0, 3) + OP(0, 4) +
OP(1, 0) + OP(1, 1) + OP(1, 2) + OP(1, 3) + OP(1, 4) +
OP(2, 0) + OP(2, 1) + OP(2, 2) + OP(2, 3) + OP(2, 4) +
OP(3, 0) + OP(3, 1) + OP(3, 2) + OP(3, 3) + OP(3, 4) +
OP(4, 0) + OP(4, 1) + OP(4, 2) + OP(4, 3) + OP(4, 4);
int dst_index = (x / 4) + y * (dst_step / 4);
dst[dst_index] = as_uint(convert_uchar4_sat_rte(sum));
}
......@@ -2135,15 +2135,16 @@ namespace cv
{
#ifdef HAVE_OPENCL
static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddepth,
InputArray _kernelX, InputArray _kernelY, int borderType)
static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth,
InputArray _kernelX, InputArray _kernelY, int borderType)
{
const ocl::Device & dev = ocl::Device::getDefault();
int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
if ( !(dev.isIntel() && (type == CV_8UC1) &&
(_src.offset() == 0) && (_src.step() % 4 == 0) &&
(_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)) )
((ksize.width == 5 && (_src.cols() % 4 == 0)) ||
(ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) )
return false;
Mat kernelX = _kernelX.getMat().reshape(1, 1);
......@@ -2160,8 +2161,16 @@ static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddep
size_t globalsize[2] = { 0, 0 };
size_t localsize[2] = { 0, 0 };
globalsize[0] = size.width / 16;
globalsize[1] = size.height / 2;
if (ksize.width == 3)
{
globalsize[0] = size.width / 16;
globalsize[1] = size.height / 2;
}
else if (ksize.width == 5)
{
globalsize[0] = size.width / 4;
globalsize[1] = size.height / 1;
}
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
char build_opts[1024];
......@@ -2169,7 +2178,13 @@ static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddep
ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(),
ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str());
ocl::Kernel kernel("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts);
ocl::Kernel kernel;
if (ksize.width == 3)
kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts);
else if (ksize.width == 5)
kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts);
if (kernel.empty())
return false;
......@@ -2436,9 +2451,10 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 &&
ksize.width == 3 && ksize.height == 3 &&
((ksize.width == 3 && ksize.height == 3) ||
(ksize.width == 5 && ksize.height == 5)) &&
(size_t)_src.rows() > ky.total() && (size_t)_src.cols() > kx.total(),
ocl_GaussianBlur3x3_8UC1(_src, _dst, CV_MAT_DEPTH(type), kx, ky, borderType));
ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType));
sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
}
......
......@@ -342,7 +342,7 @@ OCL_TEST_P(GaussianBlurTest, Mat)
}
}
PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType,
PARAM_TEST_CASE(GaussianBlur_multicols_Base, MatType,
int, // kernel size
Size, // dx, dy
BorderType, // border type
......@@ -372,11 +372,18 @@ PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType,
void random_roi()
{
size = Size(3, 3);
size = Size(ksize, ksize);
Size roiSize = randomSize(size.width, MAX_VALUE, size.height, MAX_VALUE);
roiSize.width = std::max(size.width + 13, roiSize.width & (~0xf));
roiSize.height = std::max(size.height + 1, roiSize.height & (~0x1));
if (ksize == 3)
{
roiSize.width = std::max((size.width + 15) & 0x10, roiSize.width & (~0xf));
roiSize.height = std::max(size.height + 1, roiSize.height & (~0x1));
}
else if (ksize == 5)
{
roiSize.width = std::max((size.width + 3) & 0x4, roiSize.width & (~0x3));
}
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256);
......@@ -402,9 +409,9 @@ PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType,
}
};
typedef GaussianBlur3x3_cols16_rows2_Base GaussianBlur3x3_cols16_rows2;
typedef GaussianBlur_multicols_Base GaussianBlur_multicols;
OCL_TEST_P(GaussianBlur3x3_cols16_rows2, Mat)
OCL_TEST_P(GaussianBlur_multicols, Mat)
{
Size kernelSize(ksize, ksize);
......@@ -710,9 +717,9 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
Bool(),
Values(1))); // not used
OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur3x3_cols16_rows2, Combine(
OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur_multicols, Combine(
Values((MatType)CV_8UC1),
Values(3), // kernel size
Values(3, 5), // kernel size
Values(Size(0, 0)), // not used
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
Values(0.0), // not used
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment