Commit 5e8a3cde authored by Alexander Alekhin's avatar Alexander Alekhin Committed by OpenCV Buildbot

Merge pull request #2757 from ElenaGvozdeva:ocl_flip

parents 72385979 437927b7
...@@ -610,6 +610,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, ...@@ -610,6 +610,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS }; enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
...@@ -628,9 +629,12 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) ...@@ -628,9 +629,12 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
else else
kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH; kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH;
ocl::Device dev = ocl::Device::getDefault();
int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
ocl::Kernel k(kernelName, ocl::core::flip_oclsrc, ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
format( "-D T=%s -D T1=%s -D cn=%d", ocl::memopTypeToStr(type), format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d", ocl::memopTypeToStr(type),
ocl::memopTypeToStr(depth), cn)); ocl::memopTypeToStr(depth), cn, pxPerWIy));
if (k.empty()) if (k.empty())
return false; return false;
...@@ -645,10 +649,13 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) ...@@ -645,10 +649,13 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
k.args(ocl::KernelArg::ReadOnlyNoSize(src), k.args(ocl::KernelArg::ReadOnlyNoSize(src),
ocl::KernelArg::WriteOnly(dst), rows, cols); ocl::KernelArg::WriteOnly(dst), rows, cols);
size_t maxWorkGroupSize = ocl::Device::getDefault().maxWorkGroupSize(); size_t maxWorkGroupSize = dev.maxWorkGroupSize();
CV_Assert(maxWorkGroupSize % 4 == 0); CV_Assert(maxWorkGroupSize % 4 == 0);
size_t globalsize[2] = { cols, rows }, localsize[2] = { maxWorkGroupSize / 4, 4 }; size_t globalsize[2] = { cols, rows }, localsize[2] = { maxWorkGroupSize / 4, 4 };
return k.run(2, globalsize, flipType == FLIP_COLS ? localsize : NULL, false); globalsize[1] = DIVUP(globalsize[1], pxPerWIy);
return k.run(2, globalsize, (flipType == FLIP_COLS) && (!dev.isIntel()) ? localsize : NULL, false);
} }
#endif #endif
......
...@@ -54,15 +54,29 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int ...@@ -54,15 +54,29 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int
int rows, int cols, int thread_rows, int thread_cols) int rows, int cols, int thread_rows, int thread_cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1)*PIX_PER_WI_Y;
if (x < cols && y < thread_rows) if (x < cols)
{ {
T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset))); int src_index1 = mad24(rows - y0 - 1, src_step, mad24(x, TSIZE, src_offset));
int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(x, TSIZE, dst_offset));
storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); #pragma unroll
storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset))); for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
{
T src0 = loadpix(srcptr + src_index0);
T src1 = loadpix(srcptr + src_index1);
storepix(src1, dstptr + dst_index0);
storepix(src0, dstptr + dst_index1);
src_index0 += src_step;
src_index1 -= src_step;
dst_index0 += dst_step;
dst_index1 -= dst_step;
}
} }
} }
...@@ -71,16 +85,29 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, ...@@ -71,16 +85,29 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step,
int rows, int cols, int thread_rows, int thread_cols) int rows, int cols, int thread_rows, int thread_cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1)*PIX_PER_WI_Y;
if (x < cols && y < thread_rows) if (x < cols)
{ {
int x1 = cols - x - 1; int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); int src_index1 = mad24(rows - y0 - 1, src_step, mad24(cols - x - 1, TSIZE, src_offset));
T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset))); int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));
#pragma unroll
for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
{
T src0 = loadpix(srcptr + src_index0);
T src1 = loadpix(srcptr + src_index1);
storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset))); storepix(src1, dstptr + dst_index0);
storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); storepix(src0, dstptr + dst_index1);
src_index0 += src_step;
src_index1 -= src_step;
dst_index0 += dst_step;
dst_index1 -= dst_step;
}
} }
} }
...@@ -89,15 +116,28 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int ...@@ -89,15 +116,28 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int
int rows, int cols, int thread_rows, int thread_cols) int rows, int cols, int thread_rows, int thread_cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1)*PIX_PER_WI_Y;
if (x < thread_cols && y < rows) if (x < thread_cols)
{ {
int x1 = cols - x - 1; int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); int src_index1 = mad24(y0, src_step, mad24(cols - x - 1, TSIZE, src_offset));
T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset))); int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
int dst_index1 = mad24(y0, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));
#pragma unroll
for (int y = y0, y1 = min(rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
{
T src0 = loadpix(srcptr + src_index0);
T src1 = loadpix(srcptr + src_index1);
storepix(src1, dstptr + dst_index0);
storepix(src0, dstptr + dst_index1);
storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset))); src_index0 += src_step;
storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); src_index1 += src_step;
dst_index0 += dst_step;
dst_index1 += dst_step;
}
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment