Commit 54e4ef65 authored by Ilya Lavrenov's avatar Ilya Lavrenov

optimized cv::transpose inplace

parent 9c8b9fc7
...@@ -2973,8 +2973,10 @@ static inline int divUp(int a, int b) ...@@ -2973,8 +2973,10 @@ static inline int divUp(int a, int b)
static bool ocl_transpose( InputArray _src, OutputArray _dst ) static bool ocl_transpose( InputArray _src, OutputArray _dst )
{ {
const ocl::Device & dev = ocl::Device::getDefault();
const int TILE_DIM = 32, BLOCK_ROWS = 8; const int TILE_DIM = 32, BLOCK_ROWS = 8;
int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type),
rowsPerWI = dev.isIntel() ? 4 : 1;
UMat src = _src.getUMat(); UMat src = _src.getUMat();
_dst.create(src.cols, src.rows, type); _dst.create(src.cols, src.rows, type);
...@@ -2990,9 +2992,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst ) ...@@ -2990,9 +2992,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
} }
ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc, ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d", format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d",
ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
cn, TILE_DIM, BLOCK_ROWS)); cn, TILE_DIM, BLOCK_ROWS, rowsPerWI));
if (k.empty()) if (k.empty())
return false; return false;
...@@ -3003,7 +3005,13 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst ) ...@@ -3003,7 +3005,13 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
ocl::KernelArg::WriteOnlyNoSize(dst)); ocl::KernelArg::WriteOnlyNoSize(dst));
size_t localsize[2] = { TILE_DIM, BLOCK_ROWS }; size_t localsize[2] = { TILE_DIM, BLOCK_ROWS };
size_t globalsize[2] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS }; size_t globalsize[2] = { src.cols, inplace ? (src.rows + rowsPerWI - 1) / rowsPerWI : (divUp(src.rows, TILE_DIM) * BLOCK_ROWS) };
if (inplace && dev.isIntel())
{
localsize[0] = 16;
localsize[1] = dev.maxWorkGroupSize() / localsize[0];
}
return k.run(2, globalsize, localsize, false); return k.run(2, globalsize, localsize, false);
} }
......
...@@ -117,18 +117,24 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off ...@@ -117,18 +117,24 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
__kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows) __kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
if (y < src_rows && x < y) if (x < y + rowsPerWI)
{ {
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset)); int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset)); int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset));
T tmp;
__global const uchar * src = srcptr + src_index; #pragma unroll
__global uchar * dst = srcptr + dst_index; for (int i = 0; i < rowsPerWI; ++i, ++y, src_index += src_step, dst_index += TSIZE)
if (y < src_rows && x < y)
{
__global uchar * src = srcptr + src_index;
__global uchar * dst = srcptr + dst_index;
T tmp = loadpix(dst); tmp = loadpix(dst);
storepix(loadpix(src), dst); storepix(loadpix(src), dst);
storepix(tmp, src); storepix(tmp, src);
}
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment