Commit cbf63b07 authored by Alexander Alekhin

Merge pull request #2851 from ilya-lavrenov:tapi_set_identity

parents 910d8f8e 6dd658a0
...@@ -2758,21 +2758,30 @@ namespace cv { ...@@ -2758,21 +2758,30 @@ namespace cv {
// OpenCL path for setIdentity: launches the "setIdentity" kernel, which writes
// `s` to the diagonal elements of _m and zero everywhere else.
//
// _m  matrix to fill; it is bound to the kernel as a write-only argument.
// s   diagonal value; handed to the kernel as a 1x1 Mat constant of type sctype.
//
// Returns false when the kernel could not be created (so the caller can fall
// back to another implementation); otherwise returns the result of the launch.
static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
{
    const int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    // A 3-channel scalar is passed as a 4-channel value; the kernel only reads
    // its .x/.y/.z components in that case.
    const int sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn);

    // Number of consecutive rows handled by one work-item.
    const int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;

    // kercn is the number of channels one kernel "pixel" T carries. For
    // single-channel matrices the kernel has a 4-wide path that packs four
    // neighbouring columns into one T. That path writes the diagonal into lanes
    // 0,1,2,3 of four consecutive rows starting at get_global_id(1) * rowsPerWI,
    // so it is only correct when each work-item owns exactly four rows aligned
    // to a multiple of four. Enable it only when rowsPerWI == 4.
    int kercn = cn;
    if (cn == 1 && rowsPerWI == 4)
    {
        kercn = std::min(ocl::predictOptimalVectorWidth(_m), 4);
        if (kercn != 4)
            kercn = 1; // the kernel supports only kercn == cn or kercn == 4
    }

    ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
                  format("-D T=%s -D T1=%s -D cn=%d -D ST=%s -D kercn=%d -D rowsPerWI=%d",
                         ocl::memopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
                         ocl::memopTypeToStr(depth), cn,
                         ocl::memopTypeToStr(sctype),
                         kercn, rowsPerWI));
    if (k.empty())
        return false;

    UMat m = _m.getUMat();
    k.args(ocl::KernelArg::WriteOnly(m, cn, kercn),
           ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));

    // One work-item per kercn-wide column group and per rowsPerWI-row strip.
    size_t globalsize[2] = { static_cast<size_t>(m.cols) * cn / kercn,
                             (static_cast<size_t>(m.rows) + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}
......
...@@ -43,20 +43,18 @@ ...@@ -43,20 +43,18 @@
// //
//M*/ //M*/
// Pixel-store helpers for the setIdentity kernel. In this side-by-side diff
// rendering a row that holds two directives shows the pre-change text first
// and the post-change text second (note the guard changes from cn to kercn).
// Widths other than 3: a pixel is stored directly through a T pointer, TSIZE is
// sizeof(T), and `scalar` is the kernel argument scalar_ unchanged.
#if cn != 3 #if kercn != 3
// loadpix appears only once on its row - presumably a line removed by this commit.
#define loadpix(addr) *(__global const T *)(addr)
#define storepix(val, addr) *(__global T *)(addr) = val #define storepix(val, addr) *(__global T *)(addr) = val
#define TSIZE (int)sizeof(T) #define TSIZE (int)sizeof(T)
#define scalar scalar_ #define scalar scalar_
// Width 3: pixels are written with vstore3 over T1 elements, TSIZE is
// 3 * sizeof(T1), and `scalar` is rebuilt from the .x/.y/.z of scalar_.
#else #else
// loadpix appears only once on its row - presumably a line removed by this commit.
#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr)) #define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
#define TSIZE ((int)sizeof(T1)*3) #define TSIZE ((int)sizeof(T1)*3)
#define scalar (T)(scalar_.x, scalar_.y, scalar_.z) #define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
#endif #endif
// setIdentity kernel, shown as a side-by-side diff: where a row holds two
// statements, the first is the pre-change text and the second the post-change
// text. In the post-change signature rowsPerWI is no longer a kernel argument.
// Each work-item handles element column x (in units of TSIZE bytes) and the
// rows starting at y0 = get_global_id(1) * rowsPerWI.
__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols, __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
ST scalar_, int rowsPerWI) ST scalar_)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI; int y0 = get_global_id(1) * rowsPerWI;
// Hunk boundary: the lines between the two diff hunks are not shown here.
...@@ -65,7 +63,35 @@ __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, ...@@ -65,7 +63,35 @@ __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset,
{ {
// Byte offset of element (y0, x): y0 * src_step + x * TSIZE + src_offset.
int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset)); int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
// kercn == cn path: one T per pixel. The element with x == y receives `scalar`,
// every other element receives zero. The post-change loop runs a fixed
// rowsPerWI iterations and guards rows past the end with y < y1.
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step) #if kercn == cn
storepix(x == y ? scalar : (T)(0), srcptr + src_index); #pragma unroll
for (int y = y0, i = 0, y1 = min(rows, y0 + rowsPerWI); i < rowsPerWI; ++y, ++i, src_index += src_step)
if (y < y1)
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
// kercn == 4 && cn == 1 path: each T holds four values, so the diagonal of row y
// is looked up at vector column y >> 2. The four stores below place `scalar`
// in lanes 0, 1, 2, 3 for rows y0, y0+1, y0+2, y0+3 in turn, each guarded by
// the row count.
// NOTE(review): the lane is chosen by store order, not by y & 3, and four rows
// are always written. This looks correct only when y0 is a multiple of 4, i.e.
// rowsPerWI == 4 - confirm the host never selects kercn == 4 otherwise.
#elif kercn == 4 && cn == 1
if (y0 < rows)
{
storepix(x == y0 >> 2 ? (T)(scalar, 0, 0, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, scalar, 0, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, 0, scalar, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, 0, 0, scalar) : (T)(0), srcptr + src_index);
}
}
}
}
// Any other cn/kercn pairing is rejected at kernel build time.
#else
#error "Incorrect combination of cn && kercn"
#endif
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment