Commit ae56eaa5 authored by Andrey Pavlenko's avatar Andrey Pavlenko Committed by OpenCV Buildbot

Merge pull request #1609 from ilya-lavrenov:ocl_some_optimization

parents 0870d3d7 b18101b1
...@@ -56,6 +56,23 @@ ...@@ -56,6 +56,23 @@
using namespace cv; using namespace cv;
using namespace cv::ocl; using namespace cv::ocl;
static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
{
CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
int elemSize1 = sizeMap[depth];
int bufSize = elemSize1 * ocn;
std::vector<uchar> _buf(bufSize);
uchar * buf = &_buf[0];
scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
return _buf;
}
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
/////////////// add subtract multiply divide min max ///////////////////////// /////////////// add subtract multiply divide min max /////////////////////////
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
...@@ -84,7 +101,7 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const ...@@ -84,7 +101,7 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize(); int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize(); int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize();
int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
oclMat m; std::vector<uchar> m;
size_t localThreads[3] = { 16, 16, 1 }; size_t localThreads[3] = { 16, 16, 1 };
size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
...@@ -132,10 +149,9 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const ...@@ -132,10 +149,9 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
if (haveScalar) if (haveScalar)
{ {
const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F }; const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F };
m.create(1, 1, CV_MAKE_TYPE(WDepthMap[WDepth], oclChannels)); m = scalarToVector(scalar, WDepthMap[WDepth], oclChannels, src1.channels());
m.setTo(scalar);
args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data )); args.push_back( make_pair( m.size(), (void *)&m[0]));
kernelName += "_scalar"; kernelName += "_scalar";
} }
...@@ -1329,6 +1345,13 @@ static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName ...@@ -1329,6 +1345,13 @@ static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName
enum { AND = 0, OR, XOR }; enum { AND = 0, OR, XOR };
static std::string to_string(int value)
{
std::ostringstream stream;
stream << value;
return stream.str();
}
static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Scalar& src3, const oclMat &mask, static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Scalar& src3, const oclMat &mask,
oclMat &dst, int operationType) oclMat &dst, int operationType)
{ {
...@@ -1337,17 +1360,20 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca ...@@ -1337,17 +1360,20 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size())); CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
int elemSize = dst.elemSize();
int cols1 = dst.cols * elemSize;
oclMat m; oclMat m;
const char operationMap[] = { '&', '|', '^' }; const char operationMap[] = { '&', '|', '^' };
std::string kernelName("arithm_bitwise_binary"); std::string kernelName("arithm_bitwise_binary");
std::string buildOptions = format("-D Operation=%c", operationMap[operationType]);
int vlen = std::min<int>(8, src1.elemSize1() * src1.oclchannels());
std::string vlenstr = vlen > 1 ? to_string(vlen) : "";
std::string buildOptions = format("-D Operation=%c -D vloadn=vload%s -D vstoren=vstore%s -D elemSize=%d -D vlen=%d"
" -D ucharv=uchar%s",
operationMap[operationType], vlenstr.c_str(), vlenstr.c_str(),
(int)src1.elemSize(), vlen, vlenstr.c_str());
size_t localThreads[3] = { 16, 16, 1 }; size_t localThreads[3] = { 16, 16, 1 };
size_t globalThreads[3] = { cols1, dst.rows, 1 }; size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
...@@ -1360,7 +1386,6 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca ...@@ -1360,7 +1386,6 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
m.setTo(src3); m.setTo(src3);
args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&elemSize ) );
kernelName += "_scalar"; kernelName += "_scalar";
} }
...@@ -1377,9 +1402,6 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca ...@@ -1377,9 +1402,6 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
args.push_back( make_pair( sizeof(cl_int), (void *)&mask.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&mask.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&mask.offset )); args.push_back( make_pair( sizeof(cl_int), (void *)&mask.offset ));
if (!src2.empty())
args.push_back( make_pair( sizeof(cl_int), (void *)&elemSize ));
kernelName += "_mask"; kernelName += "_mask";
} }
...@@ -1387,7 +1409,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca ...@@ -1387,7 +1409,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
openCLExecuteKernel(src1.clCxt, mask.empty() ? (!src2.empty() ? &arithm_bitwise_binary : &arithm_bitwise_binary_scalar) : openCLExecuteKernel(src1.clCxt, mask.empty() ? (!src2.empty() ? &arithm_bitwise_binary : &arithm_bitwise_binary_scalar) :
...@@ -1400,12 +1422,12 @@ void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst) ...@@ -1400,12 +1422,12 @@ void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
{ {
if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_OpenCLDoubleNotSupported, "selected device doesn't support double"); CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
return; return;
} }
dst.create(src.size(), src.type()); dst.create(src.size(), src.type());
bitwise_unary_run(src, dst, "arithm_bitwise_not", &arithm_bitwise_not); bitwise_unary_run(src, dst, "arithm_bitwise_not", &arithm_bitwise_not);
} }
void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
......
...@@ -62,7 +62,7 @@ ...@@ -62,7 +62,7 @@
#if defined (FUNC_MUL) #if defined (FUNC_MUL)
#if defined (HAVE_SCALAR) #if defined (HAVE_SCALAR)
#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index])); #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar * convertToWT(src2[src2_index]));
#else #else
#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * convertToWT(src2[src2_index])); #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * convertToWT(src2[src2_index]));
#endif #endif
...@@ -72,7 +72,7 @@ ...@@ -72,7 +72,7 @@
#if defined (HAVE_SCALAR) #if defined (HAVE_SCALAR)
#define EXPRESSION T zero = (T)(0); \ #define EXPRESSION T zero = (T)(0); \
dst[dst_index] = src2[src2_index] == zero ? zero : \ dst[dst_index] = src2[src2_index] == zero ? zero : \
convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index])); convertToT(convertToWT(src1[src1_index]) * scalar / convertToWT(src2[src2_index]));
#else #else
#define EXPRESSION T zero = (T)(0); \ #define EXPRESSION T zero = (T)(0); \
dst[dst_index] = src2[src2_index] == zero ? zero : \ dst[dst_index] = src2[src2_index] == zero ? zero : \
...@@ -123,7 +123,7 @@ __kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_off ...@@ -123,7 +123,7 @@ __kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_off
// add mat with scale // add mat with scale
__kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset, __kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset,
__global T *src2, int src2_step, int src2_offset, __global T *src2, int src2_step, int src2_offset,
__global WT *scalar, WT scalar,
__global T *dst, int dst_step, int dst_offset, __global T *dst, int dst_step, int dst_offset,
int cols, int rows) int cols, int rows)
{ {
......
...@@ -52,20 +52,20 @@ ...@@ -52,20 +52,20 @@
#endif #endif
#if defined (FUNC_ADD) #if defined (FUNC_ADD)
#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar[0]); #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar);
#endif #endif
#if defined (FUNC_SUB) #if defined (FUNC_SUB)
#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar[0]); #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar);
#endif #endif
#if defined (FUNC_MUL) #if defined (FUNC_MUL)
#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0]); #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar);
#endif #endif
#if defined (FUNC_DIV) #if defined (FUNC_DIV)
#define EXPRESSION T zero = (T)(0); \ #define EXPRESSION T zero = (T)(0); \
dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index])); dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar / convertToWT(src1[src1_index]));
#endif #endif
#if defined (FUNC_ABS) #if defined (FUNC_ABS)
...@@ -75,7 +75,7 @@ ...@@ -75,7 +75,7 @@
#endif #endif
#if defined (FUNC_ABS_DIFF) #if defined (FUNC_ABS_DIFF)
#define EXPRESSION WT value = convertToWT(src1[src1_index]) - scalar[0]; \ #define EXPRESSION WT value = convertToWT(src1[src1_index]) - scalar; \
value = value > (WT)(0) ? value : -value; \ value = value > (WT)(0) ? value : -value; \
dst[dst_index] = convertToT(value); dst[dst_index] = convertToT(value);
#endif #endif
...@@ -85,7 +85,7 @@ ...@@ -85,7 +85,7 @@
/////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset, __kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset,
__global WT *scalar, WT scalar,
__global T *dst, int dst_step, int dst_offset, __global T *dst, int dst_step, int dst_offset,
int cols, int rows) int cols, int rows)
{ {
......
...@@ -52,15 +52,15 @@ ...@@ -52,15 +52,15 @@
#endif #endif
#if defined (FUNC_ADD) #if defined (FUNC_ADD)
#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar[0]); #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar);
#endif #endif
#if defined (FUNC_SUB) #if defined (FUNC_SUB)
#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar[0]); #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar);
#endif #endif
#if defined (FUNC_MUL) #if defined (FUNC_MUL)
#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0]); #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar);
#endif #endif
#if defined (FUNC_DIV) #if defined (FUNC_DIV)
...@@ -74,7 +74,7 @@ ...@@ -74,7 +74,7 @@
/////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset, __kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset,
__global WT *scalar, WT scalar,
__global uchar *mask, int mask_step, int mask_offset, __global uchar *mask, int mask_step, int mask_offset,
__global T *dst, int dst_step, int dst_offset, __global T *dst, int dst_step, int dst_offset,
int cols, int rows) int cols, int rows)
......
...@@ -51,17 +51,32 @@ ...@@ -51,17 +51,32 @@
__kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset,
__global uchar * src2, int src2_step, int src2_offset, __global uchar * src2, int src2_step, int src2_offset,
__global uchar * dst, int dst_step, int dst_offset, __global uchar * dst, int dst_step, int dst_offset,
int cols1, int rows) int cols, int rows)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols1 && y < rows) if (x < cols && y < rows)
{ {
#if elemSize > 1
x *= elemSize;
#endif
int src1_index = mad24(y, src1_step, x + src1_offset); int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = mad24(y, src2_step, x + src2_offset); int src2_index = mad24(y, src2_step, x + src2_offset);
int dst_index = mad24(y, dst_step, dst_offset + x); int dst_index = mad24(y, dst_step, x + dst_offset);
#if elemSize > 1
#pragma unroll
for (int i = 0; i < elemSize; i += vlen)
{
ucharv t0 = vloadn(0, src1 + src1_index + i);
ucharv t1 = vloadn(0, src2 + src2_index + i);
ucharv t2 = t0 Operation t1;
vstoren(t2, 0, dst + dst_index + i);
}
#else
dst[dst_index] = src1[src1_index] Operation src2[src2_index]; dst[dst_index] = src1[src1_index] Operation src2[src2_index];
#endif
} }
} }
...@@ -50,7 +50,7 @@ ...@@ -50,7 +50,7 @@
__kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset,
__global uchar * src2, int src2_step, int src2_offset, __global uchar * src2, int src2_step, int src2_offset,
__global uchar * mask, int mask_step, int mask_offset, int elemSize, __global uchar * mask, int mask_step, int mask_offset,
__global uchar * dst, int dst_step, int dst_offset, __global uchar * dst, int dst_step, int dst_offset,
int cols1, int rows) int cols1, int rows)
{ {
...@@ -59,15 +59,30 @@ __kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, i ...@@ -59,15 +59,30 @@ __kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, i
if (x < cols1 && y < rows) if (x < cols1 && y < rows)
{ {
int mask_index = mad24(y, mask_step, mask_offset + (x / elemSize)); int mask_index = mad24(y, mask_step, mask_offset + x);
if (mask[mask_index]) if (mask[mask_index])
{ {
#if elemSize > 1
x *= elemSize;
#endif
int src1_index = mad24(y, src1_step, x + src1_offset); int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = mad24(y, src2_step, x + src2_offset); int src2_index = mad24(y, src2_step, x + src2_offset);
int dst_index = mad24(y, dst_step, x + dst_offset); int dst_index = mad24(y, dst_step, x + dst_offset);
#if elemSize > 1
#pragma unroll
for (int i = 0; i < elemSize; i += vlen)
{
ucharv t0 = vloadn(0, src1 + src1_index + i);
ucharv t1 = vloadn(0, src2 + src2_index + i);
ucharv t2 = t0 Operation t1;
vstoren(t2, 0, dst + dst_index + i);
}
#else
dst[dst_index] = src1[src1_index] Operation src2[src2_index]; dst[dst_index] = src1[src1_index] Operation src2[src2_index];
#endif
} }
} }
} }
...@@ -50,19 +50,33 @@ ...@@ -50,19 +50,33 @@
__kernel void arithm_bitwise_binary_scalar( __kernel void arithm_bitwise_binary_scalar(
__global uchar *src1, int src1_step, int src1_offset, __global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int elemSize, __global uchar *src2,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int cols1, int rows) int cols, int rows)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols1 && y < rows) if (x < cols && y < rows)
{ {
#if elemSize > 1
x *= elemSize;
#endif
int src1_index = mad24(y, src1_step, src1_offset + x); int src1_index = mad24(y, src1_step, src1_offset + x);
int src2_index = x % elemSize;
int dst_index = mad24(y, dst_step, dst_offset + x); int dst_index = mad24(y, dst_step, dst_offset + x);
dst[dst_index] = src1[src1_index] Operation src2[src2_index]; #if elemSize > 1
#pragma unroll
for (int i = 0; i < elemSize; i += vlen)
{
ucharv t0 = vloadn(0, src1 + src1_index + i);
ucharv t1 = vloadn(0, src2 + i);
ucharv t2 = t0 Operation t1;
vstoren(t2, 0, dst + dst_index + i);
}
#else
dst[dst_index] = src1[src1_index] Operation src2[0];
#endif
} }
} }
...@@ -56,7 +56,7 @@ ...@@ -56,7 +56,7 @@
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int elemSize, __global uchar *src2,
__global uchar *mask, int mask_step, int mask_offset, __global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int cols, int rows) int cols, int rows)
...@@ -66,14 +66,29 @@ __kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_s ...@@ -66,14 +66,29 @@ __kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_s
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
int mask_index = mad24(y, mask_step, (x / elemSize) + mask_offset); int mask_index = mad24(y, mask_step, x + mask_offset);
if (mask[mask_index]) if (mask[mask_index])
{ {
#if elemSize > 1
x *= elemSize;
#endif
int src1_index = mad24(y, src1_step, x + src1_offset); int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = x % elemSize;
int dst_index = mad24(y, dst_step, x + dst_offset); int dst_index = mad24(y, dst_step, x + dst_offset);
dst[dst_index] = src1[src1_index] Operation src2[src2_index]; #if elemSize > 1
#pragma unroll
for (int i = 0; i < elemSize; i += vlen)
{
ucharv t0 = vloadn(0, src1 + src1_index + i);
ucharv t1 = vloadn(0, src2 + i);
ucharv t2 = t0 Operation t1;
vstoren(t2, 0, dst + dst_index + i);
}
#else
dst[dst_index] = src1[src1_index] Operation src2[0];
#endif
} }
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment