Commit 7f2662b3 authored by Ilya Lavrenov's avatar Ilya Lavrenov

fixes

parent 5403bdd2
......@@ -5,7 +5,7 @@
// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//#define CV_OPENCL_RUN_ASSERT
#define CV_OPENCL_RUN_ASSERT
#ifdef HAVE_OPENCL
......
......@@ -15,16 +15,16 @@
#ifdef DEPTH_0
#define MIN_VAL 0
#define MAX_VAL 255
#define MAX_VAL UCHAR_MAX
#elif defined DEPTH_1
#define MIN_VAL -128
#define MAX_VAL 127
#define MIN_VAL SCHAR_MIN
#define MAX_VAL SCHAR_MAX
#elif defined DEPTH_2
#define MIN_VAL 0
#define MAX_VAL 65535
#define MAX_VAL USHRT_MAX
#elif defined DEPTH_3
#define MIN_VAL -32768
#define MAX_VAL 32767
#define MIN_VAL SHRT_MIN
#define MAX_VAL SHRT_MAX
#elif defined DEPTH_4
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
......@@ -39,6 +39,14 @@
#define noconvert
#define INDEX_MAX UINT_MAX
#if kercn != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define srcTSIZE (int)sizeof(srcT1)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define srcTSIZE ((int)sizeof(srcT1))
#endif
#ifdef NEED_MINLOC
#define CALC_MINLOC(inc) minloc = id + inc
#else
......@@ -154,22 +162,22 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif
{
#ifdef HAVE_SRC_CONT
src_index = mul24(id, (int)sizeof(srcT1));
src_index = mul24(id, srcTSIZE);
#else
src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
#endif
temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
temp = convertToDT(loadpix(srcptr + src_index));
#ifdef OP_ABS
temp = temp >= (dstT)(0) ? temp : -temp;
#endif
#ifdef HAVE_SRC2
#ifdef HAVE_SRC2_CONT
src2_index = mul24(id, (int)sizeof(srcT1));
src2_index = mul24(id, srcTSIZE);
#else
src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
#endif
temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
temp2 = convertToDT(loadpix(src2ptr + src2_index));
temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
#ifdef OP_CALC2
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
......@@ -202,8 +210,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#elif kercn >= 2
CALC_P(s0, 0)
CALC_P(s1, 1)
#if kercn >= 4
#if kercn >= 3
CALC_P(s2, 2)
#if kercn >= 4
CALC_P(s3, 3)
#if kercn >= 8
CALC_P(s4, 4)
......@@ -222,6 +231,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif
#endif
#endif
#endif
#endif
}
}
......@@ -335,9 +345,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif
#ifdef NEED_MAXLOC
*(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
#endif
#ifdef OP_CALC2
pos = mad24(groupnum, (int)sizeof(uint), pos);
#endif
#endif
#ifdef OP_CALC2
*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
#endif
}
......
......@@ -148,11 +148,9 @@
#ifdef OP_CALC2
#define DECLARE_LOCAL_MEM \
__local dstT localmem[WGS2_ALIGNED]; \
__local dstT localmem2[WGS2_ALIGNED]
__local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED]
#define DEFINE_ACCUMULATOR \
dstT accumulator = (dstT)(0); \
dstT accumulator2 = (dstT)(0)
dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0)
#else
#define DECLARE_LOCAL_MEM \
__local dstT localmem[WGS2_ALIGNED]
......@@ -163,10 +161,10 @@
#ifdef HAVE_SRC2
#ifdef OP_CALC2
#define PROCESS_ELEMS \
dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp -= temp2; \
temp = temp > (dstT)(0) ? temp : -temp; \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
FUNC(accumulator2, temp2); \
FUNC(accumulator, temp)
#else
......@@ -258,6 +256,7 @@
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
FUNC(accumulator, temp); \
FUNC(accumulator2, temp2)
#elif kercn == 2
......@@ -265,6 +264,7 @@
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator2, temp2.s0); \
......@@ -274,6 +274,7 @@
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
......@@ -287,6 +288,7 @@
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
......@@ -308,6 +310,7 @@
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
......@@ -452,6 +455,20 @@
#endif
#endif
#ifdef OP_CALC2
#define SET_LOCAL_1 \
localmem[lid] = accumulator; \
localmem2[lid] = accumulator2
#define REDUCE_LOCAL_1 \
localmem[lid - WGS2_ALIGNED] += accumulator; \
localmem2[lid - WGS2_ALIGNED] += accumulator2
#define REDUCE_LOCAL_2 \
localmem[lid] += localmem[lid2]; \
localmem2[lid] += localmem2[lid2]
#define CALC_RESULT \
storepix(localmem[0], dstptr + dstTSIZE * gid); \
storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))
#else
#define SET_LOCAL_1 \
localmem[lid] = accumulator
#define REDUCE_LOCAL_1 \
......@@ -460,6 +477,7 @@
localmem[lid] += localmem[lid2]
#define CALC_RESULT \
storepix(localmem[0], dstptr + dstTSIZE * gid)
#endif
// countNonZero stuff
#elif defined OP_COUNT_NON_ZERO
......@@ -516,20 +534,6 @@
accumulator += value.sF == zero ? zero : one
#endif
#ifdef OP_CALC2
#define SET_LOCAL_1 \
localmem[lid] = accumulator; \
localmem2[lid] = accumulator2; \
#define REDUCE_LOCAL_1 \
localmem[lid - WGS2_ALIGNED] += accumulator; \
localmem2[lid - WGS2_ALIGNED] += accumulator2
#define REDUCE_LOCAL_2 \
localmem[lid] += localmem[lid2]; \
localmem2[lid] += localmem2[lid2]
#define CALC_RESULT \
storepix(localmem[0], dstptr + dstTSIZE * gid); \
storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid))
#else
#define SET_LOCAL_1 \
localmem[lid] = accumulator
#define REDUCE_LOCAL_1 \
......@@ -538,7 +542,6 @@
localmem[lid] += localmem[lid2]
#define CALC_RESULT \
storepix(localmem[0], dstptr + dstTSIZE * gid)
#endif
// norm (NORM_INF) with cn > 1 and mask
#elif defined OP_NORM_INF_MASK
......
......@@ -550,9 +550,9 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
Mat mres = db.getMat(ACCESS_READ);
if (calc2)
const_cast<Scalar &>(res2) = func(mres.colRange(dbsize, dbsize));
const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));
res = func(mres.colRange(0, dbsize));
res = func(mres.colRange(0, ngroups));
return true;
}
return false;
......@@ -1434,10 +1434,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
haveSrc2 = _src2.kind() != _InputArray::NONE;
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src));
CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
(cn >= 1 && _mask.empty() && !minLoc && !maxLoc) );
CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
(cn >= 1 && (!haveMask || haveSrc2) && !minLoc && !maxLoc) );
if (ddepth < 0)
ddepth = depth;
......@@ -1484,6 +1484,8 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
printf("%s\n", opts.c_str());
ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
if (k.empty())
return false;
......@@ -2556,9 +2558,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
{
Scalar sc1, sc2;
int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool relative = (normType & NORM_RELATIVE) != 0,
normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
bool relative = (normType & NORM_RELATIVE) != 0;
normType &= ~NORM_RELATIVE;
bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
if ( !(normType == NORM_INF || normsum) )
return false;
......@@ -2608,8 +2610,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
#ifdef HAVE_OPENCL
double _result = 0;
CV_OCL_RUN_(_src1.isUMat() && _src2.isUMat() &&
_src1.dims() <= 2 && _src2.dims() <= 2,
CV_OCL_RUN_(_src1.isUMat(),
ocl_norm(_src1, _src2, normType, _mask, _result),
_result)
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment