fixes

7f2662b3 · Ilya Lavrenov · 5403bdd2 · 7f2662b3 · 7f2662b3 · 7f2662b3
Commit 7f2662b3 authored Jun 08, 2014 by Ilya Lavrenov
4 changed files
--- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
+++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
@@ -5,7 +5,7 @@
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.

-//#define CV_OPENCL_RUN_ASSERT
+#define CV_OPENCL_RUN_ASSERT

 #ifdef HAVE_OPENCL


--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -15,16 +15,16 @@

 #ifdef DEPTH_0
 #define MIN_VAL 0
-#define MAX_VAL 255
+#define MAX_VAL UCHAR_MAX
 #elif defined DEPTH_1
-#define MIN_VAL -128
-#define MAX_VAL 127
+#define MIN_VAL SCHAR_MIN
+#define MAX_VAL SCHAR_MAX
 #elif defined DEPTH_2
 #define MIN_VAL 0
-#define MAX_VAL 65535
+#define MAX_VAL USHRT_MAX
 #elif defined DEPTH_3
-#define MIN_VAL -32768
-#define MAX_VAL 32767
+#define MIN_VAL SHRT_MIN
+#define MAX_VAL SHRT_MAX
 #elif defined DEPTH_4
 #define MIN_VAL INT_MIN
 #define MAX_VAL INT_MAX
@@ -39,6 +39,14 @@
 #define noconvert
 #define INDEX_MAX UINT_MAX

+#if kercn != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define srcTSIZE (int)sizeof(srcT1)
+#else
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define srcTSIZE ((int)sizeof(srcT1))
+#endif
+
 #ifdef NEED_MINLOC
 #define CALC_MINLOC(inc) minloc = id + inc
 #else
@@ -154,22 +162,22 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
        {
 #ifdef HAVE_SRC_CONT
-            src_index = mul24(id, (int)sizeof(srcT1));
+            src_index = mul24(id, srcTSIZE);
 #else
-            src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+            src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
 #endif
-            temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
+            temp = convertToDT(loadpix(srcptr + src_index));
 #ifdef OP_ABS
            temp = temp >= (dstT)(0) ? temp : -temp;
 #endif

 #ifdef HAVE_SRC2
 #ifdef HAVE_SRC2_CONT
-            src2_index = mul24(id, (int)sizeof(srcT1));
+            src2_index = mul24(id, srcTSIZE);
 #else
-            src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
+            src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
 #endif
-            temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
+            temp2 = convertToDT(loadpix(src2ptr + src2_index));
            temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
 #ifdef OP_CALC2
            temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
@@ -202,8 +210,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #elif kercn >= 2
            CALC_P(s0, 0)
            CALC_P(s1, 1)
-#if kercn >= 4
+#if kercn >= 3
            CALC_P(s2, 2)
+#if kercn >= 4
            CALC_P(s3, 3)
 #if kercn >= 8
            CALC_P(s4, 4)
@@ -222,6 +231,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #endif
 #endif
+#endif
 #endif
        }
    }
@@ -335,9 +345,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #ifdef NEED_MAXLOC
        *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
-#endif
 #ifdef OP_CALC2
        pos = mad24(groupnum, (int)sizeof(uint), pos);
+#endif
+#endif
+#ifdef OP_CALC2
        *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
 #endif
    }

--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -148,11 +148,9 @@

 #ifdef OP_CALC2
 #define DECLARE_LOCAL_MEM \
-    __local dstT localmem[WGS2_ALIGNED]; \
-    __local dstT localmem2[WGS2_ALIGNED]
+    __local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED]
 #define DEFINE_ACCUMULATOR \
-    dstT accumulator = (dstT)(0); \
-    dstT accumulator2 = (dstT)(0)
+    dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0)
 #else
 #define DECLARE_LOCAL_MEM \
    __local dstT localmem[WGS2_ALIGNED]
@@ -163,10 +161,10 @@
 #ifdef HAVE_SRC2
 #ifdef OP_CALC2
 #define PROCESS_ELEMS \
-    dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp -= temp2; \
-    temp = temp > (dstT)(0) ? temp : -temp; \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
    FUNC(accumulator2, temp2); \
    FUNC(accumulator, temp)
 #else
@@ -258,6 +256,7 @@
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
    FUNC(accumulator, temp); \
    FUNC(accumulator2, temp2)
 #elif kercn == 2
@@ -265,6 +264,7 @@
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
    FUNC(accumulator, temp.s0); \
    FUNC(accumulator, temp.s1); \
    FUNC(accumulator2, temp2.s0); \
@@ -274,6 +274,7 @@
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
    FUNC(accumulator, temp.s0); \
    FUNC(accumulator, temp.s1); \
    FUNC(accumulator, temp.s2); \
@@ -287,6 +288,7 @@
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
    FUNC(accumulator, temp.s0); \
    FUNC(accumulator, temp.s1); \
    FUNC(accumulator, temp.s2); \
@@ -308,6 +310,7 @@
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
    FUNC(accumulator, temp.s0); \
    FUNC(accumulator, temp.s1); \
    FUNC(accumulator, temp.s2); \
@@ -452,6 +455,20 @@
 #endif
 #endif

+#ifdef OP_CALC2
+#define SET_LOCAL_1 \
+    localmem[lid] = accumulator; \
+    localmem2[lid] = accumulator2
+#define REDUCE_LOCAL_1 \
+    localmem[lid - WGS2_ALIGNED] += accumulator; \
+    localmem2[lid - WGS2_ALIGNED] += accumulator2
+#define REDUCE_LOCAL_2 \
+    localmem[lid] += localmem[lid2]; \
+    localmem2[lid] += localmem2[lid2]
+#define CALC_RESULT \
+    storepix(localmem[0], dstptr + dstTSIZE * gid); \
+    storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))
+#else
 #define SET_LOCAL_1 \
    localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -460,6 +477,7 @@
    localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
    storepix(localmem[0], dstptr + dstTSIZE * gid)
+#endif

 // countNonZero stuff
 #elif defined OP_COUNT_NON_ZERO
@@ -516,20 +534,6 @@
    accumulator += value.sF == zero ? zero : one
 #endif

-#ifdef OP_CALC2
-#define SET_LOCAL_1 \
-    localmem[lid] = accumulator; \
-    localmem2[lid] = accumulator2; \
-#define REDUCE_LOCAL_1 \
-    localmem[lid - WGS2_ALIGNED] += accumulator; \
-    localmem2[lid - WGS2_ALIGNED] += accumulator2
-#define REDUCE_LOCAL_2 \
-    localmem[lid] += localmem[lid2]; \
-    localmem2[lid] += localmem2[lid2]
-#define CALC_RESULT \
-    storepix(localmem[0], dstptr + dstTSIZE * gid); \
-    storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid))
-#else
 #define SET_LOCAL_1 \
    localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -538,7 +542,6 @@
    localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
    storepix(localmem[0], dstptr + dstTSIZE * gid)
-#endif

 // norm (NORM_INF) with cn > 1 and mask
 #elif defined OP_NORM_INF_MASK

--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -550,9 +550,9 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask

        Mat mres = db.getMat(ACCESS_READ);
        if (calc2)
-            const_cast<Scalar &>(res2) = func(mres.colRange(dbsize, dbsize));
+            const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));

-        res = func(mres.colRange(0, dbsize));
+        res = func(mres.colRange(0, ngroups));
        return true;
    }
    return false;
@@ -1434,10 +1434,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
+            kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src));

-    CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
-              (cn >= 1 && _mask.empty() && !minLoc && !maxLoc) );
+    CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
+              (cn >= 1 && (!haveMask || haveSrc2) && !minLoc && !maxLoc) );

    if (ddepth < 0)
        ddepth = depth;
@@ -1484,6 +1484,8 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                         haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");

+    printf("%s\n", opts.c_str());
+
    ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
    if (k.empty())
        return false;
@@ -2556,9 +2558,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
 {
    Scalar sc1, sc2;
    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool relative = (normType & NORM_RELATIVE) != 0,
-        normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
+    bool relative = (normType & NORM_RELATIVE) != 0;
    normType &= ~NORM_RELATIVE;
+    bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;

    if ( !(normType == NORM_INF || normsum) )
        return false;
@@ -2608,8 +2610,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m

 #ifdef HAVE_OPENCL
    double _result = 0;
-    CV_OCL_RUN_(_src1.isUMat() && _src2.isUMat() &&
-                _src1.dims() <= 2 && _src2.dims() <= 2,
+    CV_OCL_RUN_(_src1.isUMat(),
                ocl_norm(_src1, _src2, normType, _mask, _result),
                _result)
 #endif