optimized cv::norm with 2 args

20409958 · Ilya Lavrenov · 1a7a262f · 20409958 · 20409958 · 20409958
Commit 20409958 authored Jun 07, 2014 by Ilya Lavrenov
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 287 additions and 13 deletions

minmaxloc.cl modules/core/src/opencl/minmaxloc.cl +71 -10

reduce.cl modules/core/src/opencl/reduce.cl +216 -3

stat.cpp modules/core/src/stat.cpp +0 -0

No files found.
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -73,14 +73,26 @@
 #define CALC_MAX(p, inc)
 #endif

+#ifdef OP_CALC2
+#define CALC_MAX2(p) /* track max |temp2.p|, matching the kercn == 1 path */ \
+    if (maxval2 < (temp2.p >= (dstT1)(0) ? temp2.p : -temp2.p)) \
+        maxval2 = temp2.p >= (dstT1)(0) ? temp2.p : -temp2.p;
+#else
+#define CALC_MAX2(p)
+#endif
+
 #define CALC_P(p, inc) \
    CALC_MIN(p, inc) \
-    CALC_MAX(p, inc)
+    CALC_MAX(p, inc) \
+    CALC_MAX2(p)

 __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,
                        int total, int groupnum, __global uchar * dstptr
 #ifdef HAVE_MASK
                        , __global const uchar * mask, int mask_step, int mask_offset
+#endif
+#ifdef HAVE_SRC2
+                        , __global const uchar * src2ptr, int src2_step, int src2_offset
 #endif
                        )
 {
@@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #ifdef HAVE_MASK
    mask += mask_offset;
 #endif
+#ifdef HAVE_SRC2
+    src2ptr += src2_offset;
+#endif

 #ifdef NEED_MINVAL
    __local dstT1 localmem_min[WGS2_ALIGNED];
+    dstT1 minval = MAX_VAL;
 #ifdef NEED_MINLOC
    __local uint localmem_minloc[WGS2_ALIGNED];
+    uint minloc = INDEX_MAX;
 #endif
 #endif
 #ifdef NEED_MAXVAL
+    dstT1 maxval = MIN_VAL;
    __local dstT1 localmem_max[WGS2_ALIGNED];
 #ifdef NEED_MAXLOC
    __local uint localmem_maxloc[WGS2_ALIGNED];
+    uint maxloc = INDEX_MAX;
+#endif
 #endif
+#ifdef OP_CALC2
+    __local dstT1 localmem_max2[WGS2_ALIGNED];
+    dstT1 maxval2 = MIN_VAL;
 #endif

-    dstT1 minval = MAX_VAL, maxval = MIN_VAL;
-    dstT temp;
-    uint minloc = INDEX_MAX, maxloc = INDEX_MAX;
    int src_index;
 #ifdef HAVE_MASK
    int mask_index;
 #endif
+#ifdef HAVE_SRC2
+    int src2_index;
+#endif

-    for (int grain = groupnum * WGS * kercn; id < total; id += grain)
-    {
-#ifdef HAVE_SRC_CONT
-        src_index = mul24(id, (int)sizeof(srcT1));
-#else
-        src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+    dstT temp;
+#ifdef HAVE_SRC2
+    dstT temp2;
 #endif

+    for (int grain = groupnum * WGS * kercn; id < total; id += grain)
+    {
 #ifdef HAVE_MASK
 #ifdef HAVE_MASK_CONT
        mask_index = id;
@@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
        if (mask[mask_index])
 #endif
        {
+#ifdef HAVE_SRC_CONT
+            src_index = mul24(id, (int)sizeof(srcT1));
+#else
+            src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+#endif
            temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
+#ifdef OP_ABS
+            temp = temp >= (dstT)(0) ? temp : -temp;
+#endif
+
+#ifdef HAVE_SRC2
+#ifdef HAVE_SRC2_CONT
+            src2_index = mul24(id, (int)sizeof(srcT1));
+#else
+            src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
+#endif
+            temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
+            temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
+#endif
+
 #if kercn == 1
 #ifdef NEED_MINVAL
            if (minval > temp)
@@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
                maxloc = id;
 #endif
            }
+#ifdef OP_CALC2
+            temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
+            if (maxval2 < temp2)
+                maxval2 = temp2;
+#endif
 #endif
 #elif kercn >= 2
            CALC_P(s0, 0)
@@ -191,6 +237,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #ifdef NEED_MAXLOC
        localmem_maxloc[lid] = maxloc;
+#endif
+#ifdef OP_CALC2
+        localmem_max2[lid] = maxval2;
 #endif
    }
    barrier(CLK_LOCAL_MEM_FENCE);
@@ -221,6 +270,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
                localmem_max[lid3] = maxval;
        }
+#endif
+#ifdef OP_CALC2
+        if (localmem_max2[lid3] < maxval2)
+            localmem_max2[lid3] = maxval2;
 #endif
    }
    barrier(CLK_LOCAL_MEM_FENCE);
@@ -254,6 +307,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
                    localmem_max[lid] = localmem_max[lid2];
            }
+#endif
+#ifdef OP_CALC2
+            if (localmem_max2[lid] < localmem_max2[lid2])
+                localmem_max2[lid] = localmem_max2[lid2];
 #endif
        }
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -276,6 +333,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #ifdef NEED_MAXLOC
        *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
+#endif
+#ifdef OP_CALC2
+        pos = mad24(groupnum, (int)sizeof(uint), pos);
+        *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
 #endif
    }
 }
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -109,13 +109,22 @@
 #endif

 #ifdef HAVE_MASK
+#ifdef HAVE_SRC2
+#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset
+#else
 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
+#endif
+#else
+#ifdef HAVE_SRC2
+#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset
 #else
 #define EXTRA_PARAMS
 #endif
+#endif

 // accumulative reduction stuff
 #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
+
 #ifdef OP_DOT
 #if ddepth <= 4
 #define FUNC(a, b, c) a = mad24(b, c, a)
@@ -137,18 +146,48 @@
 #endif
 #endif

+#ifdef OP_CALC2
+#define DECLARE_LOCAL_MEM \
+    __local dstT localmem[WGS2_ALIGNED]; \
+    __local dstT localmem2[WGS2_ALIGNED]
+#define DEFINE_ACCUMULATOR \
+    dstT accumulator = (dstT)(0); \
+    dstT accumulator2 = (dstT)(0)
+#else
 #define DECLARE_LOCAL_MEM \
    __local dstT localmem[WGS2_ALIGNED]
 #define DEFINE_ACCUMULATOR \
    dstT accumulator = (dstT)(0)
+#endif
+
+#ifdef HAVE_SRC2
+#ifdef OP_CALC2
+#define PROCESS_ELEMS /* accumulate |src - src2| and src2 through FUNC */ \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp -= temp2; /* src2 is subtracted exactly once */ \
+    temp = temp > (dstT)(0) ? temp : -temp; \
+    FUNC(accumulator2, temp2); \
+    FUNC(accumulator, temp)
+#else
+#define PROCESS_ELEMS \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp)
+#endif
+#else
+#define PROCESS_ELEMS \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+    FUNC(accumulator, temp)
+#endif

 #ifdef HAVE_MASK
 #define REDUCE_GLOBAL \
    MASK_INDEX; \
    if (mask[mask_index]) \
    { \
-        dstT temp = convertToDT(loadpix(srcptr + src_index)); \
-        FUNC(accumulator, temp); \
+        PROCESS_ELEMS; \
    }
 #elif defined OP_DOT

@@ -211,7 +250,158 @@
    FUNC(accumulator, temp.sF, temp2.sF)
 #endif

-#else
+#else // sum or norm with 2 args
+#ifdef HAVE_SRC2
+#ifdef OP_CALC2 // norm relative
+#if kercn == 1
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp); \
+    FUNC(accumulator2, temp2)
+#elif kercn == 2
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1)
+#elif kercn == 4
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3)
+#elif kercn == 8
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3); \
+    FUNC(accumulator2, temp2.s4); \
+    FUNC(accumulator2, temp2.s5); \
+    FUNC(accumulator2, temp2.s6); \
+    FUNC(accumulator2, temp2.s7)
+#elif kercn == 16
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator, temp.s8); \
+    FUNC(accumulator, temp.s9); \
+    FUNC(accumulator, temp.sA); \
+    FUNC(accumulator, temp.sB); \
+    FUNC(accumulator, temp.sC); \
+    FUNC(accumulator, temp.sD); \
+    FUNC(accumulator, temp.sE); \
+    FUNC(accumulator, temp.sF); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3); \
+    FUNC(accumulator2, temp2.s4); \
+    FUNC(accumulator2, temp2.s5); \
+    FUNC(accumulator2, temp2.s6); \
+    FUNC(accumulator2, temp2.s7); \
+    FUNC(accumulator2, temp2.s8); \
+    FUNC(accumulator2, temp2.s9); \
+    FUNC(accumulator2, temp2.sA); \
+    FUNC(accumulator2, temp2.sB); \
+    FUNC(accumulator2, temp2.sC); \
+    FUNC(accumulator2, temp2.sD); \
+    FUNC(accumulator2, temp2.sE); \
+    FUNC(accumulator2, temp2.sF)
+#endif
+#else // norm with 2 args
+#if kercn == 1
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp)
+#elif kercn == 2
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1)
+#elif kercn == 4
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3)
+#elif kercn == 8
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7)
+#elif kercn == 16
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator, temp.s8); \
+    FUNC(accumulator, temp.s9); \
+    FUNC(accumulator, temp.sA); \
+    FUNC(accumulator, temp.sB); \
+    FUNC(accumulator, temp.sC); \
+    FUNC(accumulator, temp.sD); \
+    FUNC(accumulator, temp.sE); \
+    FUNC(accumulator, temp.sF)
+#endif
+#endif
+
+#else // sum
 #if kercn == 1
 #define REDUCE_GLOBAL \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
@@ -260,6 +450,7 @@
    FUNC(accumulator, temp.sF)
 #endif
 #endif
+#endif

 #define SET_LOCAL_1 \
    localmem[lid] = accumulator
@@ -325,6 +516,20 @@
    accumulator += value.sF == zero ? zero : one
 #endif

+#ifdef OP_CALC2
+#define SET_LOCAL_1 /* store both accumulators into local memory */ \
+    localmem[lid] = accumulator; \
+    localmem2[lid] = accumulator2
+#define REDUCE_LOCAL_1 \
+    localmem[lid - WGS2_ALIGNED] += accumulator; \
+    localmem2[lid - WGS2_ALIGNED] += accumulator2
+#define REDUCE_LOCAL_2 \
+    localmem[lid] += localmem[lid2]; \
+    localmem2[lid] += localmem2[lid2]
+#define CALC_RESULT /* second block starts after groupnum dstTSIZE-sized entries */ \
+    storepix(localmem[0], dstptr + dstTSIZE * gid); \
+    storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))
+#else
 #define SET_LOCAL_1 \
    localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -333,6 +538,7 @@
    localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
    storepix(localmem[0], dstptr + dstTSIZE * gid)
+#endif

 // norm (NORM_INF) with cn > 1 and mask
 #elif defined OP_NORM_INF_MASK
@@ -384,6 +590,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
        int src_index = mul24(id, srcTSIZE);
 #else
        int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
+#endif
+#ifdef HAVE_SRC2
+#ifdef HAVE_SRC2_CONT
+        int src2_index = mul24(id, srcTSIZE);
+#else
+        int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
+#endif
 #endif
        REDUCE_GLOBAL;
    }

--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp