Commit 20409958 authored by Ilya Lavrenov

optimized cv::norm with 2 args

parent 1a7a262f
...@@ -73,14 +73,26 @@ ...@@ -73,14 +73,26 @@
#define CALC_MAX(p, inc) #define CALC_MAX(p, inc)
#endif #endif
#ifdef OP_CALC2
// Vector-lane counterpart of the scalar (kercn == 1) maxval2 update: keeps the
// largest absolute value seen so far in lane `p` of the second source (temp2).
// The expansion is a braced block so it terminates itself inside CALC_P, which
// is invoked without a trailing semicolon.
#define CALC_MAX2(p) \
    { \
        dstT1 absval2 = temp2.p >= (dstT1)(0) ? temp2.p : -temp2.p; \
        if (maxval2 < absval2) \
            maxval2 = absval2; \
    }
#else
#define CALC_MAX2(p)
#endif
#define CALC_P(p, inc) \ #define CALC_P(p, inc) \
CALC_MIN(p, inc) \ CALC_MIN(p, inc) \
CALC_MAX(p, inc) CALC_MAX(p, inc) \
CALC_MAX2(p)
__kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols, __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,
int total, int groupnum, __global uchar * dstptr int total, int groupnum, __global uchar * dstptr
#ifdef HAVE_MASK #ifdef HAVE_MASK
, __global const uchar * mask, int mask_step, int mask_offset , __global const uchar * mask, int mask_step, int mask_offset
#endif
#ifdef HAVE_SRC2
, __global const uchar * src2ptr, int src2_step, int src2_offset
#endif #endif
) )
{ {
...@@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off ...@@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#ifdef HAVE_MASK #ifdef HAVE_MASK
mask += mask_offset; mask += mask_offset;
#endif #endif
#ifdef HAVE_SRC2
src2ptr += src2_offset;
#endif
#ifdef NEED_MINVAL #ifdef NEED_MINVAL
__local dstT1 localmem_min[WGS2_ALIGNED]; __local dstT1 localmem_min[WGS2_ALIGNED];
dstT1 minval = MAX_VAL;
#ifdef NEED_MINLOC #ifdef NEED_MINLOC
__local uint localmem_minloc[WGS2_ALIGNED]; __local uint localmem_minloc[WGS2_ALIGNED];
uint minloc = INDEX_MAX;
#endif #endif
#endif #endif
#ifdef NEED_MAXVAL #ifdef NEED_MAXVAL
dstT1 maxval = MIN_VAL;
__local dstT1 localmem_max[WGS2_ALIGNED]; __local dstT1 localmem_max[WGS2_ALIGNED];
#ifdef NEED_MAXLOC #ifdef NEED_MAXLOC
__local uint localmem_maxloc[WGS2_ALIGNED]; __local uint localmem_maxloc[WGS2_ALIGNED];
uint maxloc = INDEX_MAX;
#endif
#endif #endif
#ifdef OP_CALC2
__local dstT1 localmem_max2[WGS2_ALIGNED];
dstT1 maxval2 = MIN_VAL;
#endif #endif
dstT1 minval = MAX_VAL, maxval = MIN_VAL;
dstT temp;
uint minloc = INDEX_MAX, maxloc = INDEX_MAX;
int src_index; int src_index;
#ifdef HAVE_MASK #ifdef HAVE_MASK
int mask_index; int mask_index;
#endif #endif
#ifdef HAVE_SRC2
int src2_index;
#endif
for (int grain = groupnum * WGS * kercn; id < total; id += grain) dstT temp;
{ #ifdef HAVE_SRC2
#ifdef HAVE_SRC_CONT dstT temp2;
src_index = mul24(id, (int)sizeof(srcT1));
#else
src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
#endif #endif
for (int grain = groupnum * WGS * kercn; id < total; id += grain)
{
#ifdef HAVE_MASK #ifdef HAVE_MASK
#ifdef HAVE_MASK_CONT #ifdef HAVE_MASK_CONT
mask_index = id; mask_index = id;
...@@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off ...@@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
if (mask[mask_index]) if (mask[mask_index])
#endif #endif
{ {
#ifdef HAVE_SRC_CONT
src_index = mul24(id, (int)sizeof(srcT1));
#else
src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
#endif
temp = convertToDT(*(__global const srcT *)(srcptr + src_index)); temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
#ifdef OP_ABS
temp = temp >= (dstT)(0) ? temp : -temp;
#endif
#ifdef HAVE_SRC2
#ifdef HAVE_SRC2_CONT
src2_index = mul24(id, (int)sizeof(srcT1));
#else
src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
#endif
temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
#endif
#if kercn == 1 #if kercn == 1
#ifdef NEED_MINVAL #ifdef NEED_MINVAL
if (minval > temp) if (minval > temp)
...@@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off ...@@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
maxloc = id; maxloc = id;
#endif #endif
} }
#ifdef OP_CALC2
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
if (maxval2 < temp2)
maxval2 = temp2;
#endif
#endif #endif
#elif kercn >= 2 #elif kercn >= 2
CALC_P(s0, 0) CALC_P(s0, 0)
...@@ -191,6 +237,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off ...@@ -191,6 +237,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif #endif
#ifdef NEED_MAXLOC #ifdef NEED_MAXLOC
localmem_maxloc[lid] = maxloc; localmem_maxloc[lid] = maxloc;
#endif
#ifdef OP_CALC2
localmem_max2[lid] = maxval2;
#endif #endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -221,6 +270,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off ...@@ -221,6 +270,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif #endif
localmem_max[lid3] = maxval; localmem_max[lid3] = maxval;
} }
#endif
#ifdef OP_CALC2
if (localmem_max2[lid3] < maxval2)
localmem_max2[lid3] = maxval2;
#endif #endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -254,6 +307,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off ...@@ -254,6 +307,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif #endif
localmem_max[lid] = localmem_max[lid2]; localmem_max[lid] = localmem_max[lid2];
} }
#endif
#ifdef OP_CALC2
if (localmem_max2[lid] < localmem_max2[lid2])
localmem_max2[lid] = localmem_max2[lid2];
#endif #endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -276,6 +333,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off ...@@ -276,6 +333,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif #endif
#ifdef NEED_MAXLOC #ifdef NEED_MAXLOC
*(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0]; *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
#endif
#ifdef OP_CALC2
pos = mad24(groupnum, (int)sizeof(uint), pos);
*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
#endif #endif
} }
} }
...@@ -109,13 +109,22 @@ ...@@ -109,13 +109,22 @@
#endif #endif
#ifdef HAVE_MASK #ifdef HAVE_MASK
#ifdef HAVE_SRC2
#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset
#else
#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
#endif
#else
#ifdef HAVE_SRC2
#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset
#else #else
#define EXTRA_PARAMS #define EXTRA_PARAMS
#endif #endif
#endif
// accumulative reduction stuff // accumulative reduction stuff
#if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
#ifdef OP_DOT #ifdef OP_DOT
#if ddepth <= 4 #if ddepth <= 4
#define FUNC(a, b, c) a = mad24(b, c, a) #define FUNC(a, b, c) a = mad24(b, c, a)
...@@ -137,18 +146,48 @@ ...@@ -137,18 +146,48 @@
#endif #endif
#endif #endif
#ifdef OP_CALC2
// OP_CALC2 variant: declares two __local scratch arrays of WGS2_ALIGNED dstT
// elements each (localmem and localmem2), one per running result.
#define DECLARE_LOCAL_MEM \
__local dstT localmem[WGS2_ALIGNED]; \
__local dstT localmem2[WGS2_ALIGNED]
// OP_CALC2 variant: two private accumulators, both zero-initialized.
// NOTE(review): presumably accumulator holds the difference term and
// accumulator2 the src2 term of a relative norm — confirm against the host code.
#define DEFINE_ACCUMULATOR \
dstT accumulator = (dstT)(0); \
dstT accumulator2 = (dstT)(0)
#else
#define DECLARE_LOCAL_MEM \ #define DECLARE_LOCAL_MEM \
__local dstT localmem[WGS2_ALIGNED] __local dstT localmem[WGS2_ALIGNED]
#define DEFINE_ACCUMULATOR \ #define DEFINE_ACCUMULATOR \
dstT accumulator = (dstT)(0) dstT accumulator = (dstT)(0)
#endif
#ifdef HAVE_SRC2
#ifdef OP_CALC2
// Two-source, two-result element step (HAVE_SRC2 && OP_CALC2).
// Loads one element from each source, folds the src2 value into accumulator2
// and the absolute difference |src1 - src2| into accumulator, both via FUNC.
// The difference uses the compare-and-subtract form so src2 is subtracted
// exactly once and no negation is needed.
#define PROCESS_ELEMS \
    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
    FUNC(accumulator2, temp2); \
    FUNC(accumulator, temp)
#else
#define PROCESS_ELEMS \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp)
#endif
#else
#define PROCESS_ELEMS \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp)
#endif
#ifdef HAVE_MASK #ifdef HAVE_MASK
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
MASK_INDEX; \ MASK_INDEX; \
if (mask[mask_index]) \ if (mask[mask_index]) \
{ \ { \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \ PROCESS_ELEMS; \
FUNC(accumulator, temp); \
} }
#elif defined OP_DOT #elif defined OP_DOT
...@@ -211,7 +250,158 @@ ...@@ -211,7 +250,158 @@
FUNC(accumulator, temp.sF, temp2.sF) FUNC(accumulator, temp.sF, temp2.sF)
#endif #endif
#else #else // sum or norm with 2 args
#ifdef HAVE_SRC2
// REDUCE_GLOBAL for the two-source case, one variant per supported kercn
// (1, 2, 4, 8, 16); no variant is defined here for any other kercn value.
// Every variant loads one dstTK value from each source, replaces temp with the
// absolute difference of the two (compare-and-subtract, no negation), and then
// feeds the result to FUNC lane by lane.
#ifdef OP_CALC2 // norm relative
// Relative norm: besides the difference lanes (-> accumulator), the lanes of
// the second source are fed to FUNC as well (-> accumulator2). Whatever
// abs/square step applies to temp2 is up to FUNC, which is defined outside this view.
#if kercn == 1
// Scalar: temp and temp2 are passed to FUNC whole.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp); \
FUNC(accumulator2, temp2)
#elif kercn == 2
// 2 lanes: .s0 and .s1 of each value.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator2, temp2.s0); \
FUNC(accumulator2, temp2.s1)
#elif kercn == 4
// 4 lanes: .s0 through .s3 of each value.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator2, temp2.s0); \
FUNC(accumulator2, temp2.s1); \
FUNC(accumulator2, temp2.s2); \
FUNC(accumulator2, temp2.s3)
#elif kercn == 8
// 8 lanes: .s0 through .s7 of each value.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator, temp.s4); \
FUNC(accumulator, temp.s5); \
FUNC(accumulator, temp.s6); \
FUNC(accumulator, temp.s7); \
FUNC(accumulator2, temp2.s0); \
FUNC(accumulator2, temp2.s1); \
FUNC(accumulator2, temp2.s2); \
FUNC(accumulator2, temp2.s3); \
FUNC(accumulator2, temp2.s4); \
FUNC(accumulator2, temp2.s5); \
FUNC(accumulator2, temp2.s6); \
FUNC(accumulator2, temp2.s7)
#elif kercn == 16
// 16 lanes: .s0 through .sF of each value.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator, temp.s4); \
FUNC(accumulator, temp.s5); \
FUNC(accumulator, temp.s6); \
FUNC(accumulator, temp.s7); \
FUNC(accumulator, temp.s8); \
FUNC(accumulator, temp.s9); \
FUNC(accumulator, temp.sA); \
FUNC(accumulator, temp.sB); \
FUNC(accumulator, temp.sC); \
FUNC(accumulator, temp.sD); \
FUNC(accumulator, temp.sE); \
FUNC(accumulator, temp.sF); \
FUNC(accumulator2, temp2.s0); \
FUNC(accumulator2, temp2.s1); \
FUNC(accumulator2, temp2.s2); \
FUNC(accumulator2, temp2.s3); \
FUNC(accumulator2, temp2.s4); \
FUNC(accumulator2, temp2.s5); \
FUNC(accumulator2, temp2.s6); \
FUNC(accumulator2, temp2.s7); \
FUNC(accumulator2, temp2.s8); \
FUNC(accumulator2, temp2.s9); \
FUNC(accumulator2, temp2.sA); \
FUNC(accumulator2, temp2.sB); \
FUNC(accumulator2, temp2.sC); \
FUNC(accumulator2, temp2.sD); \
FUNC(accumulator2, temp2.sE); \
FUNC(accumulator2, temp2.sF)
#endif
#else // norm with 2 args
// Plain two-source norm: only the difference lanes are accumulated; temp2 is
// used solely to form the absolute difference.
#if kercn == 1
// Scalar: temp is passed to FUNC whole.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp)
#elif kercn == 2
// 2 lanes: .s0 and .s1.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1)
#elif kercn == 4
// 4 lanes: .s0 through .s3.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3)
#elif kercn == 8
// 8 lanes: .s0 through .s7.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator, temp.s4); \
FUNC(accumulator, temp.s5); \
FUNC(accumulator, temp.s6); \
FUNC(accumulator, temp.s7)
#elif kercn == 16
// 16 lanes: .s0 through .sF.
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator, temp.s4); \
FUNC(accumulator, temp.s5); \
FUNC(accumulator, temp.s6); \
FUNC(accumulator, temp.s7); \
FUNC(accumulator, temp.s8); \
FUNC(accumulator, temp.s9); \
FUNC(accumulator, temp.sA); \
FUNC(accumulator, temp.sB); \
FUNC(accumulator, temp.sC); \
FUNC(accumulator, temp.sD); \
FUNC(accumulator, temp.sE); \
FUNC(accumulator, temp.sF)
#endif
#endif
#else // sum
#if kercn == 1 #if kercn == 1
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
...@@ -260,6 +450,7 @@ ...@@ -260,6 +450,7 @@
FUNC(accumulator, temp.sF) FUNC(accumulator, temp.sF)
#endif #endif
#endif #endif
#endif
#define SET_LOCAL_1 \ #define SET_LOCAL_1 \
localmem[lid] = accumulator localmem[lid] = accumulator
...@@ -325,6 +516,20 @@ ...@@ -325,6 +516,20 @@
accumulator += value.sF == zero ? zero : one accumulator += value.sF == zero ? zero : one
#endif #endif
#ifdef OP_CALC2
// OP_CALC2 local-reduction helpers: each step is applied to both running
// results (localmem/accumulator and localmem2/accumulator2).

// Stores this work-item's two accumulators into its local-memory slots.
// No trailing line continuation after the last statement: one there would
// swallow the following #define into this macro.
#define SET_LOCAL_1 \
    localmem[lid] = accumulator; \
    localmem2[lid] = accumulator2
// Adds this work-item's accumulators onto the slots WGS2_ALIGNED positions
// below its own index.
#define REDUCE_LOCAL_1 \
    localmem[lid - WGS2_ALIGNED] += accumulator; \
    localmem2[lid - WGS2_ALIGNED] += accumulator2
// Adds slot lid2 onto slot lid in both local arrays.
#define REDUCE_LOCAL_2 \
    localmem[lid] += localmem[lid2]; \
    localmem2[lid] += localmem2[lid2]
// Writes both group results to dstptr. The first region holds one dstT-sized
// value per group (groupnum * dstTSIZE bytes); the second result is placed at
// the same per-group offset immediately after that region, so the region size
// must be computed with dstTSIZE as well.
#define CALC_RESULT \
    storepix(localmem[0], dstptr + dstTSIZE * gid); \
    storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))
#else
#define SET_LOCAL_1 \ #define SET_LOCAL_1 \
localmem[lid] = accumulator localmem[lid] = accumulator
#define REDUCE_LOCAL_1 \ #define REDUCE_LOCAL_1 \
...@@ -333,6 +538,7 @@ ...@@ -333,6 +538,7 @@
localmem[lid] += localmem[lid2] localmem[lid] += localmem[lid2]
#define CALC_RESULT \ #define CALC_RESULT \
storepix(localmem[0], dstptr + dstTSIZE * gid) storepix(localmem[0], dstptr + dstTSIZE * gid)
#endif
// norm (NORM_INF) with cn > 1 and mask // norm (NORM_INF) with cn > 1 and mask
#elif defined OP_NORM_INF_MASK #elif defined OP_NORM_INF_MASK
...@@ -384,6 +590,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset ...@@ -384,6 +590,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
int src_index = mul24(id, srcTSIZE); int src_index = mul24(id, srcTSIZE);
#else #else
int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE)); int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
#endif
#ifdef HAVE_SRC2
#ifdef HAVE_SRC2_CONT
int src2_index = mul24(id, srcTSIZE);
#else
int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
#endif
#endif #endif
REDUCE_GLOBAL; REDUCE_GLOBAL;
} }
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment