Commit 20409958 authored by Ilya Lavrenov

optimized cv::norm with 2 args

parent 1a7a262f
......@@ -73,14 +73,26 @@
#define CALC_MAX(p, inc)
#endif
#ifdef OP_CALC2
// Updates maxval2 with the absolute value of component `p` of temp2, matching what the
// kercn == 1 path does for the scalar case (abs of temp2, then compare with maxval2).
// The statement is self-terminated because CALC_P(...) is invoked without a trailing ';'.
#define CALC_MAX2(p) \
    if (maxval2 < (temp2.p >= 0 ? temp2.p : -temp2.p)) \
        maxval2 = temp2.p >= 0 ? temp2.p : -temp2.p;
#else
#define CALC_MAX2(p)
#endif
// Per-component update for the vectorized path: runs the min, max and secondary-max
// steps for component `p`; `inc` is forwarded to CALC_MIN / CALC_MAX unchanged.
#define CALC_P(p, inc) \
    CALC_MIN(p, inc) \
    CALC_MAX(p, inc) \
    CALC_MAX2(p)
__kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,
int total, int groupnum, __global uchar * dstptr
#ifdef HAVE_MASK
, __global const uchar * mask, int mask_step, int mask_offset
#endif
#ifdef HAVE_SRC2
, __global const uchar * src2ptr, int src2_step, int src2_offset
#endif
)
{
......@@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#ifdef HAVE_MASK
mask += mask_offset;
#endif
#ifdef HAVE_SRC2
src2ptr += src2_offset;
#endif
#ifdef NEED_MINVAL
__local dstT1 localmem_min[WGS2_ALIGNED];
dstT1 minval = MAX_VAL;
#ifdef NEED_MINLOC
__local uint localmem_minloc[WGS2_ALIGNED];
uint minloc = INDEX_MAX;
#endif
#endif
#ifdef NEED_MAXVAL
dstT1 maxval = MIN_VAL;
__local dstT1 localmem_max[WGS2_ALIGNED];
#ifdef NEED_MAXLOC
__local uint localmem_maxloc[WGS2_ALIGNED];
uint maxloc = INDEX_MAX;
#endif
#endif
#ifdef OP_CALC2
__local dstT1 localmem_max2[WGS2_ALIGNED];
dstT1 maxval2 = MIN_VAL;
#endif
dstT1 minval = MAX_VAL, maxval = MIN_VAL;
dstT temp;
uint minloc = INDEX_MAX, maxloc = INDEX_MAX;
int src_index;
#ifdef HAVE_MASK
int mask_index;
#endif
#ifdef HAVE_SRC2
int src2_index;
#endif
for (int grain = groupnum * WGS * kercn; id < total; id += grain)
{
#ifdef HAVE_SRC_CONT
src_index = mul24(id, (int)sizeof(srcT1));
#else
src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
dstT temp;
#ifdef HAVE_SRC2
dstT temp2;
#endif
for (int grain = groupnum * WGS * kercn; id < total; id += grain)
{
#ifdef HAVE_MASK
#ifdef HAVE_MASK_CONT
mask_index = id;
......@@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
if (mask[mask_index])
#endif
{
#ifdef HAVE_SRC_CONT
src_index = mul24(id, (int)sizeof(srcT1));
#else
src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
#endif
temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
#ifdef OP_ABS
temp = temp >= (dstT)(0) ? temp : -temp;
#endif
#ifdef HAVE_SRC2
#ifdef HAVE_SRC2_CONT
src2_index = mul24(id, (int)sizeof(srcT1));
#else
src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
#endif
temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
#endif
#if kercn == 1
#ifdef NEED_MINVAL
if (minval > temp)
......@@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
maxloc = id;
#endif
}
#ifdef OP_CALC2
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
if (maxval2 < temp2)
maxval2 = temp2;
#endif
#endif
#elif kercn >= 2
CALC_P(s0, 0)
......@@ -191,6 +237,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif
#ifdef NEED_MAXLOC
localmem_maxloc[lid] = maxloc;
#endif
#ifdef OP_CALC2
localmem_max2[lid] = maxval2;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -221,6 +270,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif
localmem_max[lid3] = maxval;
}
#endif
#ifdef OP_CALC2
if (localmem_max2[lid3] < maxval2)
localmem_max2[lid3] = maxval2;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -254,6 +307,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif
localmem_max[lid] = localmem_max[lid2];
}
#endif
#ifdef OP_CALC2
if (localmem_max2[lid] < localmem_max2[lid2])
localmem_max2[lid] = localmem_max2[lid2];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -276,6 +333,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif
#ifdef NEED_MAXLOC
*(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
#endif
#ifdef OP_CALC2
pos = mad24(groupnum, (int)sizeof(uint), pos);
*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
#endif
}
}
......@@ -109,13 +109,22 @@
#endif
// Optional trailing kernel parameters. Each fragment starts with a comma so it can be
// appended directly after the fixed parameter list; the mask triple (when HAVE_MASK is
// defined) always precedes the second-source triple (when HAVE_SRC2 is defined).
#ifdef HAVE_MASK
#define EXTRA_PARAMS_MASK , __global const uchar * mask, int mask_step, int mask_offset
#else
#define EXTRA_PARAMS_MASK
#endif

#ifdef HAVE_SRC2
#define EXTRA_PARAMS_SRC2 , __global const uchar * src2ptr, int src2_step, int src2_offset
#else
#define EXTRA_PARAMS_SRC2
#endif

#define EXTRA_PARAMS EXTRA_PARAMS_MASK EXTRA_PARAMS_SRC2
// accumulative reduction stuff
#if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
#ifdef OP_DOT
#if ddepth <= 4
#define FUNC(a, b, c) a = mad24(b, c, a)
......@@ -137,18 +146,48 @@
#endif
#endif
// Work-group scratch storage and per-work-item accumulators.
// The base forms declare one __local array of WGS2_ALIGNED dstT elements and one
// zero-initialized dstT accumulator; with OP_CALC2 a second array (localmem2) and a
// second accumulator (accumulator2) are declared right after them.
#define DECLARE_LOCAL_MEM_BASE \
    __local dstT localmem[WGS2_ALIGNED]
#define DEFINE_ACCUMULATOR_BASE \
    dstT accumulator = (dstT)(0)

#ifndef OP_CALC2
#define DECLARE_LOCAL_MEM DECLARE_LOCAL_MEM_BASE
#define DEFINE_ACCUMULATOR DEFINE_ACCUMULATOR_BASE
#else
#define DECLARE_LOCAL_MEM \
    DECLARE_LOCAL_MEM_BASE; \
    __local dstT localmem2[WGS2_ALIGNED]
#define DEFINE_ACCUMULATOR \
    DEFINE_ACCUMULATOR_BASE; \
    dstT accumulator2 = (dstT)(0)
#endif
#ifdef HAVE_SRC2
#ifdef OP_CALC2
// Two-source variant with OP_CALC2: loads one value from each source, replaces temp
// with their absolute difference, then accumulates the second-source value into
// accumulator2 and the difference into accumulator.
// The second source must be subtracted exactly once; the difference uses the same
// "larger minus smaller" form as the other two-source macros in this file.
#define PROCESS_ELEMS \
    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
    FUNC(accumulator2, temp2); \
    FUNC(accumulator, temp)
#else
// Two-source variant without OP_CALC2: loads one value from each source, replaces temp
// with the larger value minus the smaller one (their absolute difference) and passes it
// to FUNC together with the accumulator.
#define PROCESS_ELEMS \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp)
#endif
#else
// Single-source variant: loads one value at srcptr + src_index, converts it with
// convertToDT and passes it to FUNC together with the accumulator.
#define PROCESS_ELEMS \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp)
#endif
#ifdef HAVE_MASK
// Masked reduction step: MASK_INDEX runs first (it is defined outside this view and is
// expected to provide mask_index), and the element is processed only when its mask byte
// is non-zero. All loading and accumulation is delegated to PROCESS_ELEMS so the element
// is accumulated exactly once and `temp` is declared only once in the guarded scope.
#define REDUCE_GLOBAL \
    MASK_INDEX; \
    if (mask[mask_index]) \
    { \
        PROCESS_ELEMS; \
    }
#elif defined OP_DOT
......@@ -211,7 +250,158 @@
FUNC(accumulator, temp.sF, temp2.sF)
#endif
#else
#else // sum or norm with 2 args
#ifdef HAVE_SRC2
#ifdef OP_CALC2 // norm relative
// Relative norm (two sources, OP_CALC2).
// Every variant loads one dstTK value from each source, replaces temp with
// "larger minus smaller" of the two, then feeds the lanes of temp into `accumulator`
// followed by the lanes of temp2 into `accumulator2`, in ascending lane order.
#define NORMREL_LOAD \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp)

// Lane-wise accumulation helpers; each wider helper extends the previous one.
#define NORMREL_ACC2(acc, v) \
    FUNC(acc, v.s0); \
    FUNC(acc, v.s1)
#define NORMREL_ACC4(acc, v) \
    NORMREL_ACC2(acc, v); \
    FUNC(acc, v.s2); \
    FUNC(acc, v.s3)
#define NORMREL_ACC8(acc, v) \
    NORMREL_ACC4(acc, v); \
    FUNC(acc, v.s4); \
    FUNC(acc, v.s5); \
    FUNC(acc, v.s6); \
    FUNC(acc, v.s7)
#define NORMREL_ACC16(acc, v) \
    NORMREL_ACC8(acc, v); \
    FUNC(acc, v.s8); \
    FUNC(acc, v.s9); \
    FUNC(acc, v.sA); \
    FUNC(acc, v.sB); \
    FUNC(acc, v.sC); \
    FUNC(acc, v.sD); \
    FUNC(acc, v.sE); \
    FUNC(acc, v.sF)

#if kercn == 1
#define REDUCE_GLOBAL \
    NORMREL_LOAD; \
    FUNC(accumulator, temp); \
    FUNC(accumulator2, temp2)
#elif kercn == 2
#define REDUCE_GLOBAL \
    NORMREL_LOAD; \
    NORMREL_ACC2(accumulator, temp); \
    NORMREL_ACC2(accumulator2, temp2)
#elif kercn == 4
#define REDUCE_GLOBAL \
    NORMREL_LOAD; \
    NORMREL_ACC4(accumulator, temp); \
    NORMREL_ACC4(accumulator2, temp2)
#elif kercn == 8
#define REDUCE_GLOBAL \
    NORMREL_LOAD; \
    NORMREL_ACC8(accumulator, temp); \
    NORMREL_ACC8(accumulator2, temp2)
#elif kercn == 16
#define REDUCE_GLOBAL \
    NORMREL_LOAD; \
    NORMREL_ACC16(accumulator, temp); \
    NORMREL_ACC16(accumulator2, temp2)
#endif
#else // norm with 2 args
// Norm of the difference of two sources (no OP_CALC2).
// Every variant loads one dstTK value from each source, replaces temp with
// "larger minus smaller" of the two, then feeds the lanes of temp into `accumulator`
// in ascending lane order.
#define NORMDIFF_LOAD \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    temp = temp > temp2 ? temp - temp2 : (temp2 - temp)

// Lane-wise accumulation helpers; each wider helper extends the previous one.
#define NORMDIFF_ACC2(v) \
    FUNC(accumulator, v.s0); \
    FUNC(accumulator, v.s1)
#define NORMDIFF_ACC4(v) \
    NORMDIFF_ACC2(v); \
    FUNC(accumulator, v.s2); \
    FUNC(accumulator, v.s3)
#define NORMDIFF_ACC8(v) \
    NORMDIFF_ACC4(v); \
    FUNC(accumulator, v.s4); \
    FUNC(accumulator, v.s5); \
    FUNC(accumulator, v.s6); \
    FUNC(accumulator, v.s7)
#define NORMDIFF_ACC16(v) \
    NORMDIFF_ACC8(v); \
    FUNC(accumulator, v.s8); \
    FUNC(accumulator, v.s9); \
    FUNC(accumulator, v.sA); \
    FUNC(accumulator, v.sB); \
    FUNC(accumulator, v.sC); \
    FUNC(accumulator, v.sD); \
    FUNC(accumulator, v.sE); \
    FUNC(accumulator, v.sF)

#if kercn == 1
#define REDUCE_GLOBAL \
    NORMDIFF_LOAD; \
    FUNC(accumulator, temp)
#elif kercn == 2
#define REDUCE_GLOBAL \
    NORMDIFF_LOAD; \
    NORMDIFF_ACC2(temp)
#elif kercn == 4
#define REDUCE_GLOBAL \
    NORMDIFF_LOAD; \
    NORMDIFF_ACC4(temp)
#elif kercn == 8
#define REDUCE_GLOBAL \
    NORMDIFF_LOAD; \
    NORMDIFF_ACC8(temp)
#elif kercn == 16
#define REDUCE_GLOBAL \
    NORMDIFF_LOAD; \
    NORMDIFF_ACC16(temp)
#endif
#endif
#else // sum
#if kercn == 1
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
......@@ -260,6 +450,7 @@
FUNC(accumulator, temp.sF)
#endif
#endif
#endif
#define SET_LOCAL_1 \
localmem[lid] = accumulator
......@@ -325,6 +516,20 @@
accumulator += value.sF == zero ? zero : one
#endif
#ifdef OP_CALC2
// OP_CALC2 reduction helpers: every step is applied to both the primary pair
// (localmem / accumulator) and the secondary pair (localmem2 / accumulator2).

// Seeds this work-item's slot in both local arrays.
// No trailing line continuation after the last statement: a dangling '\' would splice
// the following #define into this macro's body.
#define SET_LOCAL_1 \
    localmem[lid] = accumulator; \
    localmem2[lid] = accumulator2
// Folds this work-item's accumulators into the slot WGS2_ALIGNED positions below lid.
#define REDUCE_LOCAL_1 \
    localmem[lid - WGS2_ALIGNED] += accumulator; \
    localmem2[lid - WGS2_ALIGNED] += accumulator2
// Adds slot lid2 into slot lid in both local arrays.
#define REDUCE_LOCAL_2 \
    localmem[lid] += localmem[lid2]; \
    localmem2[lid] += localmem2[lid2]
// Stores both reduced values: the primary one at byte offset dstTSIZE * gid, the
// secondary one groupnum * dstTSIZE bytes further. The stride is dstTSIZE because the
// primary results are written with a dstTSIZE pitch; a smaller stride would overlap them
// (assumes gid ranges over [0, groupnum) - confirm against the host-side reader).
#define CALC_RESULT \
    storepix(localmem[0], dstptr + dstTSIZE * gid); \
    storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))
#else
#define SET_LOCAL_1 \
localmem[lid] = accumulator
#define REDUCE_LOCAL_1 \
......@@ -333,6 +538,7 @@
localmem[lid] += localmem[lid2]
#define CALC_RESULT \
storepix(localmem[0], dstptr + dstTSIZE * gid)
#endif
// norm (NORM_INF) with cn > 1 and mask
#elif defined OP_NORM_INF_MASK
......@@ -384,6 +590,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
int src_index = mul24(id, srcTSIZE);
#else
int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
#endif
#ifdef HAVE_SRC2
#ifdef HAVE_SRC2_CONT
int src2_index = mul24(id, srcTSIZE);
#else
int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
#endif
#endif
REDUCE_GLOBAL;
}
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment