Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
20409958
Commit
20409958
authored
Jun 07, 2014
by
Ilya Lavrenov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
optimized cv::norm with 2 args
parent
1a7a262f
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
287 additions
and
13 deletions
+287
-13
minmaxloc.cl
modules/core/src/opencl/minmaxloc.cl
+71
-10
reduce.cl
modules/core/src/opencl/reduce.cl
+216
-3
stat.cpp
modules/core/src/stat.cpp
+0
-0
No files found.
modules/core/src/opencl/minmaxloc.cl
View file @
20409958
...
...
@@ -73,14 +73,26 @@
#
define
CALC_MAX
(
p,
inc
)
#
endif
#
ifdef
OP_CALC2
#
define
CALC_MAX2
(
p
)
\
if
(
maxval2
<
temp.p
)
\
maxval2
=
temp.p
#
else
#
define
CALC_MAX2
(
p
)
#
endif
#
define
CALC_P
(
p,
inc
)
\
CALC_MIN
(
p,
inc
)
\
CALC_MAX
(
p,
inc
)
CALC_MAX
(
p,
inc
)
\
CALC_MAX2
(
p
)
__kernel
void
minmaxloc
(
__global
const
uchar
*
srcptr,
int
src_step,
int
src_offset,
int
cols,
int
total,
int
groupnum,
__global
uchar
*
dstptr
#
ifdef
HAVE_MASK
,
__global
const
uchar
*
mask,
int
mask_step,
int
mask_offset
#
endif
#
ifdef
HAVE_SRC2
,
__global
const
uchar
*
src2ptr,
int
src2_step,
int
src2_offset
#
endif
)
{
...
...
@@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#
ifdef
HAVE_MASK
mask
+=
mask_offset
;
#
endif
#
ifdef
HAVE_SRC2
src2ptr
+=
src2_offset
;
#
endif
#
ifdef
NEED_MINVAL
__local
dstT1
localmem_min[WGS2_ALIGNED]
;
dstT1
minval
=
MAX_VAL
;
#
ifdef
NEED_MINLOC
__local
uint
localmem_minloc[WGS2_ALIGNED]
;
uint
minloc
=
INDEX_MAX
;
#
endif
#
endif
#
ifdef
NEED_MAXVAL
dstT1
maxval
=
MIN_VAL
;
__local
dstT1
localmem_max[WGS2_ALIGNED]
;
#
ifdef
NEED_MAXLOC
__local
uint
localmem_maxloc[WGS2_ALIGNED]
;
uint
maxloc
=
INDEX_MAX
;
#
endif
#
endif
#
ifdef
OP_CALC2
__local
dstT1
localmem_max2[WGS2_ALIGNED]
;
dstT1
maxval2
=
MIN_VAL
;
#
endif
dstT1
minval
=
MAX_VAL,
maxval
=
MIN_VAL
;
dstT
temp
;
uint
minloc
=
INDEX_MAX,
maxloc
=
INDEX_MAX
;
int
src_index
;
#
ifdef
HAVE_MASK
int
mask_index
;
#
endif
#
ifdef
HAVE_SRC2
int
src2_index
;
#
endif
for
(
int
grain
=
groupnum
*
WGS
*
kercn
; id < total; id += grain)
{
#
ifdef
HAVE_SRC_CONT
src_index
=
mul24
(
id,
(
int
)
sizeof
(
srcT1
))
;
#
else
src_index
=
mad24
(
id
/
cols,
src_step,
mul24
(
id
%
cols,
(
int
)
sizeof
(
srcT1
)))
;
dstT
temp
;
#
ifdef
HAVE_SRC2
dstT
temp2
;
#
endif
for
(
int
grain
=
groupnum
*
WGS
*
kercn
; id < total; id += grain)
{
#
ifdef
HAVE_MASK
#
ifdef
HAVE_MASK_CONT
mask_index
=
id
;
...
...
@@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
if
(
mask[mask_index]
)
#
endif
{
#
ifdef
HAVE_SRC_CONT
src_index
=
mul24
(
id,
(
int
)
sizeof
(
srcT1
))
;
#
else
src_index
=
mad24
(
id
/
cols,
src_step,
mul24
(
id
%
cols,
(
int
)
sizeof
(
srcT1
)))
;
#
endif
temp
=
convertToDT
(
*
(
__global
const
srcT
*
)(
srcptr
+
src_index
))
;
#
ifdef
OP_ABS
temp
=
temp
>=
(
dstT
)(
0
)
?
temp
:
-temp
;
#
endif
#
ifdef
HAVE_SRC2
#
ifdef
HAVE_SRC2_CONT
src2_index
=
mul24
(
id,
(
int
)
sizeof
(
srcT1
))
;
#
else
src2_index
=
mad24
(
id
/
cols,
src2_step,
mul24
(
id
%
cols,
(
int
)
sizeof
(
srcT1
)))
;
#
endif
temp2
=
convertToDT
(
*
(
__global
const
srcT
*
)(
src2ptr
+
src2_index
))
;
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
;
#
endif
#
if
kercn
==
1
#
ifdef
NEED_MINVAL
if
(
minval
>
temp
)
...
...
@@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
maxloc
=
id
;
#
endif
}
#
ifdef
OP_CALC2
temp2
=
temp2
>=
(
dstT
)(
0
)
?
temp2
:
-temp2
;
if
(
maxval2
<
temp2
)
maxval2
=
temp2
;
#
endif
#
endif
#
elif
kercn
>=
2
CALC_P
(
s0,
0
)
...
...
@@ -191,6 +237,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#
endif
#
ifdef
NEED_MAXLOC
localmem_maxloc[lid]
=
maxloc
;
#
endif
#
ifdef
OP_CALC2
localmem_max2[lid]
=
maxval2
;
#
endif
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -221,6 +270,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#
endif
localmem_max[lid3]
=
maxval
;
}
#
endif
#
ifdef
OP_CALC2
if
(
localmem_max2[lid3]
<
maxval2
)
localmem_max2[lid3]
=
maxval2
;
#
endif
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -254,6 +307,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#
endif
localmem_max[lid]
=
localmem_max[lid2]
;
}
#
endif
#
ifdef
OP_CALC2
if
(
localmem_max2[lid]
<
localmem_max2[lid2]
)
localmem_max2[lid]
=
localmem_max2[lid2]
;
#
endif
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -276,6 +333,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#
endif
#
ifdef
NEED_MAXLOC
*
(
__global
uint
*
)(
dstptr
+
mad24
(
gid,
(
int
)
sizeof
(
uint
)
,
pos
))
=
localmem_maxloc[0]
;
#
endif
#
ifdef
OP_CALC2
pos
=
mad24
(
groupnum,
(
int
)
sizeof
(
uint
)
,
pos
)
;
*
(
__global
dstT1
*
)(
dstptr
+
mad24
(
gid,
(
int
)
sizeof
(
dstT1
)
,
pos
))
=
localmem_max2[0]
;
#
endif
}
}
modules/core/src/opencl/reduce.cl
View file @
20409958
...
...
@@ -109,13 +109,22 @@
#
endif
#
ifdef
HAVE_MASK
#
ifdef
HAVE_SRC2
#
define
EXTRA_PARAMS
,
__global
const
uchar
*
mask,
int
mask_step,
int
mask_offset,
__global
const
uchar
*
src2ptr,
int
src2_step,
int
src2_offset
#
else
#
define
EXTRA_PARAMS
,
__global
const
uchar
*
mask,
int
mask_step,
int
mask_offset
#
endif
#
else
#
ifdef
HAVE_SRC2
#
define
EXTRA_PARAMS
,
__global
const
uchar
*
src2ptr,
int
src2_step,
int
src2_offset
#
else
#
define
EXTRA_PARAMS
#
endif
#
endif
//
accumulative
reduction
stuff
#
if
defined
OP_SUM
|
| defined OP_SUM_ABS || defined OP_SUM_SQR |
|
defined
OP_DOT
#
ifdef
OP_DOT
#
if
ddepth
<=
4
#
define
FUNC
(
a,
b,
c
)
a
=
mad24
(
b,
c,
a
)
...
...
@@ -137,18 +146,48 @@
#
endif
#
endif
#
ifdef
OP_CALC2
#
define
DECLARE_LOCAL_MEM
\
__local
dstT
localmem[WGS2_ALIGNED]
; \
__local
dstT
localmem2[WGS2_ALIGNED]
#
define
DEFINE_ACCUMULATOR
\
dstT
accumulator
=
(
dstT
)(
0
)
; \
dstT
accumulator2
=
(
dstT
)(
0
)
#
else
#
define
DECLARE_LOCAL_MEM
\
__local
dstT
localmem[WGS2_ALIGNED]
#
define
DEFINE_ACCUMULATOR
\
dstT
accumulator
=
(
dstT
)(
0
)
#
endif
#
ifdef
HAVE_SRC2
#
ifdef
OP_CALC2
#
define
PROCESS_ELEMS
\
dstT
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
-
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
dstT
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
-=
temp2
; \
temp
=
temp
>
(
dstT
)(
0
)
?
temp
:
-temp
; \
FUNC
(
accumulator2,
temp2
)
; \
FUNC
(
accumulator,
temp
)
#
else
#
define
PROCESS_ELEMS
\
dstT
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstT
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp
)
#
endif
#
else
#
define
PROCESS_ELEMS
\
dstT
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
FUNC
(
accumulator,
temp
)
#
endif
#
ifdef
HAVE_MASK
#
define
REDUCE_GLOBAL
\
MASK_INDEX
; \
if
(
mask[mask_index]
)
\
{
\
dstT
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
FUNC
(
accumulator,
temp
)
; \
PROCESS_ELEMS
; \
}
#
elif
defined
OP_DOT
...
...
@@ -211,7 +250,158 @@
FUNC
(
accumulator,
temp.sF,
temp2.sF
)
#
endif
#
else
#
else
//
sum
or
norm
with
2
args
#
ifdef
HAVE_SRC2
#
ifdef
OP_CALC2
//
norm
relative
#
if
kercn
==
1
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp
)
; \
FUNC
(
accumulator2,
temp2
)
#
elif
kercn
==
2
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp.s0
)
; \
FUNC
(
accumulator,
temp.s1
)
; \
FUNC
(
accumulator2,
temp2.s0
)
; \
FUNC
(
accumulator2,
temp2.s1
)
#
elif
kercn
==
4
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp.s0
)
; \
FUNC
(
accumulator,
temp.s1
)
; \
FUNC
(
accumulator,
temp.s2
)
; \
FUNC
(
accumulator,
temp.s3
)
; \
FUNC
(
accumulator2,
temp2.s0
)
; \
FUNC
(
accumulator2,
temp2.s1
)
; \
FUNC
(
accumulator2,
temp2.s2
)
; \
FUNC
(
accumulator2,
temp2.s3
)
#
elif
kercn
==
8
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp.s0
)
; \
FUNC
(
accumulator,
temp.s1
)
; \
FUNC
(
accumulator,
temp.s2
)
; \
FUNC
(
accumulator,
temp.s3
)
; \
FUNC
(
accumulator,
temp.s4
)
; \
FUNC
(
accumulator,
temp.s5
)
; \
FUNC
(
accumulator,
temp.s6
)
; \
FUNC
(
accumulator,
temp.s7
)
; \
FUNC
(
accumulator2,
temp2.s0
)
; \
FUNC
(
accumulator2,
temp2.s1
)
; \
FUNC
(
accumulator2,
temp2.s2
)
; \
FUNC
(
accumulator2,
temp2.s3
)
; \
FUNC
(
accumulator2,
temp2.s4
)
; \
FUNC
(
accumulator2,
temp2.s5
)
; \
FUNC
(
accumulator2,
temp2.s6
)
; \
FUNC
(
accumulator2,
temp2.s7
)
#
elif
kercn
==
16
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp.s0
)
; \
FUNC
(
accumulator,
temp.s1
)
; \
FUNC
(
accumulator,
temp.s2
)
; \
FUNC
(
accumulator,
temp.s3
)
; \
FUNC
(
accumulator,
temp.s4
)
; \
FUNC
(
accumulator,
temp.s5
)
; \
FUNC
(
accumulator,
temp.s6
)
; \
FUNC
(
accumulator,
temp.s7
)
; \
FUNC
(
accumulator,
temp.s8
)
; \
FUNC
(
accumulator,
temp.s9
)
; \
FUNC
(
accumulator,
temp.sA
)
; \
FUNC
(
accumulator,
temp.sB
)
; \
FUNC
(
accumulator,
temp.sC
)
; \
FUNC
(
accumulator,
temp.sD
)
; \
FUNC
(
accumulator,
temp.sE
)
; \
FUNC
(
accumulator,
temp.sF
)
; \
FUNC
(
accumulator2,
temp2.s0
)
; \
FUNC
(
accumulator2,
temp2.s1
)
; \
FUNC
(
accumulator2,
temp2.s2
)
; \
FUNC
(
accumulator2,
temp2.s3
)
; \
FUNC
(
accumulator2,
temp2.s4
)
; \
FUNC
(
accumulator2,
temp2.s5
)
; \
FUNC
(
accumulator2,
temp2.s6
)
; \
FUNC
(
accumulator2,
temp2.s7
)
; \
FUNC
(
accumulator2,
temp2.s8
)
; \
FUNC
(
accumulator2,
temp2.s9
)
; \
FUNC
(
accumulator2,
temp2.sA
)
; \
FUNC
(
accumulator2,
temp2.sB
)
; \
FUNC
(
accumulator2,
temp2.sC
)
; \
FUNC
(
accumulator2,
temp2.sD
)
; \
FUNC
(
accumulator2,
temp2.sE
)
; \
FUNC
(
accumulator2,
temp2.sF
)
#
endif
#
else
//
norm
with
2
args
#
if
kercn
==
1
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp
)
#
elif
kercn
==
2
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp.s0
)
; \
FUNC
(
accumulator,
temp.s1
)
#
elif
kercn
==
4
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp.s0
)
; \
FUNC
(
accumulator,
temp.s1
)
; \
FUNC
(
accumulator,
temp.s2
)
; \
FUNC
(
accumulator,
temp.s3
)
#
elif
kercn
==
8
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp.s0
)
; \
FUNC
(
accumulator,
temp.s1
)
; \
FUNC
(
accumulator,
temp.s2
)
; \
FUNC
(
accumulator,
temp.s3
)
; \
FUNC
(
accumulator,
temp.s4
)
; \
FUNC
(
accumulator,
temp.s5
)
; \
FUNC
(
accumulator,
temp.s6
)
; \
FUNC
(
accumulator,
temp.s7
)
#
elif
kercn
==
16
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
dstTK
temp2
=
convertToDT
(
loadpix
(
src2ptr
+
src2_index
))
; \
temp
=
temp
>
temp2
?
temp
-
temp2
:
(
temp2
-
temp
)
; \
FUNC
(
accumulator,
temp.s0
)
; \
FUNC
(
accumulator,
temp.s1
)
; \
FUNC
(
accumulator,
temp.s2
)
; \
FUNC
(
accumulator,
temp.s3
)
; \
FUNC
(
accumulator,
temp.s4
)
; \
FUNC
(
accumulator,
temp.s5
)
; \
FUNC
(
accumulator,
temp.s6
)
; \
FUNC
(
accumulator,
temp.s7
)
; \
FUNC
(
accumulator,
temp.s8
)
; \
FUNC
(
accumulator,
temp.s9
)
; \
FUNC
(
accumulator,
temp.sA
)
; \
FUNC
(
accumulator,
temp.sB
)
; \
FUNC
(
accumulator,
temp.sC
)
; \
FUNC
(
accumulator,
temp.sD
)
; \
FUNC
(
accumulator,
temp.sE
)
; \
FUNC
(
accumulator,
temp.sF
)
#
endif
#
endif
#
else
//
sum
#
if
kercn
==
1
#
define
REDUCE_GLOBAL
\
dstTK
temp
=
convertToDT
(
loadpix
(
srcptr
+
src_index
))
; \
...
...
@@ -260,6 +450,7 @@
FUNC
(
accumulator,
temp.sF
)
#
endif
#
endif
#
endif
#
define
SET_LOCAL_1
\
localmem[lid]
=
accumulator
...
...
@@ -325,6 +516,20 @@
accumulator
+=
value.sF
==
zero
?
zero
:
one
#
endif
#
ifdef
OP_CALC2
#
define
SET_LOCAL_1
\
localmem[lid]
=
accumulator
; \
localmem2[lid]
=
accumulator2
; \
#
define
REDUCE_LOCAL_1
\
localmem[lid
-
WGS2_ALIGNED]
+=
accumulator
; \
localmem2[lid
-
WGS2_ALIGNED]
+=
accumulator2
#
define
REDUCE_LOCAL_2
\
localmem[lid]
+=
localmem[lid2]
; \
localmem2[lid]
+=
localmem2[lid2]
#
define
CALC_RESULT
\
storepix
(
localmem[0],
dstptr
+
dstTSIZE
*
gid
)
; \
storepix
(
localmem2[0],
dstptr
+
mad24
(
groupnum,
srcTSIZE,
dstTSIZE
*
gid
))
#
else
#
define
SET_LOCAL_1
\
localmem[lid]
=
accumulator
#
define
REDUCE_LOCAL_1
\
...
...
@@ -333,6 +538,7 @@
localmem[lid]
+=
localmem[lid2]
#
define
CALC_RESULT
\
storepix
(
localmem[0],
dstptr
+
dstTSIZE
*
gid
)
#
endif
//
norm
(
NORM_INF
)
with
cn
>
1
and
mask
#
elif
defined
OP_NORM_INF_MASK
...
...
@@ -384,6 +590,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
int
src_index
=
mul24
(
id,
srcTSIZE
)
;
#
else
int
src_index
=
mad24
(
id
/
cols,
src_step,
mul24
(
id
%
cols,
srcTSIZE
))
;
#
endif
#
ifdef
HAVE_SRC2
#
ifdef
HAVE_SRC2_CONT
int
src2_index
=
mul24
(
id,
srcTSIZE
)
;
#
else
int
src2_index
=
mad24
(
id
/
cols,
src2_step,
mul24
(
id
%
cols,
srcTSIZE
))
;
#
endif
#
endif
REDUCE_GLOBAL
;
}
...
...
modules/core/src/stat.cpp
View file @
20409958
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment