Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
a350b767
Commit
a350b767
authored
Jul 09, 2014
by
Ilya Lavrenov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
optimization of cv::accumulate**
parent
4b84d5d2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
59 additions
and
26 deletions
+59
-26
accum.cpp
modules/imgproc/src/accum.cpp
+18
-10
accumulate.cl
modules/imgproc/src/opencl/accumulate.cl
+41
-16
No files found.
modules/imgproc/src/accum.cpp
View file @
a350b767
...
...
@@ -369,11 +369,17 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
CV_Assert
(
op_type
==
ACCUMULATE
||
op_type
==
ACCUMULATE_SQUARE
||
op_type
==
ACCUMULATE_PRODUCT
||
op_type
==
ACCUMULATE_WEIGHTED
);
int
stype
=
_src
.
type
(),
cn
=
CV_MAT_CN
(
stype
);
int
sdepth
=
CV_MAT_DEPTH
(
stype
),
ddepth
=
_dst
.
depth
();
bool
doubleSupport
=
ocl
::
Device
::
getDefault
().
doubleFPConfig
()
>
0
,
haveMask
=
!
_mask
.
empty
();
const
ocl
::
Device
&
dev
=
ocl
::
Device
::
getDefault
();
int
vectorWidths
[]
=
{
4
,
4
,
2
,
2
,
1
,
1
,
1
,
-
1
};
int
stype
=
_src
.
type
(),
sdepth
=
CV_MAT_DEPTH
(
stype
),
cn
=
CV_MAT_CN
(
stype
),
ddepth
=
_dst
.
depth
();
int
pcn
=
std
::
max
(
vectorWidths
[
sdepth
],
vectorWidths
[
ddepth
]),
sesz
=
CV_ELEM_SIZE
(
sdepth
)
*
pcn
,
desz
=
CV_ELEM_SIZE
(
ddepth
)
*
pcn
,
rowsPerWI
=
dev
.
isIntel
()
?
4
:
1
;
bool
doubleSupport
=
dev
.
doubleFPConfig
()
>
0
,
haveMask
=
!
_mask
.
empty
(),
usepcn
=
_src
.
offset
()
%
sesz
==
0
&&
_src
.
step
()
%
sesz
==
0
&&
(
_src
.
cols
()
*
cn
)
%
pcn
==
0
&&
_src2
.
offset
()
%
desz
==
0
&&
_src2
.
step
()
%
desz
==
0
&&
_dst
.
offset
()
%
pcn
==
0
&&
_dst
.
step
()
%
desz
==
0
&&
!
haveMask
;
int
kercn
=
usepcn
?
pcn
:
haveMask
?
cn
:
1
;
if
(
!
doubleSupport
&&
(
sdepth
==
CV_64F
||
ddepth
==
CV_64F
))
return
false
;
...
...
@@ -381,11 +387,13 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
const
char
*
const
opMap
[
4
]
=
{
"ACCUMULATE"
,
"ACCUMULATE_SQUARE"
,
"ACCUMULATE_PRODUCT"
,
"ACCUMULATE_WEIGHTED"
};
char
cvt
[
40
];
ocl
::
Kernel
k
(
"accumulate"
,
ocl
::
imgproc
::
accumulate_oclsrc
,
format
(
"-D %s%s -D srcT
=%s -D cn=%d -D dstT=%s
%s"
,
format
(
"-D %s%s -D srcT
1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=
%s"
,
opMap
[
op_type
],
haveMask
?
" -D HAVE_MASK"
:
""
,
ocl
::
typeToStr
(
sdepth
),
cn
,
ocl
::
typeToStr
(
ddepth
),
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
));
ocl
::
typeToStr
(
sdepth
),
kercn
,
ocl
::
typeToStr
(
ddepth
),
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
,
rowsPerWI
,
ocl
::
convertTypeStr
(
sdepth
,
ddepth
,
1
,
cvt
)));
if
(
k
.
empty
())
return
false
;
...
...
@@ -393,7 +401,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
ocl
::
KernelArg
srcarg
=
ocl
::
KernelArg
::
ReadOnlyNoSize
(
src
),
src2arg
=
ocl
::
KernelArg
::
ReadOnlyNoSize
(
src2
),
dstarg
=
ocl
::
KernelArg
::
ReadWrite
(
dst
),
dstarg
=
ocl
::
KernelArg
::
ReadWrite
(
dst
,
cn
,
kercn
),
maskarg
=
ocl
::
KernelArg
::
ReadOnlyNoSize
(
mask
);
int
argidx
=
k
.
set
(
0
,
srcarg
);
...
...
@@ -410,7 +418,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
if
(
haveMask
)
k
.
set
(
argidx
,
maskarg
);
size_t
globalsize
[
2
]
=
{
src
.
cols
,
src
.
rows
};
size_t
globalsize
[
2
]
=
{
src
.
cols
*
cn
/
kercn
,
(
src
.
rows
+
rowsPerWI
-
1
)
/
rowsPerWI
};
return
k
.
run
(
2
,
globalsize
,
NULL
,
false
);
}
...
...
modules/imgproc/src/opencl/accumulate.cl
View file @
a350b767
...
...
@@ -13,13 +13,18 @@
#
endif
#
endif
#
define
SRC_TSIZE
cn
*
(
int
)
sizeof
(
srcT1
)
#
define
DST_TSIZE
cn
*
(
int
)
sizeof
(
dstT1
)
#
define
noconvert
__kernel
void
accumulate
(
__global
const
uchar
*
srcptr,
int
src_step,
int
src_offset,
#
ifdef
ACCUMULATE_PRODUCT
__global
const
uchar
*
src2ptr,
int
src2_step,
int
src2_offset,
#
endif
__global
uchar
*
dstptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols
#
ifdef
ACCUMULATE_WEIGHTED
,
dstT
alpha
,
dstT
1
alpha
#
endif
#
ifdef
HAVE_MASK
,
__global
const
uchar
*
mask,
int
mask_step,
int
mask_offset
...
...
@@ -27,39 +32,59 @@ __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_of
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
*
rowsPerWI
;
if
(
x
<
dst_cols
&&
y
<
dst_rows
)
if
(
x
<
dst_cols
)
{
int
src_index
=
mad24
(
y,
src_step,
src_offset
+
x
*
cn
*
(
int
)
sizeof
(
srcT
))
;
int
src_index
=
mad24
(
y,
src_step,
mad24
(
x,
SRC_TSIZE,
src_offset
))
;
#
ifdef
HAVE_MASK
int
mask_index
=
mad24
(
y,
mask_step,
mask_offset
+
x
)
;
mask
+=
mask_index
;
#
endif
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
*
cn
*
(
int
)
sizeof
(
dstT
))
;
__global
const
srcT
*
src
=
(
__global
const
srcT
*
)(
srcptr
+
src_index
)
;
#
ifdef
ACCUMULATE_PRODUCT
int
src2_index
=
mad24
(
y,
src2_step,
src2_offset
+
x
*
cn
*
(
int
)
sizeof
(
srcT
))
;
__global
const
srcT
*
src2
=
(
__global
const
srcT
*
)(
src2ptr
+
src2_index
)
;
int
src2_index
=
mad24
(
y,
src2_step,
mad24
(
x,
SRC_TSIZE,
src2_offset
))
;
#
endif
__global
dstT
*
dst
=
(
__global
dstT
*
)(
dstptr
+
dst_index
)
;
int
dst_index
=
mad24
(
y,
dst_step,
mad24
(
x,
DST_TSIZE,
dst_offset
)
)
;
#
pragma
unroll
for
(
int
c
=
0
; c < cn; ++c)
for
(
int
i
=
0
; i < rowsPerWI; ++i)
if
(
y
<
dst_rows
)
{
__global
const
srcT1
*
src
=
(
__global
const
srcT1
*
)(
srcptr
+
src_index
)
;
#
ifdef
ACCUMULATE_PRODUCT
__global
const
srcT1
*
src2
=
(
__global
const
srcT1
*
)(
src2ptr
+
src2_index
)
;
#
endif
__global
dstT1
*
dst
=
(
__global
dstT1
*
)(
dstptr
+
dst_index
)
;
#
ifdef
HAVE_MASK
if
(
mask[0]
)
if
(
mask[0]
)
#
endif
#
pragma
unroll
for
(
int
c
=
0
; c < cn; ++c)
{
#
ifdef
ACCUMULATE
dst[c]
+=
src[c]
;
dst[c]
+=
convertToDT
(
src[c]
)
;
#
elif
defined
ACCUMULATE_SQUARE
dst[c]
+=
src[c]
*
src[c]
;
dstT1
val
=
convertToDT
(
src[c]
)
;
dst[c]
=
fma
(
val,
val,
dst[c]
)
;
#
elif
defined
ACCUMULATE_PRODUCT
dst[c]
+=
src[c]
*
src2[c]
;
dst[c]
=
fma
(
convertToDT
(
src[c]
)
,
convertToDT
(
src2[c]
)
,
dst[c]
)
;
#
elif
defined
ACCUMULATE_WEIGHTED
dst[c]
=
(
1
-
alpha
)
*
dst[c]
+
src[c]
*
alpha
;
dst[c]
=
fma
(
1
-
alpha,
dst[c],
src[c]
*
alpha
)
;
#
else
#
error
"Unknown accumulation type"
#
endif
}
src_index
+=
src_step
;
#
ifdef
ACCUMULATE_PRODUCT
src2_index
+=
src2_step
;
#
endif
#
ifdef
HAVE_MASK
mask
+=
mask_step
;
#
endif
dst_index
+=
dst_step
;
++y
;
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment