Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
b332152b
Commit
b332152b
authored
Aug 28, 2014
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #2956 from ilya-lavrenov:tapi_accumulate
parents
d6681597
7017b125
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
99 additions
and
52 deletions
+99
-52
ocl.hpp
modules/core/include/opencv2/core/ocl.hpp
+21
-1
ocl.cpp
modules/core/src/ocl.cpp
+26
-25
accum.cpp
modules/imgproc/src/accum.cpp
+11
-10
accumulate.cl
modules/imgproc/src/opencl/accumulate.cl
+41
-16
No files found.
modules/core/include/opencv2/core/ocl.hpp
View file @
b332152b
...
...
@@ -598,9 +598,29 @@ CV_EXPORTS const char* typeToStr(int t);
CV_EXPORTS
const
char
*
memopTypeToStr
(
int
t
);
CV_EXPORTS
String
kernelToStr
(
InputArray
_kernel
,
int
ddepth
=
-
1
,
const
char
*
name
=
NULL
);
CV_EXPORTS
void
getPlatfomsInfo
(
std
::
vector
<
PlatformInfo
>&
platform_info
);
enum
OclVectorStrategy
{
// all matrices have its own vector width
OCL_VECTOR_OWN
=
0
,
// all matrices have maximal vector width among all matrices
// (useful for cases when matrices have different data types)
OCL_VECTOR_MAX
=
1
,
// default strategy
OCL_VECTOR_DEFAULT
=
OCL_VECTOR_OWN
};
CV_EXPORTS
int
predictOptimalVectorWidth
(
InputArray
src1
,
InputArray
src2
=
noArray
(),
InputArray
src3
=
noArray
(),
InputArray
src4
=
noArray
(),
InputArray
src5
=
noArray
(),
InputArray
src6
=
noArray
(),
InputArray
src7
=
noArray
(),
InputArray
src8
=
noArray
(),
InputArray
src9
=
noArray
());
InputArray
src7
=
noArray
(),
InputArray
src8
=
noArray
(),
InputArray
src9
=
noArray
(),
OclVectorStrategy
strat
=
OCL_VECTOR_DEFAULT
);
// with OCL_VECTOR_MAX strategy
CV_EXPORTS
int
predictOptimalVectorWidthMax
(
InputArray
src1
,
InputArray
src2
=
noArray
(),
InputArray
src3
=
noArray
(),
InputArray
src4
=
noArray
(),
InputArray
src5
=
noArray
(),
InputArray
src6
=
noArray
(),
InputArray
src7
=
noArray
(),
InputArray
src8
=
noArray
(),
InputArray
src9
=
noArray
());
CV_EXPORTS
void
buildOptionsAddMatrixDescription
(
String
&
buildOptions
,
const
String
&
name
,
InputArray
_m
);
...
...
modules/core/src/ocl.cpp
View file @
b332152b
...
...
@@ -4451,42 +4451,46 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
if (!src.empty()) \
{ \
CV_Assert(src.isMat() || src.isUMat()); \
int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
Size csize = src.size(); \
cols.push_back(ccn * csize.width); \
if (ctype != type) \
int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
if (cwidth < ckercn || ckercn <= 0) \
return 1; \
cols.push_back(cwidth); \
if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
return 1; \
offsets.push_back(src.offset()); \
steps.push_back(src.step()); \
dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
kercns.push_back(ckercn); \
} \
} \
while ((void)0, 0)
int
predictOptimalVectorWidth
(
InputArray
src1
,
InputArray
src2
,
InputArray
src3
,
InputArray
src4
,
InputArray
src5
,
InputArray
src6
,
InputArray
src7
,
InputArray
src8
,
InputArray
src9
)
InputArray
src7
,
InputArray
src8
,
InputArray
src9
,
OclVectorStrategy
strat
)
{
int
type
=
src1
.
type
(),
depth
=
CV_MAT_DEPTH
(
type
),
cn
=
CV_MAT_CN
(
type
),
esz1
=
CV_ELEM_SIZE1
(
depth
);
Size
ssize
=
src1
.
size
();
const
ocl
::
Device
&
d
=
ocl
::
Device
::
getDefault
();
int
ref_type
=
src1
.
type
();
int
vectorWidths
[]
=
{
d
.
preferredVectorWidthChar
(),
d
.
preferredVectorWidthChar
(),
d
.
preferredVectorWidthShort
(),
d
.
preferredVectorWidthShort
(),
d
.
preferredVectorWidthInt
(),
d
.
preferredVectorWidthFloat
(),
d
.
preferredVectorWidthDouble
(),
-
1
}
,
kercn
=
vectorWidths
[
depth
]
;
d
.
preferredVectorWidthDouble
(),
-
1
};
// if the device says don't use vectors
if
(
vectorWidths
[
0
]
==
1
)
{
// it's heuristic
int
vectorWidthsOthers
[]
=
{
16
,
16
,
8
,
8
,
1
,
1
,
1
,
-
1
};
kercn
=
vectorWidthsOthers
[
depth
];
vectorWidths
[
CV_8U
]
=
vectorWidths
[
CV_8S
]
=
16
;
vectorWidths
[
CV_16U
]
=
vectorWidths
[
CV_16S
]
=
8
;
vectorWidths
[
CV_32S
]
=
vectorWidths
[
CV_32F
]
=
vectorWidths
[
CV_64F
]
=
1
;
}
if
(
ssize
.
width
*
cn
<
kercn
||
kercn
<=
0
)
return
1
;
std
::
vector
<
size_t
>
offsets
,
steps
,
cols
;
std
::
vector
<
int
>
dividers
,
kercns
;
PROCESS_SRC
(
src1
);
PROCESS_SRC
(
src2
);
PROCESS_SRC
(
src3
);
...
...
@@ -4498,27 +4502,24 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
PROCESS_SRC
(
src9
);
size_t
size
=
offsets
.
size
();
int
wsz
=
kercn
*
esz1
;
std
::
vector
<
int
>
dividers
(
size
,
wsz
);
for
(
size_t
i
=
0
;
i
<
size
;
++
i
)
while
(
offsets
[
i
]
%
dividers
[
i
]
!=
0
||
steps
[
i
]
%
dividers
[
i
]
!=
0
||
cols
[
i
]
%
divider
s
[
i
]
!=
0
)
dividers
[
i
]
>>=
1
;
while
(
offsets
[
i
]
%
dividers
[
i
]
!=
0
||
steps
[
i
]
%
dividers
[
i
]
!=
0
||
cols
[
i
]
%
kercn
s
[
i
]
!=
0
)
dividers
[
i
]
>>=
1
,
kercns
[
i
]
>>=
1
;
// default strategy
for
(
size_t
i
=
0
;
i
<
size
;
++
i
)
if
(
dividers
[
i
]
!=
wsz
)
{
kercn
=
1
;
break
;
}
// another strategy
// width = *std::min_element(dividers.begin(), dividers.end());
int
kercn
=
*
std
::
min_element
(
kercns
.
begin
(),
kercns
.
end
());
return
kercn
;
}
int
predictOptimalVectorWidthMax
(
InputArray
src1
,
InputArray
src2
,
InputArray
src3
,
InputArray
src4
,
InputArray
src5
,
InputArray
src6
,
InputArray
src7
,
InputArray
src8
,
InputArray
src9
)
{
return
predictOptimalVectorWidth
(
src1
,
src2
,
src3
,
src4
,
src5
,
src6
,
src7
,
src8
,
src9
,
OCL_VECTOR_MAX
);
}
#undef PROCESS_SRC
...
...
modules/imgproc/src/accum.cpp
View file @
b332152b
...
...
@@ -369,11 +369,10 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
CV_Assert
(
op_type
==
ACCUMULATE
||
op_type
==
ACCUMULATE_SQUARE
||
op_type
==
ACCUMULATE_PRODUCT
||
op_type
==
ACCUMULATE_WEIGHTED
);
int
stype
=
_src
.
type
(),
cn
=
CV_MAT_CN
(
stype
);
int
sdepth
=
CV_MAT_DEPTH
(
stype
),
ddepth
=
_dst
.
depth
();
bool
doubleSupport
=
ocl
::
Device
::
getDefault
().
doubleFPConfig
()
>
0
,
haveMask
=
!
_mask
.
empty
();
const
ocl
::
Device
&
dev
=
ocl
::
Device
::
getDefault
();
bool
haveMask
=
!
_mask
.
empty
(),
doubleSupport
=
dev
.
doubleFPConfig
()
>
0
;
int
stype
=
_src
.
type
(),
sdepth
=
CV_MAT_DEPTH
(
stype
),
cn
=
CV_MAT_CN
(
stype
),
ddepth
=
_dst
.
depth
();
int
kercn
=
haveMask
?
cn
:
ocl
::
predictOptimalVectorWidthMax
(
_src
,
_src2
,
_dst
),
rowsPerWI
=
dev
.
isIntel
()
?
4
:
1
;
if
(
!
doubleSupport
&&
(
sdepth
==
CV_64F
||
ddepth
==
CV_64F
))
return
false
;
...
...
@@ -381,11 +380,13 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
const
char
*
const
opMap
[
4
]
=
{
"ACCUMULATE"
,
"ACCUMULATE_SQUARE"
,
"ACCUMULATE_PRODUCT"
,
"ACCUMULATE_WEIGHTED"
};
char
cvt
[
40
];
ocl
::
Kernel
k
(
"accumulate"
,
ocl
::
imgproc
::
accumulate_oclsrc
,
format
(
"-D %s%s -D srcT
=%s -D cn=%d -D dstT=%s
%s"
,
format
(
"-D %s%s -D srcT
1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=
%s"
,
opMap
[
op_type
],
haveMask
?
" -D HAVE_MASK"
:
""
,
ocl
::
typeToStr
(
sdepth
),
cn
,
ocl
::
typeToStr
(
ddepth
),
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
));
ocl
::
typeToStr
(
sdepth
),
kercn
,
ocl
::
typeToStr
(
ddepth
),
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
,
rowsPerWI
,
ocl
::
convertTypeStr
(
sdepth
,
ddepth
,
1
,
cvt
)));
if
(
k
.
empty
())
return
false
;
...
...
@@ -393,7 +394,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
ocl
::
KernelArg
srcarg
=
ocl
::
KernelArg
::
ReadOnlyNoSize
(
src
),
src2arg
=
ocl
::
KernelArg
::
ReadOnlyNoSize
(
src2
),
dstarg
=
ocl
::
KernelArg
::
ReadWrite
(
dst
),
dstarg
=
ocl
::
KernelArg
::
ReadWrite
(
dst
,
cn
,
kercn
),
maskarg
=
ocl
::
KernelArg
::
ReadOnlyNoSize
(
mask
);
int
argidx
=
k
.
set
(
0
,
srcarg
);
...
...
@@ -410,7 +411,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
if
(
haveMask
)
k
.
set
(
argidx
,
maskarg
);
size_t
globalsize
[
2
]
=
{
src
.
cols
,
src
.
rows
};
size_t
globalsize
[
2
]
=
{
src
.
cols
*
cn
/
kercn
,
(
src
.
rows
+
rowsPerWI
-
1
)
/
rowsPerWI
};
return
k
.
run
(
2
,
globalsize
,
NULL
,
false
);
}
...
...
modules/imgproc/src/opencl/accumulate.cl
View file @
b332152b
...
...
@@ -13,13 +13,18 @@
#
endif
#
endif
#
define
SRC_TSIZE
cn
*
(
int
)
sizeof
(
srcT1
)
#
define
DST_TSIZE
cn
*
(
int
)
sizeof
(
dstT1
)
#
define
noconvert
__kernel
void
accumulate
(
__global
const
uchar
*
srcptr,
int
src_step,
int
src_offset,
#
ifdef
ACCUMULATE_PRODUCT
__global
const
uchar
*
src2ptr,
int
src2_step,
int
src2_offset,
#
endif
__global
uchar
*
dstptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols
#
ifdef
ACCUMULATE_WEIGHTED
,
dstT
alpha
,
dstT
1
alpha
#
endif
#
ifdef
HAVE_MASK
,
__global
const
uchar
*
mask,
int
mask_step,
int
mask_offset
...
...
@@ -27,39 +32,59 @@ __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_of
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
*
rowsPerWI
;
if
(
x
<
dst_cols
&&
y
<
dst_rows
)
if
(
x
<
dst_cols
)
{
int
src_index
=
mad24
(
y,
src_step,
src_offset
+
x
*
cn
*
(
int
)
sizeof
(
srcT
))
;
int
src_index
=
mad24
(
y,
src_step,
mad24
(
x,
SRC_TSIZE,
src_offset
))
;
#
ifdef
HAVE_MASK
int
mask_index
=
mad24
(
y,
mask_step,
mask_offset
+
x
)
;
mask
+=
mask_index
;
#
endif
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
*
cn
*
(
int
)
sizeof
(
dstT
))
;
__global
const
srcT
*
src
=
(
__global
const
srcT
*
)(
srcptr
+
src_index
)
;
#
ifdef
ACCUMULATE_PRODUCT
int
src2_index
=
mad24
(
y,
src2_step,
src2_offset
+
x
*
cn
*
(
int
)
sizeof
(
srcT
))
;
__global
const
srcT
*
src2
=
(
__global
const
srcT
*
)(
src2ptr
+
src2_index
)
;
int
src2_index
=
mad24
(
y,
src2_step,
mad24
(
x,
SRC_TSIZE,
src2_offset
))
;
#
endif
__global
dstT
*
dst
=
(
__global
dstT
*
)(
dstptr
+
dst_index
)
;
int
dst_index
=
mad24
(
y,
dst_step,
mad24
(
x,
DST_TSIZE,
dst_offset
)
)
;
#
pragma
unroll
for
(
int
c
=
0
; c < cn; ++c)
for
(
int
i
=
0
; i < rowsPerWI; ++i)
if
(
y
<
dst_rows
)
{
__global
const
srcT1
*
src
=
(
__global
const
srcT1
*
)(
srcptr
+
src_index
)
;
#
ifdef
ACCUMULATE_PRODUCT
__global
const
srcT1
*
src2
=
(
__global
const
srcT1
*
)(
src2ptr
+
src2_index
)
;
#
endif
__global
dstT1
*
dst
=
(
__global
dstT1
*
)(
dstptr
+
dst_index
)
;
#
ifdef
HAVE_MASK
if
(
mask[0]
)
if
(
mask[0]
)
#
endif
#
pragma
unroll
for
(
int
c
=
0
; c < cn; ++c)
{
#
ifdef
ACCUMULATE
dst[c]
+=
src[c]
;
dst[c]
+=
convertToDT
(
src[c]
)
;
#
elif
defined
ACCUMULATE_SQUARE
dst[c]
+=
src[c]
*
src[c]
;
dstT1
val
=
convertToDT
(
src[c]
)
;
dst[c]
=
fma
(
val,
val,
dst[c]
)
;
#
elif
defined
ACCUMULATE_PRODUCT
dst[c]
+=
src[c]
*
src2[c]
;
dst[c]
=
fma
(
convertToDT
(
src[c]
)
,
convertToDT
(
src2[c]
)
,
dst[c]
)
;
#
elif
defined
ACCUMULATE_WEIGHTED
dst[c]
=
(
1
-
alpha
)
*
dst[c]
+
src[c]
*
alpha
;
dst[c]
=
fma
(
1
-
alpha,
dst[c],
src[c]
*
alpha
)
;
#
else
#
error
"Unknown accumulation type"
#
endif
}
src_index
+=
src_step
;
#
ifdef
ACCUMULATE_PRODUCT
src2_index
+=
src2_step
;
#
endif
#
ifdef
HAVE_MASK
mask
+=
mask_step
;
#
endif
dst_index
+=
dst_step
;
++y
;
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment