Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
adc15c2b
Commit
adc15c2b
authored
Mar 13, 2014
by
Ilya Lavrenov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
improved performance of cv::ocl::sum
parent
3ebfe600
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
26 additions
and
28 deletions
+26
-28
arithm.cpp
modules/ocl/src/arithm.cpp
+21
-21
gftt.cpp
modules/ocl/src/gftt.cpp
+1
-1
arithm_sum.cl
modules/ocl/src/opencl/arithm_sum.cl
+4
-6
No files found.
modules/ocl/src/arithm.cpp
View file @
adc15c2b
...
...
@@ -313,32 +313,28 @@ void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int
enum
{
SUM
=
0
,
ABS_SUM
,
SQR_SUM
};
static
void
arithmetic_sum_buffer_run
(
const
oclMat
&
src
,
cl_mem
&
dst
,
int
groupnum
,
int
type
,
int
ddepth
)
static
void
arithmetic_sum_buffer_run
(
const
oclMat
&
src
,
cl_mem
&
dst
,
int
groupnum
,
int
type
,
int
ddepth
,
int
vlen
)
{
int
ochannels
=
src
.
oclchannels
();
int
all_cols
=
src
.
step
/
src
.
elemSize
();
int
pre_cols
=
(
src
.
offset
%
src
.
step
)
/
src
.
elemSize
();
int
sec_cols
=
all_cols
-
(
src
.
offset
%
src
.
step
+
src
.
cols
*
src
.
elemSize
()
-
1
)
/
src
.
elemSize
()
-
1
;
int
invalid_cols
=
pre_cols
+
sec_cols
;
int
cols
=
all_cols
-
invalid_cols
,
elemnum
=
cols
*
src
.
rows
;;
int
offset
=
src
.
offset
/
src
.
elemSize
();
int
vElemSize
=
vlen
*
src
.
elemSize
();
int
src_offset
=
src
.
offset
/
vElemSize
,
src_step
=
src
.
step
/
vElemSize
;
int
src_cols
=
src
.
cols
/
vlen
,
total
=
src
.
size
().
area
()
/
vlen
;
vlen
*=
src
.
oclchannels
();
const
char
*
const
typeMap
[]
=
{
"uchar"
,
"char"
,
"ushort"
,
"short"
,
"int"
,
"float"
,
"double"
};
const
char
*
const
funcMap
[]
=
{
"FUNC_SUM"
,
"FUNC_ABS_SUM"
,
"FUNC_SQR_SUM"
};
const
char
*
const
channelMap
[]
=
{
" "
,
" "
,
"2"
,
"4"
,
"4"
};
const
char
*
const
channelMap
[]
=
{
" "
,
" "
,
"2"
,
"4"
,
"4"
,
""
,
""
,
""
,
"8"
};
string
buildOptions
=
format
(
"-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s -D %s"
,
typeMap
[
src
.
depth
()],
channelMap
[
ochannels
],
typeMap
[
ddepth
],
channelMap
[
ochannels
],
typeMap
[
ddepth
],
channelMap
[
ochannels
],
funcMap
[
type
]);
typeMap
[
src
.
depth
()],
channelMap
[
vlen
],
typeMap
[
ddepth
],
channelMap
[
vlen
],
typeMap
[
ddepth
],
channelMap
[
vlen
],
funcMap
[
type
]);
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
invalid_cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
elemnum
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
groupnum
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src_offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src_cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
total
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
groupnum
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst
));
size_t
globalThreads
[
3
]
=
{
groupnum
*
256
,
1
,
1
};
...
...
@@ -360,7 +356,11 @@ Scalar arithmetic_sum(const oclMat &src, int type, int ddepth)
size_t
groupnum
=
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
;
CV_Assert
(
groupnum
!=
0
);
int
dbsize
=
groupnum
*
src
.
oclchannels
();
int
vlen
=
8
/
src
.
channels
(),
vElemSize
=
vlen
*
src
.
elemSize1
();
while
(
src
.
offset
%
vElemSize
!=
0
||
src
.
step
%
vElemSize
!=
0
||
src
.
cols
%
vlen
!=
0
)
vlen
>>=
1
,
vElemSize
>>=
1
;
int
dbsize
=
groupnum
*
src
.
oclchannels
()
*
vlen
;
Context
*
clCxt
=
src
.
clCxt
;
AutoBuffer
<
T
>
_buf
(
dbsize
);
...
...
@@ -368,12 +368,12 @@ Scalar arithmetic_sum(const oclMat &src, int type, int ddepth)
memset
(
p
,
0
,
dbsize
*
sizeof
(
T
));
cl_mem
dstBuffer
=
openCLCreateBuffer
(
clCxt
,
CL_MEM_WRITE_ONLY
,
dbsize
*
sizeof
(
T
));
arithmetic_sum_buffer_run
(
src
,
dstBuffer
,
groupnum
,
type
,
ddepth
);
arithmetic_sum_buffer_run
(
src
,
dstBuffer
,
groupnum
,
type
,
ddepth
,
vlen
);
openCLReadBuffer
(
clCxt
,
dstBuffer
,
(
void
*
)
p
,
dbsize
*
sizeof
(
T
));
openCLFree
(
dstBuffer
);
Scalar
s
=
Scalar
::
all
(
0.0
);
for
(
int
i
=
0
;
i
<
dbsize
;)
for
(
int
i
=
0
;
i
<
dbsize
;
)
for
(
int
j
=
0
;
j
<
src
.
oclchannels
();
j
++
,
i
++
)
s
.
val
[
j
]
+=
p
[
i
];
...
...
modules/ocl/src/gftt.cpp
View file @
adc15c2b
...
...
@@ -158,8 +158,8 @@ static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero)
// first parallel pass
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src_offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src_offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
total
));
...
...
modules/ocl/src/opencl/arithm_sum.cl
View file @
adc15c2b
...
...
@@ -63,21 +63,19 @@
/**************************************Array
buffer
SUM**************************************/
__kernel
void
arithm_op_sum
(
int
cols,int
invalid_cols,int
offset,int
elemnum,int
groupnum
,
__global
srcT
*src,
__global
dstT
*
dst
)
__kernel
void
arithm_op_sum
(
__global
srcT
*
src,
int
src_step,
int
src_offset,
int
src_cols
,
int
total,
int
groupnum,
__global
dstT
*
dst
)
{
int
lid
=
get_local_id
(
0
)
;
int
gid
=
get_group_id
(
0
)
;
int
id
=
get_global_id
(
0
)
;
int
idx
=
offset
+
id
+
(
id
/
cols
)
*
invalid_cols
;
__local
dstT
localmem_sum[128]
;
dstT
sum
=
(
dstT
)(
0
)
,
temp
;
for
(
int
grainSize
=
groupnum
<<
8
; id <
elemnum
; id += grainSize)
for
(
int
grainSize
=
groupnum
<<
8
; id <
total
; id += grainSize)
{
idx
=
offset
+
id
+
(
id
/
cols
)
*
invalid_cols
;
temp
=
convertToDstT
(
src[idx]
)
;
temp
=
convertToDstT
(
src[mad24
(
id
/
src_cols,
src_step,
id
%
src_cols
+
src_offset
)
]
)
;
FUNC
(
temp,
sum
)
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment