Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
089cf423
Commit
089cf423
authored
Oct 30, 2013
by
Andrey Pavlenko
Committed by
OpenCV Buildbot
Oct 30, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1717 from alalek:ocl_adjust_worksize
parents
9751b320
7b0f018a
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
240 additions
and
184 deletions
+240
-184
util.hpp
modules/ocl/include/opencv2/ocl/private/util.hpp
+4
-0
cl_operations.cpp
modules/ocl/src/cl_operations.cpp
+16
-4
filtering.cpp
modules/ocl/src/filtering.cpp
+220
-180
No files found.
modules/ocl/include/opencv2/ocl/private/util.hpp
View file @
089cf423
...
...
@@ -103,7 +103,11 @@ CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
const
cv
::
ocl
::
ProgramEntry
*
source
,
std
::
string
kernelName
);
// Overload that takes a build-options string in addition to the program entry and
// the kernel name, and yields a cl_kernel handle.
// NOTE(review): the definition is not part of this diff; presumably it builds or looks
// up `kernelName` in `source` using `build_options` — confirm against the full source.
CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
                                               const cv::ocl::ProgramEntry* source,
                                               std::string kernelName,
                                               const char *build_options);
// Overload that additionally takes `channels` and `depth` and returns the cl_kernel
// handle to the caller (filtering.cpp uses the handle to query
// CL_KERNEL_WORK_GROUP_SIZE before launching).
// Per the visible part of its definition in cl_operations.cpp, it extends `kernelName`
// with a constructed suffix (the shown code streams "_D" and `depth` into it), passes
// `build_options` through removeDuplicatedWhiteSpaces(), and returns the kernel
// obtained from the build-options overload of openCLGetKernelFromSource.
// NOTE(review): how `channels` contributes to the name and how negative values are
// handled is elided in the diff — confirm against the full source.
CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx,
                                               const cv::ocl::ProgramEntry* source,
                                               string kernelName,
                                               int channels,
                                               int depth,
                                               const char *build_options);
// NOTE(review): the definition is not shown in this diff. Judging only by the name and
// signature, this presumably checks `localThreads` against what `kernel` supports on
// the context's device — confirm before relying on it.
CV_EXPORTS void openCLVerifyKernel(const Context *clCxt,
                                   cl_kernel kernel,
                                   size_t *localThreads);
// Launches an already-obtained `kernel` with the given work sizes and arguments.
// `args` is a list of (size in bytes, pointer to value) pairs.
// From the visible part of the definition: when `localThreads` is non-NULL,
// globalThreads[0] is overwritten with roundUp(globalThreads[0], localThreads[0]),
// so the caller's array is modified.
// Callers in filtering.cpp note that the kernel is released by this call
// ("kernel will be released here"), so the handle should not be reused afterwards.
CV_EXPORTS void openCLExecuteKernel(Context *ctx,
                                    cl_kernel kernel,
                                    size_t globalThreads[3],
                                    size_t localThreads[3],
                                    std::vector< std::pair<size_t, const void *> > &args);
// Overload that identifies the kernel by `kernelName` within `source` and takes the
// global size as `globalcols` x `globalrows`.
// Defaults: blockSize = 16, kernel_expand_depth = -1, kernel_expand_channel = -1.
// NOTE(review): the definition is outside this diff; the exact meaning of the -1
// defaults and how `blockSize` maps to the local work size should be confirmed there.
CV_EXPORTS void openCLExecuteKernel(Context *clCxt,
                                    const cv::ocl::ProgramEntry* source,
                                    string kernelName,
                                    std::vector< std::pair<size_t, const void *> > &args,
                                    int globalcols,
                                    int globalrows,
                                    size_t blockSize = 16,
                                    int kernel_expand_depth = -1,
                                    int kernel_expand_channel = -1);
CV_EXPORTS
void
openCLExecuteKernel_
(
Context
*
clCxt
,
const
cv
::
ocl
::
ProgramEntry
*
source
,
std
::
string
kernelName
,
...
...
modules/ocl/src/cl_operations.cpp
View file @
089cf423
...
...
@@ -336,8 +336,7 @@ static std::string removeDuplicatedWhiteSpaces(const char * buildOptions)
return
opt
;
}
void
openCLExecuteKernel_
(
Context
*
ctx
,
const
cv
::
ocl
::
ProgramEntry
*
source
,
string
kernelName
,
size_t
globalThreads
[
3
],
size_t
localThreads
[
3
],
vector
<
pair
<
size_t
,
const
void
*>
>
&
args
,
int
channels
,
cl_kernel
openCLGetKernelFromSource
(
Context
*
ctx
,
const
cv
::
ocl
::
ProgramEntry
*
source
,
string
kernelName
,
int
channels
,
int
depth
,
const
char
*
build_options
)
{
//construct kernel name
...
...
@@ -350,10 +349,14 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str
idxStr
<<
"_D"
<<
depth
;
kernelName
+=
idxStr
.
str
();
cl_kernel
kernel
;
std
::
string
fixedOptions
=
removeDuplicatedWhiteSpaces
(
build_options
);
kernel
=
openCLGetKernelFromSource
(
ctx
,
source
,
kernelName
,
fixedOptions
.
c_str
());
cl_kernel
kernel
=
openCLGetKernelFromSource
(
ctx
,
source
,
kernelName
,
fixedOptions
.
c_str
());
return
kernel
;
}
void
openCLExecuteKernel
(
Context
*
ctx
,
cl_kernel
kernel
,
size_t
globalThreads
[
3
],
size_t
localThreads
[
3
],
vector
<
pair
<
size_t
,
const
void
*>
>
&
args
)
{
if
(
localThreads
!=
NULL
)
{
globalThreads
[
0
]
=
roundUp
(
globalThreads
[
0
],
localThreads
[
0
]);
...
...
@@ -399,6 +402,15 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str
openCLSafeCall
(
clReleaseKernel
(
kernel
));
}
// Convenience wrapper: obtains the kernel for `kernelName` from `source` through
// openCLGetKernelFromSource (forwarding `channels`, `depth` and `build_options`)
// and passes it, together with the work sizes and `args`, to the cl_kernel overload
// of openCLExecuteKernel.
void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
                          size_t globalThreads[3], size_t localThreads[3],
                          vector< pair<size_t, const void *> > &args,
                          int channels, int depth, const char *build_options)
{
    // The kernel lookup is the only argument with side effects, so nesting it in the
    // call keeps the same observable order as fetching it into a local first.
    openCLExecuteKernel(ctx,
                        openCLGetKernelFromSource(ctx, source, kernelName,
                                                  channels, depth, build_options),
                        globalThreads, localThreads, args);
}
void
openCLExecuteKernel
(
Context
*
ctx
,
const
cv
::
ocl
::
ProgramEntry
*
source
,
string
kernelName
,
size_t
globalThreads
[
3
],
size_t
localThreads
[
3
],
vector
<
pair
<
size_t
,
const
void
*>
>
&
args
,
int
channels
,
int
depth
)
...
...
modules/ocl/src/filtering.cpp
View file @
089cf423
...
...
@@ -578,104 +578,124 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel,
kernelDataFloat
.
size
()
*
sizeof
(
float
),
1
,
clMemcpyHostToDevice
);
}
size_t
BLOCK_SIZE
=
src
.
clCxt
->
getDeviceInfo
().
maxWorkItemSizes
[
0
];
size_t
tryWorkItems
=
src
.
clCxt
->
getDeviceInfo
().
maxWorkItemSizes
[
0
];
do
{
size_t
BLOCK_SIZE
=
tryWorkItems
;
while
(
BLOCK_SIZE
>
32
&&
BLOCK_SIZE
>=
(
size_t
)
ksize
.
width
*
2
&&
BLOCK_SIZE
>
(
size_t
)
src
.
cols
*
2
)
BLOCK_SIZE
/=
2
;
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
size_t
BLOCK_SIZE_Y
=
1
;
size_t
BLOCK_SIZE_Y
=
1
;
#else
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
#endif
CV_Assert
((
size_t
)
ksize
.
width
<=
BLOCK_SIZE
);
CV_Assert
((
size_t
)
ksize
.
width
<=
BLOCK_SIZE
);
bool
isIsolatedBorder
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
bool
isIsolatedBorder
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
cl_uint
stepBytes
=
src
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
stepBytes
));
int
offsetXBytes
=
src
.
offset
%
src
.
step
;
int
offsetX
=
offsetXBytes
/
src
.
elemSize
();
CV_Assert
((
int
)(
offsetX
*
src
.
elemSize
())
==
offsetXBytes
);
int
offsetY
=
src
.
offset
/
src
.
step
;
int
endX
=
(
offsetX
+
src
.
cols
);
int
endY
=
(
offsetY
+
src
.
rows
);
cl_int
rect
[
4
]
=
{
offsetX
,
offsetY
,
endX
,
endY
};
if
(
!
isIsolatedBorder
)
{
rect
[
2
]
=
src
.
wholecols
;
rect
[
3
]
=
src
.
wholerows
;
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
rect
[
0
]));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
cl_uint
_stepBytes
=
dst
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
_stepBytes
));
int
_offsetXBytes
=
dst
.
offset
%
dst
.
step
;
int
_offsetX
=
_offsetXBytes
/
dst
.
elemSize
();
CV_Assert
((
int
)(
_offsetX
*
dst
.
elemSize
())
==
_offsetXBytes
);
int
_offsetY
=
dst
.
offset
/
dst
.
step
;
int
_endX
=
(
_offsetX
+
dst
.
cols
);
int
_endY
=
(
_offsetY
+
dst
.
rows
);
cl_int
_rect
[
4
]
=
{
_offsetX
,
_offsetY
,
_endX
,
_endY
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
_rect
[
0
]));
float
borderValue
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
double
borderValueDouble
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
if
((
borderType
&
~
BORDER_ISOLATED
)
==
BORDER_CONSTANT
)
{
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValue
[
0
]));
else
args
.
push_back
(
make_pair
(
sizeof
(
float
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValueDouble
[
0
]));
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
cl_uint
stepBytes
=
src
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
stepBytes
));
int
offsetXBytes
=
src
.
offset
%
src
.
step
;
int
offsetX
=
offsetXBytes
/
src
.
elemSize
();
CV_Assert
((
int
)(
offsetX
*
src
.
elemSize
())
==
offsetXBytes
);
int
offsetY
=
src
.
offset
/
src
.
step
;
int
endX
=
(
offsetX
+
src
.
cols
);
int
endY
=
(
offsetY
+
src
.
rows
);
cl_int
rect
[
4
]
=
{
offsetX
,
offsetY
,
endX
,
endY
};
if
(
!
isIsolatedBorder
)
{
rect
[
2
]
=
src
.
wholecols
;
rect
[
3
]
=
src
.
wholerows
;
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
rect
[
0
]));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
cl_uint
_stepBytes
=
dst
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
_stepBytes
));
int
_offsetXBytes
=
dst
.
offset
%
dst
.
step
;
int
_offsetX
=
_offsetXBytes
/
dst
.
elemSize
();
CV_Assert
((
int
)(
_offsetX
*
dst
.
elemSize
())
==
_offsetXBytes
);
int
_offsetY
=
dst
.
offset
/
dst
.
step
;
int
_endX
=
(
_offsetX
+
dst
.
cols
);
int
_endY
=
(
_offsetY
+
dst
.
rows
);
cl_int
_rect
[
4
]
=
{
_offsetX
,
_offsetY
,
_endX
,
_endY
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
_rect
[
0
]));
float
borderValue
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
double
borderValueDouble
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
if
((
borderType
&
~
BORDER_ISOLATED
)
==
BORDER_CONSTANT
)
{
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValue
[
0
]));
else
args
.
push_back
(
make_pair
(
sizeof
(
float
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValueDouble
[
0
]));
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
oclKernelParameter
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
oclKernelParameter
.
data
));
const
char
*
btype
=
NULL
;
const
char
*
btype
=
NULL
;
switch
(
borderType
&
~
BORDER_ISOLATED
)
{
case
BORDER_CONSTANT
:
btype
=
"BORDER_CONSTANT"
;
break
;
case
BORDER_REPLICATE
:
btype
=
"BORDER_REPLICATE"
;
break
;
case
BORDER_REFLECT
:
btype
=
"BORDER_REFLECT"
;
break
;
case
BORDER_WRAP
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
BORDER_REFLECT101
:
btype
=
"BORDER_REFLECT_101"
;
break
;
}
switch
(
borderType
&
~
BORDER_ISOLATED
)
{
case
BORDER_CONSTANT
:
btype
=
"BORDER_CONSTANT"
;
break
;
case
BORDER_REPLICATE
:
btype
=
"BORDER_REPLICATE"
;
break
;
case
BORDER_REFLECT
:
btype
=
"BORDER_REFLECT"
;
break
;
case
BORDER_WRAP
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
BORDER_REFLECT101
:
btype
=
"BORDER_REFLECT_101"
;
break
;
}
int
requiredTop
=
anchor
.
y
;
int
requiredLeft
=
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
int
requiredRight
=
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
int
h
=
isIsolatedBorder
?
src
.
rows
:
src
.
wholerows
;
int
w
=
isIsolatedBorder
?
src
.
cols
:
src
.
wholecols
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
char
build_options
[
1024
];
sprintf
(
build_options
,
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
"-D %s -D %s -D %s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
src
.
depth
(),
src
.
oclchannels
(),
useDouble
?
1
:
0
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
kernel_size_y2_aligned
,
btype
,
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isIsolatedBorder
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
);
size_t
lt
[
3
]
=
{
BLOCK_SIZE
,
1
,
1
};
size_t
gt
[
3
]
=
{
divUp
(
dst
.
cols
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
,
divUp
(
dst
.
rows
,
BLOCK_SIZE_Y
),
1
};
cl_kernel
kernel
=
openCLGetKernelFromSource
(
src
.
clCxt
,
&
filtering_filter2D
,
"filter2D"
,
-
1
,
-
1
,
build_options
);
size_t
kernelWorkGroupSize
;
openCLSafeCall
(
clGetKernelWorkGroupInfo
(
kernel
,
getClDeviceID
(
src
.
clCxt
),
CL_KERNEL_WORK_GROUP_SIZE
,
sizeof
(
size_t
),
&
kernelWorkGroupSize
,
0
));
if
(
lt
[
0
]
>
kernelWorkGroupSize
)
{
clReleaseKernel
(
kernel
);
CV_Assert
(
BLOCK_SIZE
>
kernelWorkGroupSize
);
tryWorkItems
=
kernelWorkGroupSize
;
continue
;
}
int
requiredTop
=
anchor
.
y
;
int
requiredLeft
=
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
int
requiredRight
=
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
int
h
=
isIsolatedBorder
?
src
.
rows
:
src
.
wholerows
;
int
w
=
isIsolatedBorder
?
src
.
cols
:
src
.
wholecols
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
char
build_options
[
1024
];
sprintf
(
build_options
,
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
"-D %s -D %s -D %s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
src
.
depth
(),
src
.
oclchannels
(),
useDouble
?
1
:
0
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
kernel_size_y2_aligned
,
btype
,
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isIsolatedBorder
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
);
size_t
gt
[
3
]
=
{
divUp
(
dst
.
cols
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
,
divUp
(
dst
.
rows
,
BLOCK_SIZE_Y
),
1
},
lt
[
3
]
=
{
BLOCK_SIZE
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
filtering_filter2D
,
"filter2D"
,
gt
,
lt
,
args
,
-
1
,
-
1
,
build_options
);
openCLExecuteKernel
(
src
.
clCxt
,
kernel
,
gt
,
lt
,
args
);
// kernel will be released here
}
while
(
false
);
}
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getLinearFilter_GPU
(
int
/*srcType*/
,
int
/*dstType*/
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
...
...
@@ -770,106 +790,126 @@ static void GPUFilterBox(const oclMat &src, oclMat &dst,
(
src
.
rows
==
dst
.
rows
));
CV_Assert
(
src
.
oclchannels
()
==
dst
.
oclchannels
());
size_t
BLOCK_SIZE
=
src
.
clCxt
->
getDeviceInfo
().
maxWorkItemSizes
[
0
];
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
CV_Assert
((
size_t
)
ksize
.
width
<=
BLOCK_SIZE
);
bool
isIsolatedBorder
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
cl_uint
stepBytes
=
src
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
stepBytes
));
int
offsetXBytes
=
src
.
offset
%
src
.
step
;
int
offsetX
=
offsetXBytes
/
src
.
elemSize
();
CV_Assert
((
int
)(
offsetX
*
src
.
elemSize
())
==
offsetXBytes
);
int
offsetY
=
src
.
offset
/
src
.
step
;
int
endX
=
(
offsetX
+
src
.
cols
);
int
endY
=
(
offsetY
+
src
.
rows
);
cl_int
rect
[
4
]
=
{
offsetX
,
offsetY
,
endX
,
endY
};
if
(
!
isIsolatedBorder
)
{
rect
[
2
]
=
src
.
wholecols
;
rect
[
3
]
=
src
.
wholerows
;
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
rect
[
0
]));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
cl_uint
_stepBytes
=
dst
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
_stepBytes
));
int
_offsetXBytes
=
dst
.
offset
%
dst
.
step
;
int
_offsetX
=
_offsetXBytes
/
dst
.
elemSize
();
CV_Assert
((
int
)(
_offsetX
*
dst
.
elemSize
())
==
_offsetXBytes
);
int
_offsetY
=
dst
.
offset
/
dst
.
step
;
int
_endX
=
(
_offsetX
+
dst
.
cols
);
int
_endY
=
(
_offsetY
+
dst
.
rows
);
cl_int
_rect
[
4
]
=
{
_offsetX
,
_offsetY
,
_endX
,
_endY
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
_rect
[
0
]));
bool
useDouble
=
src
.
depth
()
==
CV_64F
;
size_t
tryWorkItems
=
src
.
clCxt
->
getDeviceInfo
().
maxWorkItemSizes
[
0
];
do
{
size_t
BLOCK_SIZE
=
tryWorkItems
;
while
(
BLOCK_SIZE
>
32
&&
BLOCK_SIZE
>=
(
size_t
)
ksize
.
width
*
2
&&
BLOCK_SIZE
>
(
size_t
)
src
.
cols
*
2
)
BLOCK_SIZE
/=
2
;
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
CV_Assert
((
size_t
)
ksize
.
width
<=
BLOCK_SIZE
);
bool
isIsolatedBorder
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
cl_uint
stepBytes
=
src
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
stepBytes
));
int
offsetXBytes
=
src
.
offset
%
src
.
step
;
int
offsetX
=
offsetXBytes
/
src
.
elemSize
();
CV_Assert
((
int
)(
offsetX
*
src
.
elemSize
())
==
offsetXBytes
);
int
offsetY
=
src
.
offset
/
src
.
step
;
int
endX
=
(
offsetX
+
src
.
cols
);
int
endY
=
(
offsetY
+
src
.
rows
);
cl_int
rect
[
4
]
=
{
offsetX
,
offsetY
,
endX
,
endY
};
if
(
!
isIsolatedBorder
)
{
rect
[
2
]
=
src
.
wholecols
;
rect
[
3
]
=
src
.
wholerows
;
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
rect
[
0
]));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
cl_uint
_stepBytes
=
dst
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
_stepBytes
));
int
_offsetXBytes
=
dst
.
offset
%
dst
.
step
;
int
_offsetX
=
_offsetXBytes
/
dst
.
elemSize
();
CV_Assert
((
int
)(
_offsetX
*
dst
.
elemSize
())
==
_offsetXBytes
);
int
_offsetY
=
dst
.
offset
/
dst
.
step
;
int
_endX
=
(
_offsetX
+
dst
.
cols
);
int
_endY
=
(
_offsetY
+
dst
.
rows
);
cl_int
_rect
[
4
]
=
{
_offsetX
,
_offsetY
,
_endX
,
_endY
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
_rect
[
0
]));
bool
useDouble
=
src
.
depth
()
==
CV_64F
;
float
borderValue
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
double
borderValueDouble
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
if
((
borderType
&
~
BORDER_ISOLATED
)
==
BORDER_CONSTANT
)
{
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValue
[
0
]));
else
args
.
push_back
(
make_pair
(
sizeof
(
float
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValueDouble
[
0
]));
}
float
borderValue
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
double
borderValueDouble
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
if
((
borderType
&
~
BORDER_ISOLATED
)
==
BORDER_CONSTANT
)
{
double
alphaDouble
=
alpha
;
// DON'T move into 'if' body
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValue
[
0
]
));
args
.
push_back
(
make_pair
(
sizeof
(
double
)
,
(
void
*
)
&
alphaDouble
));
else
args
.
push_back
(
make_pair
(
sizeof
(
float
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValueDouble
[
0
]));
}
args
.
push_back
(
make_pair
(
sizeof
(
float
),
(
void
*
)
&
alpha
));
double
alphaDouble
=
alpha
;
// DON'T move into 'if' body
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
),
(
void
*
)
&
alphaDouble
));
else
args
.
push_back
(
make_pair
(
sizeof
(
float
),
(
void
*
)
&
alpha
));
const
char
*
btype
=
NULL
;
const
char
*
btype
=
NULL
;
switch
(
borderType
&
~
BORDER_ISOLATED
)
{
case
BORDER_CONSTANT
:
btype
=
"BORDER_CONSTANT"
;
break
;
case
BORDER_REPLICATE
:
btype
=
"BORDER_REPLICATE"
;
break
;
case
BORDER_REFLECT
:
btype
=
"BORDER_REFLECT"
;
break
;
case
BORDER_WRAP
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
BORDER_REFLECT101
:
btype
=
"BORDER_REFLECT_101"
;
break
;
}
switch
(
borderType
&
~
BORDER_ISOLATED
)
{
case
BORDER_CONSTANT
:
btype
=
"BORDER_CONSTANT"
;
break
;
case
BORDER_REPLICATE
:
btype
=
"BORDER_REPLICATE"
;
break
;
case
BORDER_REFLECT
:
btype
=
"BORDER_REFLECT"
;
break
;
case
BORDER_WRAP
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
BORDER_REFLECT101
:
btype
=
"BORDER_REFLECT_101"
;
break
;
}
int
requiredTop
=
anchor
.
y
;
int
requiredLeft
=
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
int
requiredRight
=
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
int
h
=
isIsolatedBorder
?
src
.
rows
:
src
.
wholerows
;
int
w
=
isIsolatedBorder
?
src
.
cols
:
src
.
wholecols
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
CV_Assert
(
w
>=
ksize
.
width
&&
h
>=
ksize
.
height
);
// TODO Other cases are not tested well
char
build_options
[
1024
];
sprintf
(
build_options
,
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
src
.
depth
(),
src
.
oclchannels
(),
useDouble
?
1
:
0
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
,
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isIsolatedBorder
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
);
size_t
lt
[
3
]
=
{
BLOCK_SIZE
,
1
,
1
};
size_t
gt
[
3
]
=
{
divUp
(
dst
.
cols
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
,
divUp
(
dst
.
rows
,
BLOCK_SIZE_Y
),
1
};
cl_kernel
kernel
=
openCLGetKernelFromSource
(
src
.
clCxt
,
&
filtering_boxFilter
,
"boxFilter"
,
-
1
,
-
1
,
build_options
);
size_t
kernelWorkGroupSize
;
openCLSafeCall
(
clGetKernelWorkGroupInfo
(
kernel
,
getClDeviceID
(
src
.
clCxt
),
CL_KERNEL_WORK_GROUP_SIZE
,
sizeof
(
size_t
),
&
kernelWorkGroupSize
,
0
));
if
(
lt
[
0
]
>
kernelWorkGroupSize
)
{
clReleaseKernel
(
kernel
);
CV_Assert
(
BLOCK_SIZE
>
kernelWorkGroupSize
);
tryWorkItems
=
kernelWorkGroupSize
;
continue
;
}
int
requiredTop
=
anchor
.
y
;
int
requiredLeft
=
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
int
requiredRight
=
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
int
h
=
isIsolatedBorder
?
src
.
rows
:
src
.
wholerows
;
int
w
=
isIsolatedBorder
?
src
.
cols
:
src
.
wholecols
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
CV_Assert
(
w
>=
ksize
.
width
&&
h
>=
ksize
.
height
);
// TODO Other cases are not tested well
char
build_options
[
1024
];
sprintf
(
build_options
,
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
src
.
depth
(),
src
.
oclchannels
(),
useDouble
?
1
:
0
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
,
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isIsolatedBorder
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
);
size_t
gt
[
3
]
=
{
divUp
(
dst
.
cols
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
,
divUp
(
dst
.
rows
,
BLOCK_SIZE_Y
),
1
},
lt
[
3
]
=
{
BLOCK_SIZE
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
filtering_boxFilter
,
"boxFilter"
,
gt
,
lt
,
args
,
-
1
,
-
1
,
build_options
);
openCLExecuteKernel
(
src
.
clCxt
,
kernel
,
gt
,
lt
,
args
);
// kernel will be released here
}
while
(
false
);
}
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getBoxFilter_GPU
(
int
/*srcType*/
,
int
/*dstType*/
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment