Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
0f95f0d8
Commit
0f95f0d8
authored
Oct 26, 2013
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocl: rewrite filter2D
parent
0bf9ece9
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
537 additions
and
493 deletions
+537
-493
ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+6
-6
filtering.cpp
modules/ocl/src/filtering.cpp
+158
-103
filtering_filter2D.cl
modules/ocl/src/opencl/filtering_filter2D.cl
+370
-0
filtering_laplacian.cl
modules/ocl/src/opencl/filtering_laplacian.cl
+0
-381
test_filters.cpp
modules/ocl/test/test_filters.cpp
+3
-3
No files found.
modules/ocl/include/opencv2/ocl/ocl.hpp
View file @
0f95f0d8
...
...
@@ -718,8 +718,9 @@ namespace cv
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createDerivFilter_GPU
(
int
srcType
,
int
dstType
,
int
dx
,
int
dy
,
int
ksize
,
int
borderType
=
BORDER_DEFAULT
);
//! applies Laplacian operator to the image
// supports only ksize = 1 and ksize = 3 8UC1 8UC4 32FC1 32FC4 data type
CV_EXPORTS
void
Laplacian
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
int
ksize
=
1
,
double
scale
=
1
);
// supports only ksize = 1 and ksize = 3
CV_EXPORTS
void
Laplacian
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
int
ksize
=
1
,
double
scale
=
1
,
double
delta
=
0
,
int
borderType
=
BORDER_DEFAULT
);
//! returns 2D box filter
// dst type must be the same as source type
...
...
@@ -731,11 +732,12 @@ namespace cv
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
//! returns 2D filter with the specified kernel
// supports
CV_8UC1 and CV_8UC4 types
// supports
: dst type must be the same as source type
CV_EXPORTS
Ptr
<
BaseFilter_GPU
>
getLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
//! returns the non-separable linear filter engine
// supports: dst type must be the same as source type
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
...
...
@@ -762,10 +764,8 @@ namespace cv
}
//! applies non-separable 2D linear filter to the image
// Note, at the moment this function only works when anchor point is in the kernel center
// and kernel size supported is either 3x3 or 5x5; otherwise the function will fail to output valid result
CV_EXPORTS
void
filter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
const
Mat
&
kernel
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
Point
anchor
=
Point
(
-
1
,
-
1
),
double
delta
=
0.0
,
int
borderType
=
BORDER_DEFAULT
);
//! applies separable 2D linear filter to the image
CV_EXPORTS
void
sepFilter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
const
Mat
&
kernelX
,
const
Mat
&
kernelY
,
...
...
modules/ocl/src/filtering.cpp
View file @
0f95f0d8
...
...
@@ -69,37 +69,14 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize)
normalizeAnchor
(
anchor
.
y
,
ksize
.
height
);
}
inline
void
normalizeROI
(
Rect
&
roi
,
const
Size
&
ksize
,
const
Point
&
anchor
,
const
Size
&
src_size
)
inline
void
normalizeROI
(
Rect
&
roi
,
const
Size
&
ksize
,
const
Point
&
/*anchor*/
,
const
Size
&
src_size
)
{
if
(
roi
==
Rect
(
0
,
0
,
-
1
,
-
1
))
roi
=
Rect
(
0
,
0
,
src_size
.
width
,
src_size
.
height
);
CV_Assert
(
ksize
.
height
>
0
&&
ksize
.
width
>
0
&&
((
ksize
.
height
&
1
)
==
1
)
&&
((
ksize
.
width
&
1
)
==
1
));
CV_Assert
((
anchor
.
x
==
-
1
&&
anchor
.
y
==
-
1
)
||
(
anchor
.
x
==
ksize
.
width
>>
1
&&
anchor
.
y
==
ksize
.
height
>>
1
));
CV_Assert
(
roi
.
x
>=
0
&&
roi
.
y
>=
0
&&
roi
.
width
<=
src_size
.
width
&&
roi
.
height
<=
src_size
.
height
);
}
inline
void
normalizeKernel
(
const
Mat
&
kernel
,
oclMat
&
gpu_krnl
,
int
type
=
CV_8U
,
int
*
nDivisor
=
0
,
bool
reverse
=
false
)
{
int
scale
=
nDivisor
&&
(
kernel
.
depth
()
==
CV_32F
||
kernel
.
depth
()
==
CV_64F
)
?
256
:
1
;
if
(
nDivisor
)
*
nDivisor
=
scale
;
Mat
temp
(
kernel
.
size
(),
type
);
kernel
.
convertTo
(
temp
,
type
,
scale
);
Mat
cont_krnl
=
temp
.
reshape
(
1
,
1
);
if
(
reverse
)
{
int
count
=
cont_krnl
.
cols
>>
1
;
for
(
int
i
=
0
;
i
<
count
;
++
i
)
std
::
swap
(
cont_krnl
.
at
<
int
>
(
0
,
i
),
cont_krnl
.
at
<
int
>
(
0
,
cont_krnl
.
cols
-
1
-
i
));
}
gpu_krnl
.
upload
(
cont_krnl
);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -168,7 +145,7 @@ typedef void (*GPUMorfFilter_t)(const oclMat & , oclMat & , oclMat & , Size &, c
class
MorphFilter_GPU
:
public
BaseFilter_GPU
{
public
:
MorphFilter_GPU
(
const
Size
&
ksize_
,
const
Point
&
anchor_
,
const
ocl
Mat
&
kernel_
,
GPUMorfFilter_t
func_
)
:
MorphFilter_GPU
(
const
Size
&
ksize_
,
const
Point
&
anchor_
,
const
Mat
&
kernel_
,
GPUMorfFilter_t
func_
)
:
BaseFilter_GPU
(
ksize_
,
anchor_
,
BORDER_CONSTANT
),
kernel
(
kernel_
),
func
(
func_
),
rectKernel
(
false
)
{}
virtual
void
operator
()(
const
oclMat
&
src
,
oclMat
&
dst
)
...
...
@@ -355,16 +332,17 @@ Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat
CV_Assert
(
op
==
MORPH_ERODE
||
op
==
MORPH_DILATE
);
CV_Assert
(
type
==
CV_8UC1
||
type
==
CV_8UC3
||
type
==
CV_8UC4
||
type
==
CV_32FC1
||
type
==
CV_32FC3
||
type
==
CV_32FC4
);
oclMat
gpu_krnl
;
normalizeKernel
(
kernel
,
gpu_krnl
);
normalizeAnchor
(
anchor
,
ksize
);
Mat
kernel8U
;
kernel
.
convertTo
(
kernel8U
,
CV_8U
);
Mat
cont_krnl
=
kernel8U
.
reshape
(
1
,
1
);
bool
noZero
=
true
;
for
(
int
i
=
0
;
i
<
kernel
.
rows
*
kernel
.
cols
;
++
i
)
if
(
kernel
.
data
[
i
]
!=
1
)
noZero
=
false
;
MorphFilter_GPU
*
mfgpu
=
new
MorphFilter_GPU
(
ksize
,
anchor
,
gpu
_krnl
,
GPUMorfFilter_callers
[
op
][
CV_MAT_CN
(
type
)]);
MorphFilter_GPU
*
mfgpu
=
new
MorphFilter_GPU
(
ksize
,
anchor
,
cont
_krnl
,
GPUMorfFilter_callers
[
op
][
CV_MAT_CN
(
type
)]);
if
(
noZero
)
mfgpu
->
rectKernel
=
true
;
...
...
@@ -524,12 +502,12 @@ void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &ke
namespace
{
typedef
void
(
*
GPUFilter2D_t
)(
const
oclMat
&
,
oclMat
&
,
const
ocl
Mat
&
,
const
Size
&
,
const
Point
&
,
const
int
);
typedef
void
(
*
GPUFilter2D_t
)(
const
oclMat
&
,
oclMat
&
,
const
Mat
&
,
const
Size
&
,
const
Point
&
,
const
int
);
class
LinearFilter_GPU
:
public
BaseFilter_GPU
{
public
:
LinearFilter_GPU
(
const
Size
&
ksize_
,
const
Point
&
anchor_
,
const
ocl
Mat
&
kernel_
,
GPUFilter2D_t
func_
,
LinearFilter_GPU
(
const
Size
&
ksize_
,
const
Point
&
anchor_
,
const
Mat
&
kernel_
,
GPUFilter2D_t
func_
,
int
borderType_
)
:
BaseFilter_GPU
(
ksize_
,
anchor_
,
borderType_
),
kernel
(
kernel_
),
func
(
func_
)
{}
...
...
@@ -543,118 +521,192 @@ public:
};
}
static
void
GPUFilter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
const
oclMat
&
mat_kernel
,
// prepare kernel: transpose and make double rows (+align). Returns size of aligned row
// Samples:
// a b c
// Input: d e f
// g h i
// Output, last two zeros is the alignment:
// a d g a d g 0 0
// b e h b e h 0 0
// c f i c f i 0 0
template
<
typename
T
>
static
int
_prepareKernelFilter2D
(
std
::
vector
<
T
>&
data
,
const
Mat
&
kernel
)
{
Mat
_kernel
;
kernel
.
convertTo
(
_kernel
,
DataDepth
<
T
>::
value
);
int
size_y_aligned
=
roundUp
(
kernel
.
rows
*
2
,
4
);
data
.
clear
();
data
.
resize
(
size_y_aligned
*
kernel
.
cols
,
0
);
for
(
int
x
=
0
;
x
<
kernel
.
cols
;
x
++
)
{
for
(
int
y
=
0
;
y
<
kernel
.
rows
;
y
++
)
{
data
[
x
*
size_y_aligned
+
y
]
=
_kernel
.
at
<
T
>
(
y
,
x
);
data
[
x
*
size_y_aligned
+
y
+
kernel
.
rows
]
=
_kernel
.
at
<
T
>
(
y
,
x
);
}
}
return
size_y_aligned
;
}
static
void
GPUFilter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
const
Point
&
anchor
,
const
int
borderType
)
{
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
CV_Assert
((
src
.
oclchannels
()
==
dst
.
oclchannels
()));
CV_Assert
(
ksize
.
height
>
0
&&
ksize
.
width
>
0
&&
((
ksize
.
height
&
1
)
==
1
)
&&
((
ksize
.
width
&
1
)
==
1
));
CV_Assert
((
anchor
.
x
==
-
1
&&
anchor
.
y
==
-
1
)
||
(
anchor
.
x
==
ksize
.
width
>>
1
&&
anchor
.
y
==
ksize
.
height
>>
1
));
CV_Assert
(
ksize
.
width
==
ksize
.
height
);
Context
*
clCxt
=
src
.
clCxt
;
CV_Assert
(
src
.
oclchannels
()
==
dst
.
oclchannels
());
int
filterWidth
=
ksize
.
width
;
bool
ksize_3x3
=
filterWidth
==
3
&&
src
.
type
()
!=
CV_32FC4
&&
src
.
type
()
!=
CV_32FC3
;
// CV_32FC4 is not tuned up with filter2d_3x3 kernel
CV_Assert
(
kernel
.
cols
==
ksize
.
width
&&
kernel
.
rows
==
ksize
.
height
)
;
CV_Assert
(
kernel
.
channels
()
==
1
);
string
kernelName
=
ksize_3x3
?
"filter2D_3x3"
:
"filter2D"
;
CV_Assert
(
anchor
.
x
>=
0
&&
anchor
.
x
<
kernel
.
cols
);
CV_Assert
(
anchor
.
y
>=
0
&&
anchor
.
y
<
kernel
.
rows
);
size_t
src_offset_x
=
(
src
.
offset
%
src
.
step
)
/
src
.
elemSize
();
size_t
src_offset_y
=
src
.
offset
/
src
.
step
;
bool
useDouble
=
src
.
depth
()
==
CV_64F
;
size_t
dst_offset_x
=
(
dst
.
offset
%
dst
.
step
)
/
dst
.
elemSize
();
size_t
dst_offset_y
=
dst
.
offset
/
dst
.
step
;
std
::
vector
<
float
>
kernelDataFloat
;
std
::
vector
<
double
>
kernelDataDouble
;
int
kernel_size_y2_aligned
=
useDouble
?
_prepareKernelFilter2D
<
double
>
(
kernelDataDouble
,
kernel
)
:
_prepareKernelFilter2D
<
float
>
(
kernelDataFloat
,
kernel
);
oclMat
oclKernelParameter
;
if
(
useDouble
)
{
oclKernelParameter
.
createEx
(
1
,
kernelDataDouble
.
size
(),
CV_64FC1
,
DEVICE_MEM_R_ONLY
,
DEVICE_MEM_DEFAULT
);
openCLMemcpy2D
(
src
.
clCxt
,
oclKernelParameter
.
data
,
kernelDataDouble
.
size
()
*
sizeof
(
double
),
&
kernelDataDouble
[
0
],
kernelDataDouble
.
size
()
*
sizeof
(
double
),
kernelDataDouble
.
size
()
*
sizeof
(
double
),
1
,
clMemcpyHostToDevice
);
}
else
{
oclKernelParameter
.
createEx
(
1
,
kernelDataFloat
.
size
(),
CV_32FC1
,
DEVICE_MEM_R_ONLY
,
DEVICE_MEM_DEFAULT
);
openCLMemcpy2D
(
src
.
clCxt
,
oclKernelParameter
.
data
,
kernelDataFloat
.
size
()
*
sizeof
(
float
),
&
kernelDataFloat
[
0
],
kernelDataFloat
.
size
()
*
sizeof
(
float
),
kernelDataFloat
.
size
()
*
sizeof
(
float
),
1
,
clMemcpyHostToDevice
);
}
int
paddingPixels
=
filterWidth
&
(
-
2
);
size_t
BLOCK_SIZE
=
src
.
clCxt
->
getDeviceInfo
().
maxWorkItemSizes
[
0
];
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
size_t
BLOCK_SIZE_Y
=
1
;
#else
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
#endif
size_t
localThreads
[
3
]
=
{
ksize_3x3
?
256
:
16
,
ksize_3x3
?
1
:
16
,
1
};
size_t
globalThreads
[
3
]
=
{
src
.
wholecols
,
src
.
wholerows
,
1
};
CV_Assert
((
size_t
)
ksize
.
width
<=
BLOCK_SIZE
);
int
cn
=
src
.
oclchannels
();
int
src_step
=
(
int
)(
src
.
step
/
src
.
elemSize
());
int
dst_step
=
(
int
)(
dst
.
step
/
src
.
elemSize
());
bool
isIsolatedBorder
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
int
localWidth
=
localThreads
[
0
]
+
paddingPixels
;
int
localHeight
=
localThreads
[
1
]
+
paddingPixels
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
size_t
localMemSize
=
ksize_3x3
?
260
*
6
*
src
.
elemSize
()
:
(
localWidth
*
localHeight
)
*
src
.
elemSize
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
cl_uint
stepBytes
=
src
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
stepBytes
));
int
offsetXBytes
=
src
.
offset
%
src
.
step
;
int
offsetX
=
offsetXBytes
/
src
.
elemSize
();
CV_Assert
((
int
)(
offsetX
*
src
.
elemSize
())
==
offsetXBytes
);
int
offsetY
=
src
.
offset
/
src
.
step
;
int
endX
=
(
offsetX
+
src
.
cols
);
int
endY
=
(
offsetY
+
src
.
rows
);
cl_int
rect
[
4
]
=
{
offsetX
,
offsetY
,
endX
,
endY
};
if
(
!
isIsolatedBorder
)
{
rect
[
2
]
=
src
.
wholecols
;
rect
[
3
]
=
src
.
wholerows
;
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
rect
[
0
]));
int
vector_lengths
[
4
][
7
]
=
{{
4
,
4
,
4
,
4
,
4
,
4
,
4
},
{
4
,
4
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
4
,
4
,
4
,
4
,
1
,
1
,
4
}
};
int
cols
=
dst
.
cols
+
((
dst_offset_x
)
&
(
vector_lengths
[
cn
-
1
][
src
.
depth
()]
-
1
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
cl_uint
_stepBytes
=
dst
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
_stepBytes
));
int
_offsetXBytes
=
dst
.
offset
%
dst
.
step
;
int
_offsetX
=
_offsetXBytes
/
dst
.
elemSize
();
CV_Assert
((
int
)(
_offsetX
*
dst
.
elemSize
())
==
_offsetXBytes
);
int
_offsetY
=
dst
.
offset
/
dst
.
step
;
int
_endX
=
(
_offsetX
+
dst
.
cols
);
int
_endY
=
(
_offsetY
+
dst
.
rows
);
cl_int
_rect
[
4
]
=
{
_offsetX
,
_offsetY
,
_endX
,
_endY
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
_rect
[
0
]));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
mat_kernel
.
data
));
args
.
push_back
(
make_pair
(
localMemSize
,
(
void
*
)
NULL
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_offset_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_offset_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_offset_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_offset_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cols
));
char
btype
[
30
];
switch
(
borderType
)
float
borderValue
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
double
borderValueDouble
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
if
((
borderType
&
~
BORDER_ISOLATED
)
==
BORDER_CONSTANT
)
{
case
0
:
sprintf
(
btype
,
"BORDER_CONSTANT"
);
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValue
[
0
]));
else
args
.
push_back
(
make_pair
(
sizeof
(
float
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValueDouble
[
0
]));
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
oclKernelParameter
.
data
));
const
char
*
btype
=
NULL
;
switch
(
borderType
&
~
BORDER_ISOLATED
)
{
case
BORDER_CONSTANT
:
btype
=
"BORDER_CONSTANT"
;
break
;
case
1
:
sprintf
(
btype
,
"BORDER_REPLICATE"
)
;
case
BORDER_REPLICATE
:
btype
=
"BORDER_REPLICATE"
;
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
)
;
case
BORDER_REFLECT
:
btype
=
"BORDER_REFLECT"
;
break
;
case
3
:
case
BORDER_WRAP
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
)
;
case
BORDER_REFLECT101
:
btype
=
"BORDER_REFLECT_101"
;
break
;
}
int
type
=
src
.
depth
();
char
build_options
[
150
];
sprintf
(
build_options
,
"-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d"
,
btype
,
cn
,
type
,
cn
,
ksize
.
width
);
openCLExecuteKernel
(
clCxt
,
&
filtering_laplacian
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
int
requiredTop
=
anchor
.
y
;
int
requiredLeft
=
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
int
requiredRight
=
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
int
h
=
isIsolatedBorder
?
src
.
rows
:
src
.
wholerows
;
int
w
=
isIsolatedBorder
?
src
.
cols
:
src
.
wholecols
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
char
build_options
[
1024
];
sprintf
(
build_options
,
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
"-D %s -D %s -D %s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
src
.
depth
(),
src
.
oclchannels
(),
useDouble
?
1
:
0
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
kernel_size_y2_aligned
,
btype
,
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isIsolatedBorder
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
);
size_t
gt
[
3
]
=
{
divUp
(
dst
.
cols
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
,
divUp
(
dst
.
rows
,
BLOCK_SIZE_Y
),
1
},
lt
[
3
]
=
{
BLOCK_SIZE
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
filtering_filter2D
,
"filter2D"
,
gt
,
lt
,
args
,
-
1
,
-
1
,
build_options
);
}
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getLinearFilter_GPU
(
int
/*srcType*/
,
int
/*dstType*/
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
const
Point
&
anchor
,
int
borderType
)
{
static
const
GPUFilter2D_t
GPUFilter2D_callers
[]
=
{
0
,
GPUFilter2D
,
0
,
GPUFilter2D
,
GPUFilter2D
};
CV_Assert
((
srcType
==
CV_8UC1
||
srcType
==
CV_8UC3
||
srcType
==
CV_8UC4
||
srcType
==
CV_32FC1
||
srcType
==
CV_32FC3
||
srcType
==
CV_32FC4
)
&&
dstType
==
srcType
);
oclMat
gpu_krnl
;
Point
norm_archor
=
anchor
;
normalizeKernel
(
kernel
,
gpu_krnl
,
CV_32FC1
);
normalizeAnchor
(
norm_archor
,
ksize
);
return
Ptr
<
BaseFilter_GPU
>
(
new
LinearFilter_GPU
(
ksize
,
anchor
,
gpu_krnl
,
GPUFilter2D_callers
[
CV_MAT_CN
(
srcType
)]
,
return
Ptr
<
BaseFilter_GPU
>
(
new
LinearFilter_GPU
(
ksize
,
norm_archor
,
kernel
,
GPUFilter2D
,
borderType
));
}
Ptr
<
FilterEngine_GPU
>
cv
::
ocl
::
createLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Point
&
anchor
,
int
borderType
)
{
Size
ksize
=
kernel
.
size
();
Size
ksize
=
kernel
.
size
();
// TODO remove duplicated parameter
Ptr
<
BaseFilter_GPU
>
linearFilter
=
getLinearFilter_GPU
(
srcType
,
dstType
,
kernel
,
ksize
,
anchor
,
borderType
);
return
createFilter2D_GPU
(
linearFilter
);
}
void
cv
::
ocl
::
filter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
const
Mat
&
kernel
,
Point
anchor
,
int
borderType
)
void
cv
::
ocl
::
filter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
const
Mat
&
kernel
,
Point
anchor
,
double
delta
,
int
borderType
)
{
CV_Assert
(
delta
==
0
);
if
(
ddepth
<
0
)
ddepth
=
src
.
depth
();
...
...
@@ -1222,8 +1274,11 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
sepFilter2D
(
src
,
dst
,
ddepth
,
kx
,
ky
,
Point
(
-
1
,
-
1
),
delta
,
bordertype
);
}
void
cv
::
ocl
::
Laplacian
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
int
ksize
,
double
scale
)
void
cv
::
ocl
::
Laplacian
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
int
ksize
,
double
scale
,
double
delta
,
int
borderType
)
{
CV_Assert
(
delta
==
0
);
if
(
!
src
.
clCxt
->
supportsFeature
(
FEATURE_CL_DOUBLE
)
&&
src
.
type
()
==
CV_64F
)
{
CV_Error
(
CV_OpenCLDoubleNotSupported
,
"Selected device doesn't support double"
);
...
...
@@ -1232,17 +1287,17 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
CV_Assert
(
ksize
==
1
||
ksize
==
3
);
int
K
[
2
][
9
]
=
double
K
[
2
][
9
]
=
{
{
0
,
1
,
0
,
1
,
-
4
,
1
,
0
,
1
,
0
},
{
2
,
0
,
2
,
0
,
-
8
,
0
,
2
,
0
,
2
}
};
Mat
kernel
(
3
,
3
,
CV_
32S
,
(
void
*
)
K
[
ksize
==
3
]);
Mat
kernel
(
3
,
3
,
CV_
64F
,
(
void
*
)
K
[
ksize
==
3
?
1
:
0
]);
if
(
scale
!=
1
)
kernel
*=
scale
;
filter2D
(
src
,
dst
,
ddepth
,
kernel
,
Point
(
-
1
,
-
1
));
filter2D
(
src
,
dst
,
ddepth
,
kernel
,
Point
(
-
1
,
-
1
)
,
0
,
borderType
);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
modules/ocl/src/opencl/filtering_filter2D.cl
0 → 100644
View file @
0f95f0d8
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//
IMPORTANT:
READ
BEFORE
DOWNLOADING,
COPYING,
INSTALLING
OR
USING.
//
//
By
downloading,
copying,
installing
or
using
the
software
you
agree
to
this
license.
//
If
you
do
not
agree
to
this
license,
do
not
download,
install,
//
copy
or
use
the
software.
//
//
//
License
Agreement
//
For
Open
Source
Computer
Vision
Library
//
//
Copyright
(
C
)
2010-2013,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
//
*
Redistribution
's
of
source
code
must
retain
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer.
//
//
*
Redistribution
's
in
binary
form
must
reproduce
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer
in
the
documentation
//
and/or
other
materials
provided
with
the
distribution.
//
//
*
The
name
of
the
copyright
holders
may
not
be
used
to
endorse
or
promote
products
//
derived
from
this
software
without
specific
prior
written
permission.
//
//
This
software
is
provided
by
the
copyright
holders
and
contributors
as
is
and
//
any
express
or
implied
warranties,
including,
but
not
limited
to,
the
implied
//
warranties
of
merchantability
and
fitness
for
a
particular
purpose
are
disclaimed.
//
In
no
event
shall
the
Intel
Corporation
or
contributors
be
liable
for
any
direct,
//
indirect,
incidental,
special,
exemplary,
or
consequential
damages
//
(
including,
but
not
limited
to,
procurement
of
substitute
goods
or
services
;
//
loss
of
use,
data,
or
profits
; or business interruption) however caused
//
and
on
any
theory
of
liability,
whether
in
contract,
strict
liability,
//
or
tort
(
including
negligence
or
otherwise
)
arising
in
any
way
out
of
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
ifdef
BORDER_REPLICATE
//BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
l_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
r_edge
)
-1
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
t_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
b_edge
)
-1
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT
//BORDER_REFLECT:
fedcba|abcdefgh|hgfedcb
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-1+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-1+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT_101
//BORDER_REFLECT_101:
gfedcb|abcdefgh|gfedcba
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-2+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-2+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
//blur
function
does
not
support
BORDER_WRAP
#
ifdef
BORDER_WRAP
//BORDER_WRAP:
cdefgh|abcdefgh|abcdefg
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
i
)
+
(
r_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
i
)
-
(
r_edge
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
i
)
+
(
b_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
endif
#
ifdef
EXTRA_EXTRAPOLATION
//
border
>
src
image
size
#
ifdef
BORDER_CONSTANT
//
None
#
elif
defined
BORDER_REPLICATE
#
define
EXTRAPOLATE
(
x,
y,
minX,
minY,
maxX,
maxY
)
\
{
\
x
=
max
(
min
(
x,
maxX
-
1
)
,
minX
)
; \
y
=
max
(
min
(
y,
maxY
-
1
)
,
minY
)
; \
}
#
elif
defined
BORDER_WRAP
#
define
EXTRAPOLATE
(
x,
y,
minX,
minY,
maxX,
maxY
)
\
{
\
if
(
x
<
minX
)
\
x
-=
((
x
-
maxX
+
1
)
/
maxX
)
*
maxX
; \
if
(
x
>=
maxX
)
\
x
%=
maxX
; \
if
(
y
<
minY
)
\
y
-=
((
y
-
maxY
+
1
)
/
maxY
)
*
maxY
; \
if
(
y
>=
maxY
)
\
y
%=
maxY
; \
}
#
elif
defined
(
BORDER_REFLECT
)
|
| defined(BORDER_REFLECT_101)
#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
{ \
if (maxX - minX == 1) \
x = minX; \
else \
do \
{ \
if (x < minX) \
x = -(x - minX) - 1 + delta; \
else \
x = maxX - 1 - (x - maxX) - delta; \
} \
while (x >= maxX || x < minX); \
\
if (maxY - minY == 1) \
y = minY; \
else \
do \
{ \
if (y < minY) \
y = -(y - minY) - 1 + delta; \
else \
y = maxY - 1 - (y - maxY) - delta; \
} \
while (y >= maxY || y < minY); \
}
#ifdef BORDER_REFLECT
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
#elif defined(BORDER_REFLECT_101)
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
#endif
#else
#error No extrapolation method
#endif
#else
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
{ \
int _row = y - minY, _col = x - minX; \
_row = ADDR_H(_row, 0, maxY - minY); \
_row = ADDR_B(_row, maxY - minY, _row); \
y = _row + minY; \
\
_col = ADDR_L(_col, 0, maxX - minX); \
_col = ADDR_R(_col, maxX - minX, _col); \
x = _col + minX; \
}
#endif
#if USE_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define FPTYPE double
#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
#else
#define FPTYPE float
#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
#endif
#if DATA_DEPTH == 0
#define BASE_TYPE uchar
#elif DATA_DEPTH == 1
#define BASE_TYPE char
#elif DATA_DEPTH == 2
#define BASE_TYPE ushort
#elif DATA_DEPTH == 3
#define BASE_TYPE short
#elif DATA_DEPTH == 4
#define BASE_TYPE int
#elif DATA_DEPTH == 5
#define BASE_TYPE float
#elif DATA_DEPTH == 6
#define BASE_TYPE double
#else
#error data_depth
#endif
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define uchar1 uchar
#define char1 char
#define ushort1 ushort
#define short1 short
#define int1 int
#define float1 float
#define double1 double
#define convert_uchar1_sat_rte convert_uchar_sat_rte
#define convert_char1_sat_rte convert_char_sat_rte
#define convert_ushort1_sat_rte convert_ushort_sat_rte
#define convert_short1_sat_rte convert_short_sat_rte
#define convert_int1_sat_rte convert_int_sat_rte
#define convert_float1
#define convert_double1
#if DATA_DEPTH == 5 |
|
DATA_DEPTH
==
6
#
define
CONVERT_TO_TYPE
CAT
(
CAT
(
convert_,
BASE_TYPE
)
,
VEC_SIZE
)
#
else
#
define
CONVERT_TO_TYPE
CAT
(
CAT
(
CAT
(
convert_,
BASE_TYPE
)
,
VEC_SIZE
)
,
_sat_rte
)
#
endif
#
define
VEC_SIZE
DATA_CHAN
#
define
VEC_TYPE
CAT
(
BASE_TYPE,
VEC_SIZE
)
#
define
TYPE
VEC_TYPE
#
define
SCALAR_TYPE
CAT
(
FPTYPE,
VEC_SIZE
)
#
define
INTERMEDIATE_TYPE
CAT
(
FPTYPE,
VEC_SIZE
)
struct
RectCoords
{
int
x1,
y1,
x2,
y2
;
}
;
//#define
DEBUG
#
ifdef
DEBUG
#
define
DEBUG_ONLY
(
x
)
x
#
define
ASSERT
(
condition
)
do
{
if
(
!
(
condition
))
{
printf
(
"BUG in boxFilter kernel (global=%d,%d): "
#
condition
"\n"
,
get_global_id
(
0
)
,
get_global_id
(
1
))
; } } while (0)
#
else
#
define
DEBUG_ONLY
(
x
)
(
void
)
0
#
define
ASSERT
(
condition
)
(
void
)
0
#
endif
inline
INTERMEDIATE_TYPE
readSrcPixel
(
int2
pos,
__global
TYPE
*src,
const
unsigned
int
srcStepBytes,
const
struct
RectCoords
srcCoords
#
ifdef
BORDER_CONSTANT
,
SCALAR_TYPE
borderValue
#
endif
)
{
#
ifdef
BORDER_ISOLATED
if
(
pos.x
>=
srcCoords.x1
&&
pos.y
>=
srcCoords.y1
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
#
else
if
(
pos.x
>=
0
&&
pos.y
>=
0
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
#
endif
{
__global
TYPE*
ptr
=
(
__global
TYPE*
)((
__global
char*
)
src
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
srcStepBytes
)
;
return
CONVERT_TO_FPTYPE
(
*ptr
)
;
}
else
{
#
ifdef
BORDER_CONSTANT
return
borderValue
;
#
else
int
selected_col
=
pos.x
;
int
selected_row
=
pos.y
;
EXTRAPOLATE
(
selected_col,
selected_row,
#
ifdef
BORDER_ISOLATED
srcCoords.x1,
srcCoords.y1,
#
else
0
,
0
,
#
endif
srcCoords.x2,
srcCoords.y2
)
;
//
debug
border
mapping
//printf
(
"pos=%d,%d --> %d, %d\n"
,
pos.x,
pos.y,
selected_col,
selected_row
)
;
pos
=
(
int2
)(
selected_col,
selected_row
)
;
if
(
pos.x
>=
0
&&
pos.y
>=
0
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
{
__global
TYPE*
ptr
=
(
__global
TYPE*
)((
__global
char*
)
src
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
srcStepBytes
)
;
return
CONVERT_TO_FPTYPE
(
*ptr
)
;
}
else
{
//
for
debug
only
DEBUG_ONLY
(
printf
(
"BUG in boxFilter kernel\n"
))
;
return
(
FPTYPE
)(
0.0f
)
;
}
#
endif
}
}
//
INPUT
PARAMETER:
BLOCK_SIZE_Y
(
via
defines
)
__kernel
__attribute__
((
reqd_work_group_size
(
LOCAL_SIZE,
1
,
1
)))
void
filter2D
(
__global
TYPE
*src,
const
unsigned
int
srcStepBytes,
const
int4
srcRC,
__global
TYPE
*dst,
const
unsigned
int
dstStepBytes,
const
int4
dstRC,
#
ifdef
BORDER_CONSTANT
SCALAR_TYPE
borderValue,
#
endif
__constant
FPTYPE*
kernelData
//
transposed:
[KERNEL_SIZE_X][KERNEL_SIZE_Y2_ALIGNED]
)
{
const
struct
RectCoords
srcCoords
=
{srcRC.s0,
srcRC.s1,
srcRC.s2,
srcRC.s3}
; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
struct
RectCoords
dstCoords
=
{dstRC.s0,
dstRC.s1,
dstRC.s2,
dstRC.s3}
;
const
int
local_id
=
get_local_id
(
0
)
;
const
int
x
=
local_id
+
(
LOCAL_SIZE
-
(
KERNEL_SIZE_X
-
1
))
*
get_group_id
(
0
)
-
ANCHOR_X
;
const
int
y
=
get_global_id
(
1
)
*
BLOCK_SIZE_Y
;
INTERMEDIATE_TYPE
data[KERNEL_SIZE_Y]
;
__local
INTERMEDIATE_TYPE
sumOfCols[LOCAL_SIZE]
;
int2
srcPos
=
(
int2
)(
srcCoords.x1
+
x,
srcCoords.y1
+
y
-
ANCHOR_Y
)
;
int2
pos
=
(
int2
)(
dstCoords.x1
+
x,
dstCoords.y1
+
y
)
;
__global
TYPE*
dstPtr
=
(
__global
TYPE*
)((
__global
char*
)
dst
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
dstStepBytes
)
; // Pointer can be out of bounds!
bool
writeResult
=
(
local_id
>=
ANCHOR_X
&&
local_id
<
LOCAL_SIZE
-
(
KERNEL_SIZE_X
-
1
-
ANCHOR_X
)
&&
pos.x
>=
dstCoords.x1
&&
pos.x
<
dstCoords.x2
)
;
#
if
BLOCK_SIZE_Y
>
1
bool
readAllpixels
=
true
;
int
sy_index
=
0
; // current index in data[] array
dstCoords.y2
=
min
(
dstCoords.y2,
pos.y
+
BLOCK_SIZE_Y
)
;
for
(
;
pos.y
<
dstCoords.y2
;
pos.y++,
dstPtr
=
(
__global
TYPE*
)((
__global
char*
)
dstPtr
+
dstStepBytes
))
#
endif
{
ASSERT
(
pos.y
<
dstCoords.y2
)
;
for
(
#
if
BLOCK_SIZE_Y
>
1
int
sy
=
readAllpixels
?
0
:
-1
; sy < (readAllpixels ? KERNEL_SIZE_Y : 0);
#
else
int
sy
=
0
,
sy_index
=
0
; sy < KERNEL_SIZE_Y;
#
endif
sy++,
srcPos.y++
)
{
data[sy
+
sy_index]
=
readSrcPixel
(
srcPos,
src,
srcStepBytes,
srcCoords
#
ifdef
BORDER_CONSTANT
,
borderValue
#
endif
)
;
}
INTERMEDIATE_TYPE
total_sum
=
0
;
for
(
int
sx
=
0
; sx < KERNEL_SIZE_X; sx++)
{
{
__constant
FPTYPE*
k
=
&kernelData[KERNEL_SIZE_Y2_ALIGNED
*
sx
#
if
BLOCK_SIZE_Y
>
1
+
KERNEL_SIZE_Y
-
sy_index
#
endif
]
;
INTERMEDIATE_TYPE
tmp_sum
=
0
;
for
(
int
sy
=
0
; sy < KERNEL_SIZE_Y; sy++)
{
tmp_sum
+=
data[sy]
*
k[sy]
;
}
sumOfCols[local_id]
=
tmp_sum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
int
id
=
local_id
+
sx
-
ANCHOR_X
;
if
(
id
>=
0
&&
id
<
LOCAL_SIZE
)
total_sum
+=
sumOfCols[id]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
writeResult
)
{
ASSERT
(
pos.y
>=
dstCoords.y1
&&
pos.y
<
dstCoords.y2
)
;
*dstPtr
=
CONVERT_TO_TYPE
(
total_sum
)
;
}
#
if
BLOCK_SIZE_Y
>
1
readAllpixels
=
false
;
#
if
BLOCK_SIZE_Y
>
KERNEL_SIZE_Y
sy_index
=
(
sy_index
+
1
<=
KERNEL_SIZE_Y
)
?
sy_index
+
1
:
1
;
#
else
sy_index++
;
#
endif
#
endif
//
BLOCK_SIZE_Y
==
1
}
}
modules/ocl/src/opencl/filtering_laplacian.cl
deleted
100644 → 0
View file @
0bf9ece9
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//
IMPORTANT:
READ
BEFORE
DOWNLOADING,
COPYING,
INSTALLING
OR
USING.
//
//
By
downloading,
copying,
installing
or
using
the
software
you
agree
to
this
license.
//
If
you
do
not
agree
to
this
license,
do
not
download,
install,
//
copy
or
use
the
software.
//
//
//
License
Agreement
//
For
Open
Source
Computer
Vision
Library
//
//
Copyright
(
C
)
2010-2012,
Institute
Of
Software
Chinese
Academy
Of
Science,
all
rights
reserved.
//
Copyright
(
C
)
2010-2012,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
@Authors
//
Pang
Erping,
erping@multicorewareinc.com
//
Jia
Haipeng,
jiahaipeng95@gmail.com
//
Peng
Xiao,
pengxiao@outlook.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
//
*
Redistribution
's
of
source
code
must
retain
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer.
//
//
*
Redistribution
's
in
binary
form
must
reproduce
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer
in
the
documentation
//
and/or
other
materials
provided
with
the
distribution.
//
//
*
The
name
of
the
copyright
holders
may
not
be
used
to
endorse
or
promote
products
//
derived
from
this
software
without
specific
prior
written
permission.
//
//
This
software
is
provided
by
the
copyright
holders
and
contributors
as
is
and
//
any
express
or
implied
warranties,
including,
but
not
limited
to,
the
implied
//
warranties
of
merchantability
and
fitness
for
a
particular
purpose
are
disclaimed.
//
In
no
event
shall
the
Intel
Corporation
or
contributors
be
liable
for
any
direct,
//
indirect,
incidental,
special,
exemplary,
or
consequential
damages
//
(
including,
but
not
limited
to,
procurement
of
substitute
goods
or
services
;
//
loss
of
use,
data,
or
profits
; or business interruption) however caused
//
and
on
any
theory
of
liability,
whether
in
contract,
strict
liability,
//
or
tort
(
including
negligence
or
otherwise
)
arising
in
any
way
out
of
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro
for
border
type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#
ifdef
BORDER_REPLICATE
//BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
l_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
r_edge
)
-1
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
t_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
b_edge
)
-1
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
((
l_edge
)
<<1
)
-
(
i
)
-1
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-1+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
((
t_edge
)
<<1
)
-
(
i
)
-1
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-1+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT_101
//BORDER_REFLECT_101:
gfedcb|abcdefgh|gfedcba
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
((
l_edge
)
<<1
)
-
(
i
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-2+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
((
t_edge
)
<<1
)
-
(
i
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-2+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
IMG_C_1_0
#
define
T_IMG
uchar
#
define
T_IMGx4
uchar4
#
define
T_IMG_C1
uchar
#
define
CONVERT_TYPE
convert_uchar_sat
#
define
CONVERT_TYPEx4
convert_uchar4_sat
#
endif
#
ifdef
IMG_C_4_0
#
define
T_IMG
uchar4
#
define
T_IMGx4
uchar16
#
define
T_IMG_C1
uchar
#
define
CONVERT_TYPE
convert_uchar4_sat
#
define
CONVERT_TYPEx4
convert_uchar16_sat
#
endif
#
ifdef
IMG_C_1_5
#
define
T_IMG
float
#
define
T_IMGx4
float4
#
define
T_IMG_C1
float
#
define
CONVERT_TYPE
convert_float
#
define
CONVERT_TYPEx4
convert_float4
#
endif
#
ifdef
IMG_C_4_5
#
define
T_IMG
float4
#
define
T_IMGx4
float16
#
define
T_IMG_C1
float
#
define
CONVERT_TYPE
convert_float4
#
define
CONVERT_TYPEx4
convert_float16
#
endif
#
ifndef
CN
#
define
CN
1
#
endif
#
if
CN
==
1
#
define
T_SUM
float
#
define
T_SUMx4
float4
#
define
CONVERT_TYPE_SUM
convert_float
#
define
CONVERT_TYPE_SUMx4
convert_float4
#
define
SUM_ZERO
(
0.0f
)
#
define
SUM_ZEROx4
(
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
VLOAD4
vload4
#
define
SX
x
#
define
SY
y
#
define
SZ
z
#
define
SW
w
#
elif
CN
==
4
#
define
T_SUM
float4
#
define
T_SUMx4
float16
#
define
CONVERT_TYPE_SUM
convert_float4
#
define
CONVERT_TYPE_SUMx4
convert_float16
#
define
SUM_ZERO
(
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
SUM_ZEROx4
(
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
VLOAD4
vload16
#
define
SX
s0123
#
define
SY
s4567
#
define
SZ
s89ab
#
define
SW
scdef
#
endif
#
ifndef
FILTER_SIZE
#
define
FILTER_SIZE
3
#
endif
#
define
LOCAL_GROUP_SIZE
16
#
define
LOCAL_WIDTH
((
FILTER_SIZE/2
)
*2
+
LOCAL_GROUP_SIZE
)
#
define
LOCAL_HEIGHT
((
FILTER_SIZE/2
)
*2
+
LOCAL_GROUP_SIZE
)
#
define
FILTER_RADIUS
(
FILTER_SIZE
>>
1
)
__kernel
void
filter2D
(
__global
T_IMG
*src,
__global
T_IMG
*dst,
int
src_step,
int
dst_step,
__constant
float
*mat_kernel,
__local
T_IMG
*local_data,
int
wholerows,
int
wholecols,
int
src_offset_x,
int
src_offset_y,
int
dst_offset_x,
int
dst_offset_y,
int
cols,
int
rows,
int
operate_cols
)
{
int
groupStartCol
=
get_group_id
(
0
)
*
get_local_size
(
0
)
;
int
groupStartRow
=
get_group_id
(
1
)
*
get_local_size
(
1
)
;
int
localCol
=
get_local_id
(
0
)
;
int
localRow
=
get_local_id
(
1
)
;
int
globalCol
=
groupStartCol
+
localCol
;
int
globalRow
=
groupStartRow
+
localRow
;
const
int
src_offset
=
mad24
(
src_offset_y,
src_step,
src_offset_x
)
;
const
int
dst_offset
=
mad24
(
dst_offset_y,
dst_step,
dst_offset_x
)
;
#
ifdef
BORDER_CONSTANT
for
(
int
i
=
localRow
; i < LOCAL_HEIGHT; i += get_local_size(1))
{
int
curRow
=
groupStartRow
+
i
;
for
(
int
j
=
localCol
; j < LOCAL_WIDTH; j += get_local_size(0))
{
int
curCol
=
groupStartCol
+
j
;
if
(
curRow
<
FILTER_RADIUS
-
src_offset_y
|
| (curRow - FILTER_RADIUS) >= wholerows - src_offset_y||
curCol < FILTER_RADIUS - src_offset_x |
|
(
curCol
-
FILTER_RADIUS
)
>=
wholecols
-
src_offset_x
)
{
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
0
;
}
else
{
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
src[
(
curRow
-
FILTER_RADIUS
)
*
src_step
+
curCol
-
FILTER_RADIUS
+
src_offset]
;
}
}
}
#
else
for
(
int
i
=
localRow
; i < LOCAL_HEIGHT; i += get_local_size(1))
{
int
curRow
=
groupStartRow
+
i
;
curRow
=
ADDR_H
(
curRow,
FILTER_RADIUS
-
src_offset_y,
wholerows
-
src_offset_y
)
;
curRow
=
ADDR_B
(
curRow
-
FILTER_RADIUS,
wholerows
-
src_offset_y,
curRow
-
FILTER_RADIUS
)
;
for
(
int
j
=
localCol
; j < LOCAL_WIDTH; j += get_local_size(0))
{
int
curCol
=
groupStartCol
+
j
;
curCol
=
ADDR_L
(
curCol,
FILTER_RADIUS
-
src_offset_x,
wholecols
-
src_offset_x
)
;
curCol
=
ADDR_R
(
curCol
-
FILTER_RADIUS,
wholecols
-
src_offset_x,
curCol
-
FILTER_RADIUS
)
;
if
(
curRow
<
wholerows
&&
curCol
<
wholecols
)
{
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
src[
(
curRow
)
*
src_step
+
curCol
+
src_offset]
;
}
}
}
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
globalRow
<
rows
&&
globalCol
<
cols
)
{
T_SUM
sum
=
(
T_SUM
)(
SUM_ZERO
)
;
int
filterIdx
=
0
;
for
(
int
i
=
0
; i < FILTER_SIZE; i++)
{
int
offset
=
(
i
+
localRow
)
*
LOCAL_WIDTH
;
for
(
int
j
=
0
; j < FILTER_SIZE; j++)
{
sum
+=
CONVERT_TYPE_SUM
(
local_data[offset
+
j
+
localCol]
)
*
mat_kernel[filterIdx++]
;
}
}
dst[
(
globalRow
)
*dst_step
+
(
globalCol
)
+
dst_offset]
=
CONVERT_TYPE
(
sum
)
;
}
}
///
following
is
specific
for
3x3
kernels
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro
for
define
elements
number
per
thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#
define
ANX
1
#
define
ANY
1
#
define
ROWS_PER_GROUP
4
#
define
ROWS_PER_GROUP_BITS
2
#
define
ROWS_FETCH
(
ROWS_PER_GROUP
+
ANY
+
ANY
)
//
(
ROWS_PER_GROUP
+
anY
*
2
)
#
define
THREADS_PER_ROW
64
#
define
THREADS_PER_ROW_BIT
6
#
define
ELEMENTS_PER_THREAD
4
#
define
ELEMENTS_PER_THREAD_BIT
2
#
define
LOCAL_MEM_STEP
260
//divup
((
get_local_size
(
0
)
+
anX
*
2
)
,
4
)
*
4
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
filter2D_3x3
(
__global
T_IMG
*src,
__global
T_IMG
*dst,
int
src_step,
int
dst_step,
__constant
float
*mat_kernel,
__local
T_IMG
*local_data,
int
wholerows,
int
wholecols,
int
src_offset_x,
int
src_offset_y,
int
dst_offset_x,
int
dst_offset_y,
int
cols,
int
rows,
int
operate_cols
)
{
int
gX
=
get_global_id
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
int
lX
=
get_local_id
(
0
)
;
int
groupX_size
=
get_local_size
(
0
)
;
int
groupX_id
=
get_group_id
(
0
)
;
#
define
dst_align
(
dst_offset_x
&
3
)
int
cols_start_index_group
=
src_offset_x
-
dst_align
+
groupX_size
*
groupX_id
-
ANX
;
int
rows_start_index
=
src_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
-
ANY
;
if
((
gY
<<
2
)
<
rows
)
{
for
(
int
i
=
0
; i < ROWS_FETCH; ++i)
{
if
((
rows_start_index
-
src_offset_y
)
+
i
<
rows
+
ANY
)
{
#
ifdef
BORDER_CONSTANT
int
selected_row
=
rows_start_index
+
i
;
int
selected_cols
=
cols_start_index_group
+
lX
;
T_IMG
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
int
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
(
T_IMG
)(
0
)
;
local_data[mad24
(
i,
LOCAL_MEM_STEP,
lX
)
]
=
data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
(
T_IMG
)(
0
)
;
local_data[mad24
(
i,
LOCAL_MEM_STEP,
lX
)
+
groupX_size]
=
data
;
}
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
selected_row
=
ADDR_B
(
rows_start_index
+
i,
wholerows,
selected_row
)
;
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
T_IMG
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
local_data[mad24
(
i,
LOCAL_MEM_STEP,
lX
)
]
=
data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
local_data[mad24
(
i,
LOCAL_MEM_STEP,
lX
)
+
groupX_size]
=
data
;
}
#
endif
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
process_col
=
groupX_size
*
groupX_id
+
((
lX
%
THREADS_PER_ROW
)
<<
2
)
;
if
(((
gY
<<
2
)
<
rows
)
&&
(
process_col
<
operate_cols
))
{
int
dst_cols_start
=
dst_offset_x
;
int
dst_cols_end
=
dst_offset_x
+
cols
;
int
dst_cols_index
=
(
dst_offset_x
+
process_col
)
&
0xfffffffc
;
int
dst_rows_end
=
dst_offset_y
+
rows
;
int
dst_rows_index
=
dst_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
+
(
lX
>>
THREADS_PER_ROW_BIT
)
;
dst
=
dst
+
mad24
(
dst_rows_index,
dst_step,
dst_cols_index
)
;
T_IMGx4
dst_data
=
*
(
__global
T_IMGx4
*
)
dst
;
T_SUMx4
sum
=
(
T_SUMx4
)
SUM_ZEROx4
;
T_IMGx4
data
;
for
(
int
i
=
0
; i < FILTER_SIZE; i++)
{
#
pragma
unroll
for
(
int
j
=
0
; j < FILTER_SIZE; j++)
{
if
(
dst_rows_index
<
dst_rows_end
)
{
int
local_row
=
(
lX
>>
THREADS_PER_ROW_BIT
)
+
i
;
int
local_cols
=
((
lX
%
THREADS_PER_ROW
)
<<
ELEMENTS_PER_THREAD_BIT
)
+
j
;
data
=
VLOAD4
(
0
,
(
__local
T_IMG_C1
*
)(
local_data
+
local_row
*
LOCAL_MEM_STEP
+
local_cols
))
;
sum
=
sum
+
(
mat_kernel[i
*
FILTER_SIZE
+
j]
*
CONVERT_TYPE_SUMx4
(
data
))
;
}
}
}
if
(
dst_rows_index
<
dst_rows_end
)
{
T_IMGx4
tmp_dst
=
CONVERT_TYPEx4
(
sum
)
;
tmp_dst.SX
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
tmp_dst.SX
:
dst_data.SX
;
tmp_dst.SY
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
tmp_dst.SY
:
dst_data.SY
;
tmp_dst.SZ
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
tmp_dst.SZ
:
dst_data.SZ
;
tmp_dst.SW
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
tmp_dst.SW
:
dst_data.SW
;
*
(
__global
T_IMGx4
*
)
dst
=
tmp_dst
;
}
}
}
modules/ocl/test/test_filters.cpp
View file @
0f95f0d8
...
...
@@ -160,8 +160,8 @@ OCL_TEST_P(LaplacianTest, Accuracy)
{
random_roi
();
Laplacian
(
src_roi
,
dst_roi
,
-
1
,
ksize
,
scale
);
// TODO FIXIT
, 0, borderType);
ocl
::
Laplacian
(
gsrc_roi
,
gdst_roi
,
-
1
,
ksize
,
scale
);
// TODO FIXIT
, 0, borderType);
Laplacian
(
src_roi
,
dst_roi
,
-
1
,
ksize
,
scale
,
0
,
borderType
);
ocl
::
Laplacian
(
gsrc_roi
,
gdst_roi
,
-
1
,
ksize
,
scale
,
0
,
borderType
);
Near
();
}
...
...
@@ -298,7 +298,7 @@ OCL_TEST_P(Filter2D, Mat)
kernel
*=
1.0
/
(
double
)(
ksize
*
ksize
);
cv
::
filter2D
(
src_roi
,
dst_roi
,
-
1
,
kernel
,
anchor
,
0.0
,
borderType
);
ocl
::
filter2D
(
gsrc_roi
,
gdst_roi
,
-
1
,
kernel
,
anchor
,
/* TODO FIXIT 0.0,*/
borderType
);
ocl
::
filter2D
(
gsrc_roi
,
gdst_roi
,
-
1
,
kernel
,
anchor
,
0.0
,
borderType
);
Near
();
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment