Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
6ae9870d
Commit
6ae9870d
authored
Aug 14, 2013
by
Roman Donchenko
Committed by
OpenCV Buildbot
Aug 14, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1254 from pengx17:2.4_filter2d_
parents
bcba3fc6
124ede61
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
295 additions
and
388 deletions
+295
-388
ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+1
-1
filtering.cpp
modules/ocl/src/filtering.cpp
+63
-36
filtering_laplacian.cl
modules/ocl/src/opencl/filtering_laplacian.cl
+195
-350
test_filters.cpp
modules/ocl/test/test_filters.cpp
+36
-1
No files found.
modules/ocl/include/opencv2/ocl/ocl.hpp
View file @
6ae9870d
...
@@ -691,7 +691,7 @@ namespace cv
...
@@ -691,7 +691,7 @@ namespace cv
//! returns 2D filter with the specified kernel
//! returns 2D filter with the specified kernel
// supports CV_8UC1 and CV_8UC4 types
// supports CV_8UC1 and CV_8UC4 types
CV_EXPORTS
Ptr
<
BaseFilter_GPU
>
getLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
CV_EXPORTS
Ptr
<
BaseFilter_GPU
>
getLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
//! returns the non-separable linear filter engine
//! returns the non-separable linear filter engine
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
...
...
modules/ocl/src/filtering.cpp
View file @
6ae9870d
...
@@ -572,7 +572,7 @@ void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &ke
...
@@ -572,7 +572,7 @@ void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &ke
namespace
namespace
{
{
typedef
void
(
*
GPUFilter2D_t
)(
const
oclMat
&
,
oclMat
&
,
oclMat
&
,
Size
&
,
const
Point
,
const
int
);
typedef
void
(
*
GPUFilter2D_t
)(
const
oclMat
&
,
oclMat
&
,
const
oclMat
&
,
const
Size
&
,
const
Point
&
,
const
int
);
class
LinearFilter_GPU
:
public
BaseFilter_GPU
class
LinearFilter_GPU
:
public
BaseFilter_GPU
{
{
...
@@ -591,21 +591,22 @@ public:
...
@@ -591,21 +591,22 @@ public:
};
};
}
}
static
void
GPUFilter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
oclMat
&
mat_kernel
,
static
void
GPUFilter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
const
oclMat
&
mat_kernel
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
const
Size
&
ksize
,
const
Point
&
anchor
,
const
int
borderType
)
{
{
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
(
src
.
rows
==
dst
.
rows
));
CV_Assert
((
src
.
oclchannels
()
==
dst
.
oclchannels
()));
CV_Assert
((
src
.
oclchannels
()
==
dst
.
oclchannels
()));
CV_Assert
((
borderType
!=
0
));
CV_Assert
(
ksize
.
height
>
0
&&
ksize
.
width
>
0
&&
((
ksize
.
height
&
1
)
==
1
)
&&
((
ksize
.
width
&
1
)
==
1
));
CV_Assert
(
ksize
.
height
>
0
&&
ksize
.
width
>
0
&&
((
ksize
.
height
&
1
)
==
1
)
&&
((
ksize
.
width
&
1
)
==
1
));
CV_Assert
((
anchor
.
x
==
-
1
&&
anchor
.
y
==
-
1
)
||
(
anchor
.
x
==
ksize
.
width
>>
1
&&
anchor
.
y
==
ksize
.
height
>>
1
));
CV_Assert
((
anchor
.
x
==
-
1
&&
anchor
.
y
==
-
1
)
||
(
anchor
.
x
==
ksize
.
width
>>
1
&&
anchor
.
y
==
ksize
.
height
>>
1
));
CV_Assert
(
ksize
.
width
==
ksize
.
height
);
Context
*
clCxt
=
src
.
clCxt
;
Context
*
clCxt
=
src
.
clCxt
;
int
cn
=
src
.
oclchannels
();
int
depth
=
src
.
depth
();
string
kernelName
=
"filter2D"
;
int
filterWidth
=
ksize
.
width
;
bool
ksize_3x3
=
filterWidth
==
3
&&
src
.
type
()
!=
CV_32FC4
;
// CV_32FC4 is not tuned up with filter2d_3x3 kernel
string
kernelName
=
ksize_3x3
?
"filter2D_3x3"
:
"filter2D"
;
size_t
src_offset_x
=
(
src
.
offset
%
src
.
step
)
/
src
.
elemSize
();
size_t
src_offset_x
=
(
src
.
offset
%
src
.
step
)
/
src
.
elemSize
();
size_t
src_offset_y
=
src
.
offset
/
src
.
step
;
size_t
src_offset_y
=
src
.
offset
/
src
.
step
;
...
@@ -613,55 +614,81 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
...
@@ -613,55 +614,81 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
size_t
dst_offset_x
=
(
dst
.
offset
%
dst
.
step
)
/
dst
.
elemSize
();
size_t
dst_offset_x
=
(
dst
.
offset
%
dst
.
step
)
/
dst
.
elemSize
();
size_t
dst_offset_y
=
dst
.
offset
/
dst
.
step
;
size_t
dst_offset_y
=
dst
.
offset
/
dst
.
step
;
int
vector_lengths
[
4
][
7
]
=
{{
4
,
4
,
4
,
4
,
4
,
4
,
4
},
int
paddingPixels
=
filterWidth
&
(
-
2
);
{
4
,
4
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
},
size_t
localThreads
[
3
]
=
{
ksize_3x3
?
256
:
16
,
ksize_3x3
?
1
:
16
,
1
};
{
4
,
4
,
4
,
4
,
1
,
1
,
4
}
size_t
globalThreads
[
3
]
=
{
src
.
wholecols
,
src
.
wholerows
,
1
};
};
int
vector_length
=
vector_lengths
[
cn
-
1
][
depth
];
int
cn
=
src
.
oclchannels
();
int
offset_cols
=
(
dst_offset_x
)
&
(
vector_length
-
1
);
int
src_step
=
(
int
)(
src
.
step
/
src
.
elemSize
());
int
cols
=
dst
.
cols
+
offset_cols
;
int
dst_step
=
(
int
)(
dst
.
step
/
src
.
elemSize
());
int
rows
=
divUp
(
dst
.
rows
,
vector_length
);
int
localWidth
=
localThreads
[
0
]
+
paddingPixels
;
int
localHeight
=
localThreads
[
1
]
+
paddingPixels
;
size_t
localThreads
[
3
]
=
{
256
,
1
,
1
};
// 260 = divup((localThreads[0] + filterWidth * 2), 4) * 4
size_t
globalThreads
[
3
]
=
{
divUp
(
cols
,
localThreads
[
0
])
*
localThreads
[
0
],
// 6 = (ROWS_PER_GROUP_WHICH_IS_4 + filterWidth * 2)
divUp
(
rows
,
localThreads
[
1
])
*
localThreads
[
1
],
1
size_t
localMemSize
=
ksize_3x3
?
260
*
6
*
src
.
elemSize
()
:
(
localWidth
*
localHeight
)
*
src
.
elemSize
();
};
int
vector_lengths
[
4
][
7
]
=
{{
4
,
4
,
4
,
4
,
4
,
4
,
4
},
{
4
,
4
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
4
,
4
,
4
,
4
,
1
,
1
,
4
}
};
int
cols
=
dst
.
cols
+
((
dst_offset_x
)
&
(
vector_lengths
[
cn
-
1
][
src
.
depth
()]
-
1
));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
mat_kernel
.
data
));
args
.
push_back
(
make_pair
(
localMemSize
,
(
void
*
)
NULL
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_offset_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_offset_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_offset_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_offset_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_offset_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_offset_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_offset_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_offset_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
mat_kernel
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
char
btype
[
30
];
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
switch
(
borderType
)
{
const
int
buffer_size
=
100
;
case
0
:
char
opt_buffer
[
buffer_size
]
=
""
;
sprintf
(
btype
,
"BORDER_CONSTANT"
);
sprintf
(
opt_buffer
,
"-DANCHOR=%d -DANX=%d -DANY=%d"
,
ksize
.
width
,
anchor
.
x
,
anchor
.
y
);
break
;
case
1
:
openCLExecuteKernel
(
clCxt
,
&
filtering_laplacian
,
kernelName
,
globalThreads
,
localThreads
,
args
,
cn
,
depth
,
opt_buffer
);
sprintf
(
btype
,
"BORDER_REPLICATE"
);
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
);
break
;
case
3
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
);
break
;
}
int
type
=
src
.
depth
();
char
build_options
[
150
];
sprintf
(
build_options
,
"-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d"
,
btype
,
cn
,
type
,
cn
,
ksize
.
width
);
openCLExecuteKernel
(
clCxt
,
&
filtering_laplacian
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
}
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
Point
anchor
,
int
borderType
)
const
Point
&
anchor
,
int
borderType
)
{
{
static
const
GPUFilter2D_t
GPUFilter2D_callers
[]
=
{
0
,
GPUFilter2D
,
0
,
GPUFilter2D
,
GPUFilter2D
};
static
const
GPUFilter2D_t
GPUFilter2D_callers
[]
=
{
0
,
GPUFilter2D
,
0
,
GPUFilter2D
,
GPUFilter2D
};
CV_Assert
((
srcType
==
CV_8UC1
||
srcType
==
CV_8UC3
||
srcType
==
CV_8UC4
||
srcType
==
CV_32FC1
||
srcType
==
CV_32FC3
||
srcType
==
CV_32FC4
)
&&
dstType
==
srcType
);
CV_Assert
((
srcType
==
CV_8UC1
||
srcType
==
CV_8UC3
||
srcType
==
CV_8UC4
||
srcType
==
CV_32FC1
||
srcType
==
CV_32FC3
||
srcType
==
CV_32FC4
)
&&
dstType
==
srcType
);
oclMat
gpu_krnl
;
oclMat
gpu_krnl
;
int
nDivis
or
;
Point
norm_archor
=
anch
or
;
normalizeKernel
(
kernel
,
gpu_krnl
,
CV_32
S
,
&
nDivisor
,
false
);
normalizeKernel
(
kernel
,
gpu_krnl
,
CV_32
FC1
);
normalizeAnchor
(
an
chor
,
ksize
);
normalizeAnchor
(
norm_ar
chor
,
ksize
);
return
Ptr
<
BaseFilter_GPU
>
(
new
LinearFilter_GPU
(
ksize
,
anchor
,
gpu_krnl
,
GPUFilter2D_callers
[
CV_MAT_CN
(
srcType
)],
return
Ptr
<
BaseFilter_GPU
>
(
new
LinearFilter_GPU
(
ksize
,
anchor
,
gpu_krnl
,
GPUFilter2D_callers
[
CV_MAT_CN
(
srcType
)],
borderType
));
borderType
));
...
...
modules/ocl/src/opencl/filtering_laplacian.cl
View file @
6ae9870d
...
@@ -15,7 +15,9 @@
...
@@ -15,7 +15,9 @@
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
//
@Authors
//
@Authors
//
Pang
Erping,
erping@multicorewareinc.com
//
Jia
Haipeng,
jiahaipeng95@gmail.com
//
Jia
Haipeng,
jiahaipeng95@gmail.com
//
Peng
Xiao,
pengxiao@outlook.com
//
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
are
permitted
provided
that
the
following
conditions
are
met:
...
@@ -42,292 +44,228 @@
...
@@ -42,292 +44,228 @@
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//
//M*/
//M*/
#
define
BORDER_REFLECT_101
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro
for
border
type////////////////////////////////////////////
/////////////////////////////////Macro
for
border
type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#
ifdef
BORDER_REPLICATE
#
ifdef
BORDER_REPLICATE
//BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
//BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
l_edge
)
:
(
i
))
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
l_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
r_edge
)
-1
:
(
addr
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
r_edge
)
-1
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
t_edge
)
:
(
i
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
t_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
b_edge
)
-1
:
(
addr
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
b_edge
)
-1
:
(
addr
))
#
endif
#
endif
#
ifdef
BORDER_REFLECT
#
ifdef
BORDER_REFLECT
//BORDER_REFLECT:
fedcba|abcdefgh|hgfedcb
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
((
l_edge
)
<<1
)
-
(
i
)
-1
:
(
i
))
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-1+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-1+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
((
t_edge
)
<<1
)
-
(
i
)
-1
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-1+
((
b_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-1+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
endif
#
ifdef
BORDER_REFLECT_101
#
ifdef
BORDER_REFLECT_101
//BORDER_REFLECT_101:
gfedcb|abcdefgh|gfedcba
//BORDER_REFLECT_101:
gfedcb|abcdefgh|gfedcba
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
((
l_edge
)
<<1
)
-
(
i
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-2+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-2+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
((
t_edge
)
<<1
)
-
(
i
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-2+
((
b_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-2+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
endif
#
ifdef
BORDER_WRAP
#
ifdef
IMG_C_1_0
//BORDER_WRAP:
cdefgh|abcdefgh|abcdefg
#
define
T_IMG
uchar
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
i
)
+
(
r_edge
)
:
(
i
))
#
define
T_IMGx4
uchar4
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
i
)
-
(
r_edge
)
:
(
addr
))
#
define
T_IMG_C1
uchar
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
i
)
+
(
b_edge
)
:
(
i
))
#
define
CONVERT_TYPE
convert_uchar_sat
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
define
CONVERT_TYPEx4
convert_uchar4_sat
#
endif
#
ifdef
IMG_C_4_0
#
define
T_IMG
uchar4
#
define
T_IMGx4
uchar16
#
define
T_IMG_C1
uchar
#
define
CONVERT_TYPE
convert_uchar4_sat
#
define
CONVERT_TYPEx4
convert_uchar16_sat
#
endif
#
ifdef
IMG_C_1_5
#
define
T_IMG
float
#
define
T_IMGx4
float4
#
define
T_IMG_C1
float
#
define
CONVERT_TYPE
convert_float
#
define
CONVERT_TYPEx4
convert_float4
#
endif
#
ifdef
IMG_C_4_5
#
define
T_IMG
float4
#
define
T_IMGx4
float16
#
define
T_IMG_C1
float
#
define
CONVERT_TYPE
convert_float4
#
define
CONVERT_TYPEx4
convert_float16
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
#
ifndef
CN
/////////////////////////////Macro
for
define
elements
number
per
thread/////////////////////////////
#
define
CN
1
////////////////////////////////////////////////////////////////////////////////////////////////////
#
endif
//#define
ANCHOR
3
//#define
ANX
1
//#define
ANY
1
#
define
ROWS_PER_GROUP
4
#
define
ROWS_PER_GROUP_BITS
2
#
define
ROWS_FETCH
(
ROWS_PER_GROUP
+
ANY
+
ANY
)
//
(
ROWS_PER_GROUP
+
anY
*
2
)
#
define
THREADS_PER_ROW
64
#
define
THREADS_PER_ROW_BIT
6
#
define
ELEMENTS_PER_THREAD
4
#
if
CN
==
1
#
define
ELEMENTS_PER_THREAD_BIT
2
#
define
T_SUM
float
#
define
T_SUMx4
float4
#
define
CONVERT_TYPE_SUM
convert_float
#
define
CONVERT_TYPE_SUMx4
convert_float4
#
define
SUM_ZERO
(
0.0f
)
#
define
SUM_ZEROx4
(
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
VLOAD4
vload4
#
define
SX
x
#
define
SY
y
#
define
SZ
z
#
define
SW
w
#
elif
CN
==
4
#
define
T_SUM
float4
#
define
T_SUMx4
float16
#
define
CONVERT_TYPE_SUM
convert_float4
#
define
CONVERT_TYPE_SUMx4
convert_float16
#
define
SUM_ZERO
(
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
SUM_ZEROx4
(
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
VLOAD4
vload16
#
define
SX
s0123
#
define
SY
s4567
#
define
SZ
s89ab
#
define
SW
scdef
#
endif
#
define
LOCAL_MEM_STEP
260
//divup
((
get_local_size
(
0
)
+
anX
*
2
)
,
4
)
*
4
#
ifndef
FILTER_SIZE
#
define
FILTER_SIZE
3
#
endif
///////////////////////////////////////////////////////////////////////////////////////////////////
#
define
LOCAL_GROUP_SIZE
16
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#
define
LOCAL_WIDTH
((
FILTER_SIZE/2
)
*2
+
LOCAL_GROUP_SIZE
)
__kernel
void
filter2D_C1_D0
(
__global
uchar
*src,
int
src_step,
int
src_offset_x,
int
src_offset_y,
#
define
LOCAL_HEIGHT
((
FILTER_SIZE/2
)
*2
+
LOCAL_GROUP_SIZE
)
__global
uchar
*dst,
int
dst_step,
int
dst_offset_x,
int
dst_offset_y,
__constant
int
*mat_kernel
__attribute__
((
max_constant_size
(
16384
)))
,
#
define
FILTER_RADIUS
(
FILTER_SIZE
>>
1
)
int
cols,int
rows,
int
operate_cols,
int
wholecols,
int
wholerows
)
__kernel
void
filter2D
(
__global
T_IMG
*src,
__global
T_IMG
*dst,
int
src_step,
int
dst_step,
__constant
float
*mat_kernel,
__local
T_IMG
*local_data,
int
wholerows,
int
wholecols,
int
src_offset_x,
int
src_offset_y,
int
dst_offset_x,
int
dst_offset_y,
int
cols,
int
rows,
int
operate_cols
)
{
{
int
gX
=
get_global_id
(
0
)
;
int
groupStartCol
=
get_group_id
(
0
)
*
get_local_size
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
int
groupStartRow
=
get_group_id
(
1
)
*
get_local_size
(
1
)
;
int
lX
=
get_local_id
(
0
)
;
int
localCol
=
get_local_id
(
0
)
;
int
localRow
=
get_local_id
(
1
)
;
int
groupX_size
=
get_local_size
(
0
)
;
int
globalCol
=
groupStartCol
+
localCol
;
int
groupX_id
=
get_group_id
(
0
)
;
int
globalRow
=
groupStartRow
+
localRow
;
const
int
src_offset
=
mad24
(
src_offset_y,
src_step,
src_offset_x
)
;
#
define
dst_align
(
dst_offset_x
&
3
)
const
int
dst_offset
=
mad24
(
dst_offset_y,
dst_step,
dst_offset_x
)
;
int
cols_start_index_group
=
src_offset_x
-
dst_align
+
groupX_size
*
groupX_id
-
ANX
;
#
ifdef
BORDER_CONSTANT
int
rows_start_index
=
src_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
-
ANY
;
for
(
int
i
=
localRow
; i < LOCAL_HEIGHT; i += get_local_size(1))
__local
uchar
local_data[LOCAL_MEM_STEP
*
ROWS_FETCH]
;
if
((
gY
<<
2
)
<
rows
)
{
{
for
(
int
i
=
0
; i < ROWS_FETCH; ++i)
int
curRow
=
groupStartRow
+
i
;
for
(
int
j
=
localCol
; j < LOCAL_WIDTH; j += get_local_size(0))
{
{
if
((
rows_start_index
-
src_offset_y
)
+
i
<
rows
+
ANY
)
int
curCol
=
groupStartCol
+
j
;
if
(
curRow
<
FILTER_RADIUS
-
src_offset_y
|
| (curRow - FILTER_RADIUS) >= wholerows - src_offset_y||
curCol < FILTER_RADIUS - src_offset_x |
|
(
curCol
-
FILTER_RADIUS
)
>=
wholecols
-
src_offset_x
)
{
{
#
ifdef
BORDER_CONSTANT
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
0
;
int
selected_row
=
rows_start_index
+
i
;
}
int
selected_cols
=
cols_start_index_group
+
lX
;
else
{
uchar
data
=
*
(
src
+
selected_row
*
src_step
+
selected_cols
)
;
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
src[
(
curRow
-
FILTER_RADIUS
)
*
src_step
+
curCol
-
FILTER_RADIUS
+
src_offset]
;
int
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
data
=
*
(
src
+
selected_row
*
src_step
+
selected_cols
)
;
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
selected_row
=
ADDR_B
(
rows_start_index
+
i,
wholerows,
selected_row
)
;
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
uchar
data
=
*
(
src
+
selected_row
*
src_step
+
selected_cols
)
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
*
(
src
+
selected_row
*
src_step
+
selected_cols
)
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
endif
}
}
}
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
else
for
(
int
i
=
localRow
; i < LOCAL_HEIGHT; i += get_local_size(1))
int
process_col
=
groupX_size
*
groupX_id
+
((
lX
%
THREADS_PER_ROW
)
<<
2
)
;
if
(((
gY
<<
2
)
<
rows
)
&&
(
process_col
<
operate_cols
))
{
{
int
dst_cols_start
=
dst_offset_x
;
int
curRow
=
groupStartRow
+
i
;
int
dst_cols_end
=
dst_offset_x
+
cols
;
int
dst_cols_index
=
(
dst_offset_x
+
process_col
)
&
0xfffffffc
;
int
dst_rows_end
=
dst_offset_y
+
rows
;
int
dst_rows_index
=
dst_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
+
(
lX
>>
THREADS_PER_ROW_BIT
)
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_rows_index
*
dst_step
+
dst_cols_index
)
)
;
curRow
=
ADDR_H
(
curRow,
FILTER_RADIUS
-
src_offset_y,
wholerows
-
src_offset_y
)
;
int4
sum
=
(
int4
)(
0
)
;
curRow
=
ADDR_B
(
curRow
-
FILTER_RADIUS,
wholerows
-
src_offset_y,
curRow
-
FILTER_RADIUS
)
;
uchar4
data
;
for
(
int
i
=
0
; i < ANCHOR; i++
)
for
(
int
j
=
localCol
; j < LOCAL_WIDTH; j += get_local_size(0)
)
{
{
#
pragma
unroll
int
curCol
=
groupStartCol
+
j
;
for
(
int
j
=
0
; j < ANCHOR; j++)
curCol
=
ADDR_L
(
curCol,
FILTER_RADIUS
-
src_offset_x,
wholecols
-
src_offset_x
)
;
curCol
=
ADDR_R
(
curCol
-
FILTER_RADIUS,
wholecols
-
src_offset_x,
curCol
-
FILTER_RADIUS
)
;
if
(
curRow
<
wholerows
&&
curCol
<
wholecols
)
{
{
if
(
dst_rows_index
<
dst_rows_end
)
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
src[
(
curRow
)
*
src_step
+
curCol
+
src_offset]
;
{
int
local_row
=
(
lX
>>
THREADS_PER_ROW_BIT
)
+
i
;
int
local_cols
=
((
lX
%
THREADS_PER_ROW
)
<<
ELEMENTS_PER_THREAD_BIT
)
+
j
;
data
=
vload4
(
0
,
local_data+local_row
*
LOCAL_MEM_STEP
+
local_cols
)
;
sum
=
sum
+
(
mat_kernel[i
*
ANCHOR
+
j]
*
convert_int4_sat
(
data
))
;
}
}
}
}
}
if
(
dst_rows_index
<
dst_rows_end
)
{
sum.x
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
sum.x
:
dst_data.x
;
sum.y
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
sum.y
:
dst_data.y
;
sum.z
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
sum.z
:
dst_data.z
;
sum.w
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
sum.w
:
dst_data.w
;
*
((
__global
uchar4
*
)(
dst
+
dst_rows_index
*
dst_step
+
dst_cols_index
))
=
convert_uchar4_sat
(
sum
)
;
}
}
}
}
#
endif
///////////////////////////////////////////////////////////////////////////////////////////////////
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
if
(
globalRow
<
rows
&&
globalCol
<
cols
)
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
filter2D_C1_D5
(
__global
float
*src,
int
src_step,
int
src_offset_x,
int
src_offset_y,
__global
float
*dst,
int
dst_step,
int
dst_offset_x,
int
dst_offset_y,
__constant
int
*mat_kernel
__attribute__
((
max_constant_size
(
16384
)))
,
int
cols,int
rows,
int
operate_cols,
int
wholecols,
int
wholerows
)
{
int
gX
=
get_global_id
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
int
lX
=
get_local_id
(
0
)
;
int
groupX_size
=
get_local_size
(
0
)
;
int
groupX_id
=
get_group_id
(
0
)
;
#
define
dst_align
(
dst_offset_x
&
3
)
int
cols_start_index_group
=
src_offset_x
-
dst_align
+
groupX_size
*
groupX_id
-
ANX
;
int
rows_start_index
=
src_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
-
ANY
;
__local
float
local_data[LOCAL_MEM_STEP
*
ROWS_FETCH]
;
if
(((
gY
<<
2
)
<
rows
))
{
{
for
(
int
i
=
0
; i < ROWS_FETCH; ++i)
T_SUM
sum
=
(
T_SUM
)
SUM_ZERO
;
int
filterIdx
=
0
;
for
(
int
i
=
0
; i < FILTER_SIZE; i++)
{
{
if
((
rows_start_index
-
src_offset_y
)
+
i
<
rows
+
ANY
)
int
offset
=
(
i
+
localRow
)
*
LOCAL_WIDTH
;
{
#
ifdef
BORDER_CONSTANT
int
selected_row
=
rows_start_index
+
i
;
int
selected_cols
=
cols_start_index_group
+
lX
;
float
data
=
*
((
__global
float
*
)((
__global
char
*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
int
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
]
=data
;
if
(
lX
<
(
ANX
<<
1
))
for
(
int
j
=
0
; j < FILTER_SIZE; j++)
{
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
sum
+=
CONVERT_TYPE_SUM
(
local_data[offset
+
j
+
localCol]
)
*
mat_kernel[filterIdx++]
;
data
=
*
((
__global
float
*
)((
__global
char
*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
selected_row
=
ADDR_B
(
rows_start_index
+
i,
wholerows,
selected_row
)
;
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
float
data
=
*
((
__global
float
*
)((
__global
char
*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
local_data[i
*
LOCAL_MEM_STEP
+
lX]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
*
((
__global
float
*
)((
__global
char
*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
endif
}
}
}
}
dst[
(
globalRow
)
*dst_step
+
(
globalCol
)
+
dst_offset]
=
CONVERT_TYPE
(
sum
)
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
int
process_col
=
groupX_size
*
groupX_id
+
((
lX
%
THREADS_PER_ROW
)
<<
2
)
;
if
(((
gY
<<
2
)
<
rows
)
&&
(
process_col
<
operate_cols
))
{
int
dst_cols_start
=
dst_offset_x
;
int
dst_cols_end
=
dst_offset_x
+
cols
;
int
dst_cols_index
=
(
dst_offset_x
+
process_col
)
&
0xfffffffc
;
int
dst_rows_end
=
dst_offset_y
+
rows
;
int
dst_rows_index
=
dst_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
+
(
lX
>>
THREADS_PER_ROW_BIT
)
;
float4
dst_data
=
*
((
__global
float4*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
2
)))
;
///
following
is
specific
for
3x3
kernels
float4
sum
=
(
float4
)(
0
)
;
//////////////////////////////////////////////////////////////////////////////////////////////////////
float4
data
;
/////////////////////////////Macro
for
define
elements
number
per
thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#
define
ANX
1
#
define
ANY
1
for
(
int
i
=
0
; i < ANCHOR; i++)
#
define
ROWS_PER_GROUP
4
{
#
define
ROWS_PER_GROUP_BITS
2
#
pragma
unroll
#
define
ROWS_FETCH
(
ROWS_PER_GROUP
+
ANY
+
ANY
)
//
(
ROWS_PER_GROUP
+
anY
*
2
)
for
(
int
j
=
0
; j < ANCHOR; j++)
{
if
(
dst_rows_index
<
dst_rows_end
)
{
int
local_row
=
(
lX
>>
THREADS_PER_ROW_BIT
)
+
i
;
int
local_cols
=
((
lX
%
THREADS_PER_ROW
)
<<
ELEMENTS_PER_THREAD_BIT
)
+
j
;
data
=
vload4
(
0
,
local_data+local_row
*
LOCAL_MEM_STEP
+
local_cols
)
;
#
define
THREADS_PER_ROW
64
sum
=
sum
+
((
float
)(
mat_kernel[i
*
ANCHOR
+
j]
)
*
data
)
;
#
define
THREADS_PER_ROW_BIT
6
}
}
}
if
(
dst_rows_index
<
dst_rows_end
)
#
define
ELEMENTS_PER_THREAD
4
{
#
define
ELEMENTS_PER_THREAD_BIT
2
sum.x
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
sum.x
:
dst_data.x
;
sum.y
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
sum.y
:
dst_data.y
;
sum.z
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
sum.z
:
dst_data.z
;
sum.w
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
sum.w
:
dst_data.w
;
*
((
__global
float4
*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
2
)))
=
sum
;
#
define
LOCAL_MEM_STEP
260
//divup
((
get_local_size
(
0
)
+
anX
*
2
)
,
4
)
*
4
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC
4
////////////////////////////////////////////////////////
/////////////////////////////////////////8uC
1
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
filter2D_C4_D0
(
__global
uchar4
*src,
int
src_step,
int
src_offset_x,
int
src_offset_y,
__kernel
void
filter2D_3x3
(
__global
uchar4
*dst,
int
dst_step,
int
dst_offset_x,
int
dst_offset_y,
__global
T_IMG
*src,
__constant
int
*mat_kernel
__attribute__
((
max_constant_size
(
16384
)))
,
__global
T_IMG
*dst,
int
cols,int
rows,
int
operate_cols,
int
wholecols,
int
wholerows
)
int
src_step,
int
dst_step,
__constant
float
*mat_kernel,
__local
T_IMG
*local_data,
int
wholerows,
int
wholecols,
int
src_offset_x,
int
src_offset_y,
int
dst_offset_x,
int
dst_offset_y,
int
cols,
int
rows,
int
operate_cols
)
{
{
int
gX
=
get_global_id
(
0
)
;
int
gX
=
get_global_id
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
int
gY
=
get_global_id
(
1
)
;
...
@@ -341,9 +279,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
...
@@ -341,9 +279,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int
cols_start_index_group
=
src_offset_x
-
dst_align
+
groupX_size
*
groupX_id
-
ANX
;
int
cols_start_index_group
=
src_offset_x
-
dst_align
+
groupX_size
*
groupX_id
-
ANX
;
int
rows_start_index
=
src_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
-
ANY
;
int
rows_start_index
=
src_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
-
ANY
;
__local
uchar4
local_data[LOCAL_MEM_STEP
*
ROWS_FETCH]
;
if
((
gY
<<
2
)
<
rows
)
if
(((
gY
<<
2
)
<
rows
))
{
{
for
(
int
i
=
0
; i < ROWS_FETCH; ++i)
for
(
int
i
=
0
; i < ROWS_FETCH; ++i)
{
{
...
@@ -353,19 +289,19 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
...
@@ -353,19 +289,19 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int
selected_row
=
rows_start_index
+
i
;
int
selected_row
=
rows_start_index
+
i
;
int
selected_cols
=
cols_start_index_group
+
lX
;
int
selected_cols
=
cols_start_index_group
+
lX
;
uchar4
data
=
*
((
__global
uchar4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
T_IMG
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
int
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
int
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
data
=
con
?
data
:
0
;
local_data[
i
*
LOCAL_MEM_STEP
+
lX
]
=
data
;
local_data[
mad24
(
i,
LOCAL_MEM_STEP,
lX
)
]
=
data
;
if
(
lX
<
(
ANX
<<
1
))
if
(
lX
<
(
ANX
<<
1
))
{
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
data
=
*
((
__global
uchar4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
data
=
con
?
data
:
0
;
local_data[
i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=
data
;
local_data[
mad24
(
i,
LOCAL_MEM_STEP,
lX
)
+
groupX_size]
=
data
;
}
}
#
else
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
...
@@ -374,17 +310,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
...
@@ -374,17 +310,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
uchar4
data
=
*
((
__global
uchar4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
T_IMG
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
local_data[
i
*
LOCAL_MEM_STEP
+
lX]
=
data
;
local_data[
mad24
(
i,
LOCAL_MEM_STEP,
lX
)
]
=
data
;
if
(
lX
<
(
ANX
<<
1
))
if
(
lX
<
(
ANX
<<
1
))
{
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
*
((
__global
uchar4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
local_data[
i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=
data
;
local_data[
mad24
(
i,
LOCAL_MEM_STEP,
lX
)
+
groupX_size]
=
data
;
}
}
#
endif
#
endif
}
}
...
@@ -401,131 +337,40 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
...
@@ -401,131 +337,40 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int
dst_rows_end
=
dst_offset_y
+
rows
;
int
dst_rows_end
=
dst_offset_y
+
rows
;
int
dst_rows_index
=
dst_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
+
(
lX
>>
THREADS_PER_ROW_BIT
)
;
int
dst_rows_index
=
dst_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
+
(
lX
>>
THREADS_PER_ROW_BIT
)
;
dst
=
dst
+
mad24
(
dst_rows_index,
dst_step,
dst_cols_index
)
;
uchar16
dst_data
;
T_IMGx4
dst_data
=
*
(
__global
T_IMGx4
*
)
dst
;
dst_data
=
*
((
__global
uchar16*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
2
)))
;
int16
sum
=
(
int16
)(
0
)
;
T_SUMx4
sum
=
(
T_SUMx4
)
SUM_ZEROx4
;
uchar16
data
;
T_IMGx4
data
;
for
(
int
i
=
0
; i <
ANCHOR
; i++)
for
(
int
i
=
0
; i <
FILTER_SIZE
; i++)
{
{
#
pragma
unroll
#
pragma
unroll
for
(
int
j
=
0
; j <
ANCHOR
; j++)
for
(
int
j
=
0
; j <
FILTER_SIZE
; j++)
{
{
if
(
dst_rows_index
<
dst_rows_end
)
if
(
dst_rows_index
<
dst_rows_end
)
{
{
int
local_row
=
(
lX
>>
THREADS_PER_ROW_BIT
)
+
i
;
int
local_row
=
(
lX
>>
THREADS_PER_ROW_BIT
)
+
i
;
int
local_cols
=
((
lX
%
THREADS_PER_ROW
)
<<
ELEMENTS_PER_THREAD_BIT
)
+
j
;
int
local_cols
=
((
lX
%
THREADS_PER_ROW
)
<<
ELEMENTS_PER_THREAD_BIT
)
+
j
;
data
=
vload16
(
0
,
(
__local
uchar
*
)(
local_data+
local_row
*
LOCAL_MEM_STEP
+
local_cols
))
;
data
=
VLOAD4
(
0
,
(
__local
T_IMG_C1
*
)(
local_data
+
local_row
*
LOCAL_MEM_STEP
+
local_cols
))
;
sum
=
sum
+
(
mat_kernel[i
*
ANCHOR
+
j]
*
convert_int16_sat
(
data
))
;
sum
=
sum
+
(
mat_kernel[i
*
FILTER_SIZE
+
j]
*
CONVERT_TYPE_SUMx4
(
data
))
;
}
}
}
}
}
}
if
(
dst_rows_index
<
dst_rows_end
)
if
(
dst_rows_index
<
dst_rows_end
)
{
{
uchar16
sum1
=
convert_uchar16_sat
(
sum
)
;
T_IMGx4
tmp_dst
=
CONVERT_TYPEx4
(
sum
)
;
sum1.s0123
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
tmp_dst.SX
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
sum1.s0123
:
dst_data.s0123
;
tmp_dst.SX
:
dst_data.SX
;
sum1.s4567
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
tmp_dst.SY
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
sum1.s4567
:
dst_data.s4567
;
tmp_dst.SY
:
dst_data.SY
;
sum1.s89ab
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
tmp_dst.SZ
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
sum1.s89ab
:
dst_data.s89ab
;
tmp_dst.SZ
:
dst_data.SZ
;
sum1.scdef
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
tmp_dst.SW
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
sum1.scdef
:
dst_data.scdef
;
tmp_dst.SW
:
dst_data.SW
;
*
(
__global
T_IMGx4
*
)
dst
=
tmp_dst
;
*
((
__global
uchar16*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
2
)))
=
sum1
;
}
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#
define
ROWS_FETCH_C4
(
1
+
ANY
+
ANY
)
//
(
ROWS_PER_GROUP
+
anY
*
2
)
#
define
LOCAL_MEM_STEP_C4
260
//divup
((
get_local_size
(
0
)
+
anX
*
2
)
,
4
)
*
4
)
__kernel
void
filter2D_C4_D5
(
__global
float4
*src,
int
src_step,
int
src_offset_x,
int
src_offset_y,
__global
float4
*dst,
int
dst_step,
int
dst_offset_x,
int
dst_offset_y,
__constant
int
*mat_kernel
__attribute__
((
max_constant_size
(
16384
)))
,
int
cols,int
rows,
int
operate_cols,
int
wholecols,
int
wholerows
)
{
int
gX
=
get_global_id
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
int
lX
=
get_local_id
(
0
)
;
int
groupX_size
=
get_local_size
(
0
)
;
int
groupX_id
=
get_group_id
(
0
)
;
int
cols_start_index_group
=
src_offset_x
+
groupX_size
*
groupX_id
-
ANX
;
int
rows_start_index
=
src_offset_y
+
gY
-
ANY
;
__local
float4
local_data[LOCAL_MEM_STEP_C4
*
ROWS_FETCH_C4]
;
if
((
gY
<
rows
)
&&
(
gX
<
(
operate_cols
+
ANX
+
ANX
)))
{
for
(
int
i
=
0
; i < ROWS_FETCH_C4; ++i)
{
if
((
rows_start_index
-
src_offset_y
)
+
i
<
rows
+
ANY
)
{
#
ifdef
BORDER_CONSTANT
int
selected_row
=
rows_start_index
+
i
;
int
selected_cols
=
cols_start_index_group
+
lX
;
float4
data
=
*
((
__global
float4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
4
)))
;
int
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
data
=
*
((
__global
float4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
4
)))
;
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
selected_row
=
ADDR_B
(
rows_start_index
+
i,
wholerows,
selected_row
)
;
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
float4
data
=
*
((
__global
float4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
4
)))
;
local_data[i
*
LOCAL_MEM_STEP_C4
+
lX]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
*
((
__global
float4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
4
)))
;
local_data[i
*
LOCAL_MEM_STEP_C4
+
lX
+
groupX_size]
=data
;
}
#
endif
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
((
gY
<
rows
)
&&
(
gX
<
operate_cols
))
{
int
dst_cols_index
=
dst_offset_x
+
gX
;
int
dst_rows_index
=
dst_offset_y
+
gY
;
float4
sum
=
(
float4
)(
0
)
;
for
(
int
i
=
0
; i < ANCHOR; i++)
{
for
(
int
j
=
0
; j < ANCHOR; j++)
{
int
local_cols
=
lX
+
j
;
sum
=
sum
+
((
float
)
mat_kernel[i
*
ANCHOR
+
j]
*
local_data[i
*
LOCAL_MEM_STEP_C4
+
local_cols]
)
;
}
}
*
((
__global
float4*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
4
)))
=
sum
;
}
}
modules/ocl/test/test_filters.cpp
View file @
6ae9870d
...
@@ -324,6 +324,35 @@ TEST_P(GaussianBlur, Mat)
...
@@ -324,6 +324,35 @@ TEST_P(GaussianBlur, Mat)
////////////////////////////////////////////////////////////////////////////////////////////////////
// Filter2D
struct
Filter2D
:
FilterTestBase
{
int
type
;
cv
::
Size
ksize
;
int
bordertype
;
Point
anchor
;
virtual
void
SetUp
()
{
type
=
GET_PARAM
(
0
);
ksize
=
GET_PARAM
(
1
);
bordertype
=
GET_PARAM
(
3
);
Init
(
type
);
anchor
=
Point
(
-
1
,
-
1
);
}
};
TEST_P
(
Filter2D
,
Mat
)
{
cv
::
Mat
kernel
=
randomMat
(
cv
::
Size
(
ksize
.
width
,
ksize
.
height
),
CV_32FC1
,
0.0
,
1.0
);
for
(
int
j
=
0
;
j
<
LOOP_TIMES
;
j
++
)
{
random_roi
();
cv
::
filter2D
(
mat1_roi
,
dst_roi
,
-
1
,
kernel
,
anchor
,
0.0
,
bordertype
);
cv
::
ocl
::
filter2D
(
gmat1
,
gdst
,
-
1
,
kernel
,
anchor
,
bordertype
);
Near
(
1
);
}
}
INSTANTIATE_TEST_CASE_P
(
Filter
,
Blur
,
Combine
(
INSTANTIATE_TEST_CASE_P
(
Filter
,
Blur
,
Combine
(
Values
(
CV_8UC1
,
CV_8UC3
,
CV_8UC4
,
CV_32FC1
,
CV_32FC4
),
Values
(
CV_8UC1
,
CV_8UC3
,
CV_8UC4
,
CV_32FC1
,
CV_32FC4
),
Values
(
cv
::
Size
(
3
,
3
),
cv
::
Size
(
5
,
5
),
cv
::
Size
(
7
,
7
)),
Values
(
cv
::
Size
(
3
,
3
),
cv
::
Size
(
5
,
5
),
cv
::
Size
(
7
,
7
)),
...
@@ -331,7 +360,7 @@ INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
...
@@ -331,7 +360,7 @@ INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
Values
((
MatType
)
cv
::
BORDER_CONSTANT
,
(
MatType
)
cv
::
BORDER_REPLICATE
,
(
MatType
)
cv
::
BORDER_REFLECT
,
(
MatType
)
cv
::
BORDER_REFLECT_101
)));
Values
((
MatType
)
cv
::
BORDER_CONSTANT
,
(
MatType
)
cv
::
BORDER_REPLICATE
,
(
MatType
)
cv
::
BORDER_REFLECT
,
(
MatType
)
cv
::
BORDER_REFLECT_101
)));
INSTANTIATE_TEST_CASE_P
(
Filter
s
,
Laplacian
,
Combine
(
INSTANTIATE_TEST_CASE_P
(
Filter
,
Laplacian
,
Combine
(
Values
(
CV_8UC1
,
CV_8UC3
,
CV_8UC4
,
CV_32FC1
,
CV_32FC3
,
CV_32FC4
),
Values
(
CV_8UC1
,
CV_8UC3
,
CV_8UC4
,
CV_32FC1
,
CV_32FC3
,
CV_32FC4
),
Values
(
Size
(
3
,
3
)),
Values
(
Size
(
3
,
3
)),
Values
(
Size
(
0
,
0
)),
//not use
Values
(
Size
(
0
,
0
)),
//not use
...
@@ -365,4 +394,10 @@ INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
...
@@ -365,4 +394,10 @@ INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
INSTANTIATE_TEST_CASE_P
(
Filter
,
Filter2D
,
testing
::
Combine
(
Values
(
CV_8UC1
,
CV_32FC1
,
CV_32FC4
),
Values
(
Size
(
3
,
3
),
Size
(
15
,
15
),
Size
(
25
,
25
)),
Values
(
Size
(
0
,
0
)),
//not use
Values
((
MatType
)
cv
::
BORDER_CONSTANT
,
(
MatType
)
cv
::
BORDER_REFLECT101
,
(
MatType
)
cv
::
BORDER_REPLICATE
,
(
MatType
)
cv
::
BORDER_REFLECT
)));
#endif // HAVE_OPENCL
#endif // HAVE_OPENCL
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment