Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
6ae9870d
Commit
6ae9870d
authored
Aug 14, 2013
by
Roman Donchenko
Committed by
OpenCV Buildbot
Aug 14, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1254 from pengx17:2.4_filter2d_
parents
bcba3fc6
124ede61
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
295 additions
and
388 deletions
+295
-388
ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+1
-1
filtering.cpp
modules/ocl/src/filtering.cpp
+63
-36
filtering_laplacian.cl
modules/ocl/src/opencl/filtering_laplacian.cl
+195
-350
test_filters.cpp
modules/ocl/test/test_filters.cpp
+36
-1
No files found.
modules/ocl/include/opencv2/ocl/ocl.hpp
View file @
6ae9870d
...
...
@@ -691,7 +691,7 @@ namespace cv
//! returns 2D filter with the specified kernel
// supports CV_8UC1 and CV_8UC4 types
CV_EXPORTS
Ptr
<
BaseFilter_GPU
>
getLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
//! returns the non-separable linear filter engine
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
...
...
modules/ocl/src/filtering.cpp
View file @
6ae9870d
...
...
@@ -572,7 +572,7 @@ void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &ke
namespace
{
typedef
void
(
*
GPUFilter2D_t
)(
const
oclMat
&
,
oclMat
&
,
oclMat
&
,
Size
&
,
const
Point
,
const
int
);
typedef
void
(
*
GPUFilter2D_t
)(
const
oclMat
&
,
oclMat
&
,
const
oclMat
&
,
const
Size
&
,
const
Point
&
,
const
int
);
class
LinearFilter_GPU
:
public
BaseFilter_GPU
{
...
...
@@ -591,21 +591,22 @@ public:
};
}
static
void
GPUFilter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
oclMat
&
mat_kernel
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
static
void
GPUFilter2D
(
const
oclMat
&
src
,
oclMat
&
dst
,
const
oclMat
&
mat_kernel
,
const
Size
&
ksize
,
const
Point
&
anchor
,
const
int
borderType
)
{
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
CV_Assert
((
src
.
oclchannels
()
==
dst
.
oclchannels
()));
CV_Assert
((
borderType
!=
0
));
CV_Assert
(
ksize
.
height
>
0
&&
ksize
.
width
>
0
&&
((
ksize
.
height
&
1
)
==
1
)
&&
((
ksize
.
width
&
1
)
==
1
));
CV_Assert
((
anchor
.
x
==
-
1
&&
anchor
.
y
==
-
1
)
||
(
anchor
.
x
==
ksize
.
width
>>
1
&&
anchor
.
y
==
ksize
.
height
>>
1
));
CV_Assert
(
ksize
.
width
==
ksize
.
height
);
Context
*
clCxt
=
src
.
clCxt
;
int
cn
=
src
.
oclchannels
();
int
depth
=
src
.
depth
();
string
kernelName
=
"filter2D"
;
int
filterWidth
=
ksize
.
width
;
bool
ksize_3x3
=
filterWidth
==
3
&&
src
.
type
()
!=
CV_32FC4
;
// CV_32FC4 is not tuned up with filter2d_3x3 kernel
string
kernelName
=
ksize_3x3
?
"filter2D_3x3"
:
"filter2D"
;
size_t
src_offset_x
=
(
src
.
offset
%
src
.
step
)
/
src
.
elemSize
();
size_t
src_offset_y
=
src
.
offset
/
src
.
step
;
...
...
@@ -613,55 +614,81 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
size_t
dst_offset_x
=
(
dst
.
offset
%
dst
.
step
)
/
dst
.
elemSize
();
size_t
dst_offset_y
=
dst
.
offset
/
dst
.
step
;
int
vector_lengths
[
4
][
7
]
=
{{
4
,
4
,
4
,
4
,
4
,
4
,
4
},
{
4
,
4
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
4
,
4
,
4
,
4
,
1
,
1
,
4
}
};
int
paddingPixels
=
filterWidth
&
(
-
2
);
size_t
localThreads
[
3
]
=
{
ksize_3x3
?
256
:
16
,
ksize_3x3
?
1
:
16
,
1
};
size_t
globalThreads
[
3
]
=
{
src
.
wholecols
,
src
.
wholerows
,
1
};
int
vector_length
=
vector_lengths
[
cn
-
1
][
depth
];
int
offset_cols
=
(
dst_offset_x
)
&
(
vector_length
-
1
);
int
cols
=
dst
.
cols
+
offset_cols
;
int
rows
=
divUp
(
dst
.
rows
,
vector_length
);
int
cn
=
src
.
oclchannels
();
int
src_step
=
(
int
)(
src
.
step
/
src
.
elemSize
());
int
dst_step
=
(
int
)(
dst
.
step
/
src
.
elemSize
());
int
localWidth
=
localThreads
[
0
]
+
paddingPixels
;
int
localHeight
=
localThreads
[
1
]
+
paddingPixels
;
size_t
localThreads
[
3
]
=
{
256
,
1
,
1
};
size_t
globalThreads
[
3
]
=
{
divUp
(
cols
,
localThreads
[
0
])
*
localThreads
[
0
],
divUp
(
rows
,
localThreads
[
1
])
*
localThreads
[
1
],
1
};
// 260 = divup((localThreads[0] + filterWidth * 2), 4) * 4
// 6 = (ROWS_PER_GROUP_WHICH_IS_4 + filterWidth * 2)
size_t
localMemSize
=
ksize_3x3
?
260
*
6
*
src
.
elemSize
()
:
(
localWidth
*
localHeight
)
*
src
.
elemSize
();
int
vector_lengths
[
4
][
7
]
=
{{
4
,
4
,
4
,
4
,
4
,
4
,
4
},
{
4
,
4
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
4
,
4
,
4
,
4
,
1
,
1
,
4
}
};
int
cols
=
dst
.
cols
+
((
dst_offset_x
)
&
(
vector_lengths
[
cn
-
1
][
src
.
depth
()]
-
1
));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
mat_kernel
.
data
));
args
.
push_back
(
make_pair
(
localMemSize
,
(
void
*
)
NULL
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_offset_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_offset_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_offset_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_offset_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
mat_kernel
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
const
int
buffer_size
=
100
;
char
opt_buffer
[
buffer_size
]
=
""
;
sprintf
(
opt_buffer
,
"-DANCHOR=%d -DANX=%d -DANY=%d"
,
ksize
.
width
,
anchor
.
x
,
anchor
.
y
);
openCLExecuteKernel
(
clCxt
,
&
filtering_laplacian
,
kernelName
,
globalThreads
,
localThreads
,
args
,
cn
,
depth
,
opt_buffer
);
char
btype
[
30
];
switch
(
borderType
)
{
case
0
:
sprintf
(
btype
,
"BORDER_CONSTANT"
);
break
;
case
1
:
sprintf
(
btype
,
"BORDER_REPLICATE"
);
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
);
break
;
case
3
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
);
break
;
}
int
type
=
src
.
depth
();
char
build_options
[
150
];
sprintf
(
build_options
,
"-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d"
,
btype
,
cn
,
type
,
cn
,
ksize
.
width
);
openCLExecuteKernel
(
clCxt
,
&
filtering_laplacian
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
kernel
,
const
Size
&
ksize
,
Point
anchor
,
int
borderType
)
const
Point
&
anchor
,
int
borderType
)
{
static
const
GPUFilter2D_t
GPUFilter2D_callers
[]
=
{
0
,
GPUFilter2D
,
0
,
GPUFilter2D
,
GPUFilter2D
};
CV_Assert
((
srcType
==
CV_8UC1
||
srcType
==
CV_8UC3
||
srcType
==
CV_8UC4
||
srcType
==
CV_32FC1
||
srcType
==
CV_32FC3
||
srcType
==
CV_32FC4
)
&&
dstType
==
srcType
);
oclMat
gpu_krnl
;
int
nDivis
or
;
normalizeKernel
(
kernel
,
gpu_krnl
,
CV_32
S
,
&
nDivisor
,
false
);
normalizeAnchor
(
an
chor
,
ksize
);
Point
norm_archor
=
anch
or
;
normalizeKernel
(
kernel
,
gpu_krnl
,
CV_32
FC1
);
normalizeAnchor
(
norm_ar
chor
,
ksize
);
return
Ptr
<
BaseFilter_GPU
>
(
new
LinearFilter_GPU
(
ksize
,
anchor
,
gpu_krnl
,
GPUFilter2D_callers
[
CV_MAT_CN
(
srcType
)],
borderType
));
...
...
modules/ocl/src/opencl/filtering_laplacian.cl
View file @
6ae9870d
...
...
@@ -15,7 +15,9 @@
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
@Authors
//
Pang
Erping,
erping@multicorewareinc.com
//
Jia
Haipeng,
jiahaipeng95@gmail.com
//
Peng
Xiao,
pengxiao@outlook.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
...
...
@@ -42,292 +44,228 @@
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
define
BORDER_REFLECT_101
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro
for
border
type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#
ifdef
BORDER_REPLICATE
//BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
l_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
r_edge
)
-1
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
t_edge
)
:
(
i
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
t_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
b_edge
)
-1
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT
//BORDER_REFLECT:
fedcba|abcdefgh|hgfedcb
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
((
l_edge
)
<<1
)
-
(
i
)
-1
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-1+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
((
t_edge
)
<<1
)
-
(
i
)
-1
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-1+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT_101
//BORDER_REFLECT_101:
gfedcb|abcdefgh|gfedcba
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
((
l_edge
)
<<1
)
-
(
i
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-2+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
((
t_edge
)
<<1
)
-
(
i
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-2+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
BORDER_WRAP
//BORDER_WRAP:
cdefgh|abcdefgh|abcdefg
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
i
)
+
(
r_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
i
)
-
(
r_edge
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
i
)
+
(
b_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
ifdef
IMG_C_1_0
#
define
T_IMG
uchar
#
define
T_IMGx4
uchar4
#
define
T_IMG_C1
uchar
#
define
CONVERT_TYPE
convert_uchar_sat
#
define
CONVERT_TYPEx4
convert_uchar4_sat
#
endif
#
ifdef
IMG_C_4_0
#
define
T_IMG
uchar4
#
define
T_IMGx4
uchar16
#
define
T_IMG_C1
uchar
#
define
CONVERT_TYPE
convert_uchar4_sat
#
define
CONVERT_TYPEx4
convert_uchar16_sat
#
endif
#
ifdef
IMG_C_1_5
#
define
T_IMG
float
#
define
T_IMGx4
float4
#
define
T_IMG_C1
float
#
define
CONVERT_TYPE
convert_float
#
define
CONVERT_TYPEx4
convert_float4
#
endif
#
ifdef
IMG_C_4_5
#
define
T_IMG
float4
#
define
T_IMGx4
float16
#
define
T_IMG_C1
float
#
define
CONVERT_TYPE
convert_float4
#
define
CONVERT_TYPEx4
convert_float16
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro
for
define
elements
number
per
thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//#define
ANCHOR
3
//#define
ANX
1
//#define
ANY
1
#
define
ROWS_PER_GROUP
4
#
define
ROWS_PER_GROUP_BITS
2
#
define
ROWS_FETCH
(
ROWS_PER_GROUP
+
ANY
+
ANY
)
//
(
ROWS_PER_GROUP
+
anY
*
2
)
#
define
THREADS_PER_ROW
64
#
define
THREADS_PER_ROW_BIT
6
#
ifndef
CN
#
define
CN
1
#
endif
#
define
ELEMENTS_PER_THREAD
4
#
define
ELEMENTS_PER_THREAD_BIT
2
#
if
CN
==
1
#
define
T_SUM
float
#
define
T_SUMx4
float4
#
define
CONVERT_TYPE_SUM
convert_float
#
define
CONVERT_TYPE_SUMx4
convert_float4
#
define
SUM_ZERO
(
0.0f
)
#
define
SUM_ZEROx4
(
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
VLOAD4
vload4
#
define
SX
x
#
define
SY
y
#
define
SZ
z
#
define
SW
w
#
elif
CN
==
4
#
define
T_SUM
float4
#
define
T_SUMx4
float16
#
define
CONVERT_TYPE_SUM
convert_float4
#
define
CONVERT_TYPE_SUMx4
convert_float16
#
define
SUM_ZERO
(
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
SUM_ZEROx4
(
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f,
0.0f
)
#
define
VLOAD4
vload16
#
define
SX
s0123
#
define
SY
s4567
#
define
SZ
s89ab
#
define
SW
scdef
#
endif
#
define
LOCAL_MEM_STEP
260
//divup
((
get_local_size
(
0
)
+
anX
*
2
)
,
4
)
*
4
#
ifndef
FILTER_SIZE
#
define
FILTER_SIZE
3
#
endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
filter2D_C1_D0
(
__global
uchar
*src,
int
src_step,
int
src_offset_x,
int
src_offset_y,
__global
uchar
*dst,
int
dst_step,
int
dst_offset_x,
int
dst_offset_y,
__constant
int
*mat_kernel
__attribute__
((
max_constant_size
(
16384
)))
,
int
cols,int
rows,
int
operate_cols,
int
wholecols,
int
wholerows
)
#
define
LOCAL_GROUP_SIZE
16
#
define
LOCAL_WIDTH
((
FILTER_SIZE/2
)
*2
+
LOCAL_GROUP_SIZE
)
#
define
LOCAL_HEIGHT
((
FILTER_SIZE/2
)
*2
+
LOCAL_GROUP_SIZE
)
#
define
FILTER_RADIUS
(
FILTER_SIZE
>>
1
)
__kernel
void
filter2D
(
__global
T_IMG
*src,
__global
T_IMG
*dst,
int
src_step,
int
dst_step,
__constant
float
*mat_kernel,
__local
T_IMG
*local_data,
int
wholerows,
int
wholecols,
int
src_offset_x,
int
src_offset_y,
int
dst_offset_x,
int
dst_offset_y,
int
cols,
int
rows,
int
operate_cols
)
{
int
gX
=
get_global_id
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
int
lX
=
get_local_id
(
0
)
;
int
groupX_size
=
get_local_size
(
0
)
;
int
groupX_id
=
get_group_id
(
0
)
;
#
define
dst_align
(
dst_offset_x
&
3
)
int
cols_start_index_group
=
src_offset_x
-
dst_align
+
groupX_size
*
groupX_id
-
ANX
;
int
rows_start_index
=
src_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
-
ANY
;
__local
uchar
local_data[LOCAL_MEM_STEP
*
ROWS_FETCH]
;
if
((
gY
<<
2
)
<
rows
)
int
groupStartCol
=
get_group_id
(
0
)
*
get_local_size
(
0
)
;
int
groupStartRow
=
get_group_id
(
1
)
*
get_local_size
(
1
)
;
int
localCol
=
get_local_id
(
0
)
;
int
localRow
=
get_local_id
(
1
)
;
int
globalCol
=
groupStartCol
+
localCol
;
int
globalRow
=
groupStartRow
+
localRow
;
const
int
src_offset
=
mad24
(
src_offset_y,
src_step,
src_offset_x
)
;
const
int
dst_offset
=
mad24
(
dst_offset_y,
dst_step,
dst_offset_x
)
;
#
ifdef
BORDER_CONSTANT
for
(
int
i
=
localRow
; i < LOCAL_HEIGHT; i += get_local_size(1))
{
for
(
int
i
=
0
; i < ROWS_FETCH; ++i)
int
curRow
=
groupStartRow
+
i
;
for
(
int
j
=
localCol
; j < LOCAL_WIDTH; j += get_local_size(0))
{
if
((
rows_start_index
-
src_offset_y
)
+
i
<
rows
+
ANY
)
int
curCol
=
groupStartCol
+
j
;
if
(
curRow
<
FILTER_RADIUS
-
src_offset_y
|
| (curRow - FILTER_RADIUS) >= wholerows - src_offset_y||
curCol < FILTER_RADIUS - src_offset_x |
|
(
curCol
-
FILTER_RADIUS
)
>=
wholecols
-
src_offset_x
)
{
#
ifdef
BORDER_CONSTANT
int
selected_row
=
rows_start_index
+
i
;
int
selected_cols
=
cols_start_index_group
+
lX
;
uchar
data
=
*
(
src
+
selected_row
*
src_step
+
selected_cols
)
;
int
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
data
=
*
(
src
+
selected_row
*
src_step
+
selected_cols
)
;
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
selected_row
=
ADDR_B
(
rows_start_index
+
i,
wholerows,
selected_row
)
;
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
uchar
data
=
*
(
src
+
selected_row
*
src_step
+
selected_cols
)
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
*
(
src
+
selected_row
*
src_step
+
selected_cols
)
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
endif
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
0
;
}
else
{
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
src[
(
curRow
-
FILTER_RADIUS
)
*
src_step
+
curCol
-
FILTER_RADIUS
+
src_offset]
;
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
process_col
=
groupX_size
*
groupX_id
+
((
lX
%
THREADS_PER_ROW
)
<<
2
)
;
if
(((
gY
<<
2
)
<
rows
)
&&
(
process_col
<
operate_cols
))
#
else
for
(
int
i
=
localRow
; i < LOCAL_HEIGHT; i += get_local_size(1))
{
int
dst_cols_start
=
dst_offset_x
;
int
dst_cols_end
=
dst_offset_x
+
cols
;
int
dst_cols_index
=
(
dst_offset_x
+
process_col
)
&
0xfffffffc
;
int
dst_rows_end
=
dst_offset_y
+
rows
;
int
dst_rows_index
=
dst_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
+
(
lX
>>
THREADS_PER_ROW_BIT
)
;
int
curRow
=
groupStartRow
+
i
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_rows_index
*
dst_step
+
dst_cols_index
)
)
;
curRow
=
ADDR_H
(
curRow,
FILTER_RADIUS
-
src_offset_y,
wholerows
-
src_offset_y
)
;
int4
sum
=
(
int4
)(
0
)
;
uchar4
data
;
curRow
=
ADDR_B
(
curRow
-
FILTER_RADIUS,
wholerows
-
src_offset_y,
curRow
-
FILTER_RADIUS
)
;
for
(
int
i
=
0
; i < ANCHOR; i++
)
for
(
int
j
=
localCol
; j < LOCAL_WIDTH; j += get_local_size(0)
)
{
#
pragma
unroll
for
(
int
j
=
0
; j < ANCHOR; j++)
int
curCol
=
groupStartCol
+
j
;
curCol
=
ADDR_L
(
curCol,
FILTER_RADIUS
-
src_offset_x,
wholecols
-
src_offset_x
)
;
curCol
=
ADDR_R
(
curCol
-
FILTER_RADIUS,
wholecols
-
src_offset_x,
curCol
-
FILTER_RADIUS
)
;
if
(
curRow
<
wholerows
&&
curCol
<
wholecols
)
{
if
(
dst_rows_index
<
dst_rows_end
)
{
int
local_row
=
(
lX
>>
THREADS_PER_ROW_BIT
)
+
i
;
int
local_cols
=
((
lX
%
THREADS_PER_ROW
)
<<
ELEMENTS_PER_THREAD_BIT
)
+
j
;
data
=
vload4
(
0
,
local_data+local_row
*
LOCAL_MEM_STEP
+
local_cols
)
;
sum
=
sum
+
(
mat_kernel[i
*
ANCHOR
+
j]
*
convert_int4_sat
(
data
))
;
}
local_data[
(
i
)
*
LOCAL_WIDTH
+
j]
=
src[
(
curRow
)
*
src_step
+
curCol
+
src_offset]
;
}
}
if
(
dst_rows_index
<
dst_rows_end
)
{
sum.x
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
sum.x
:
dst_data.x
;
sum.y
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
sum.y
:
dst_data.y
;
sum.z
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
sum.z
:
dst_data.z
;
sum.w
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
sum.w
:
dst_data.w
;
*
((
__global
uchar4
*
)(
dst
+
dst_rows_index
*
dst_step
+
dst_cols_index
))
=
convert_uchar4_sat
(
sum
)
;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
filter2D_C1_D5
(
__global
float
*src,
int
src_step,
int
src_offset_x,
int
src_offset_y,
__global
float
*dst,
int
dst_step,
int
dst_offset_x,
int
dst_offset_y,
__constant
int
*mat_kernel
__attribute__
((
max_constant_size
(
16384
)))
,
int
cols,int
rows,
int
operate_cols,
int
wholecols,
int
wholerows
)
{
int
gX
=
get_global_id
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
int
lX
=
get_local_id
(
0
)
;
int
groupX_size
=
get_local_size
(
0
)
;
int
groupX_id
=
get_group_id
(
0
)
;
#
define
dst_align
(
dst_offset_x
&
3
)
int
cols_start_index_group
=
src_offset_x
-
dst_align
+
groupX_size
*
groupX_id
-
ANX
;
int
rows_start_index
=
src_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
-
ANY
;
__local
float
local_data[LOCAL_MEM_STEP
*
ROWS_FETCH]
;
if
(((
gY
<<
2
)
<
rows
))
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
globalRow
<
rows
&&
globalCol
<
cols
)
{
for
(
int
i
=
0
; i < ROWS_FETCH; ++i)
T_SUM
sum
=
(
T_SUM
)
SUM_ZERO
;
int
filterIdx
=
0
;
for
(
int
i
=
0
; i < FILTER_SIZE; i++)
{
if
((
rows_start_index
-
src_offset_y
)
+
i
<
rows
+
ANY
)
{
#
ifdef
BORDER_CONSTANT
int
selected_row
=
rows_start_index
+
i
;
int
selected_cols
=
cols_start_index_group
+
lX
;
float
data
=
*
((
__global
float
*
)((
__global
char
*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
int
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
]
=data
;
int
offset
=
(
i
+
localRow
)
*
LOCAL_WIDTH
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
data
=
*
((
__global
float
*
)((
__global
char
*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
selected_row
=
ADDR_B
(
rows_start_index
+
i,
wholerows,
selected_row
)
;
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
float
data
=
*
((
__global
float
*
)((
__global
char
*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
local_data[i
*
LOCAL_MEM_STEP
+
lX]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
*
((
__global
float
*
)((
__global
char
*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
endif
for
(
int
j
=
0
; j < FILTER_SIZE; j++)
{
sum
+=
CONVERT_TYPE_SUM
(
local_data[offset
+
j
+
localCol]
)
*
mat_kernel[filterIdx++]
;
}
}
dst[
(
globalRow
)
*dst_step
+
(
globalCol
)
+
dst_offset]
=
CONVERT_TYPE
(
sum
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
process_col
=
groupX_size
*
groupX_id
+
((
lX
%
THREADS_PER_ROW
)
<<
2
)
;
if
(((
gY
<<
2
)
<
rows
)
&&
(
process_col
<
operate_cols
))
{
int
dst_cols_start
=
dst_offset_x
;
int
dst_cols_end
=
dst_offset_x
+
cols
;
int
dst_cols_index
=
(
dst_offset_x
+
process_col
)
&
0xfffffffc
;
int
dst_rows_end
=
dst_offset_y
+
rows
;
int
dst_rows_index
=
dst_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
+
(
lX
>>
THREADS_PER_ROW_BIT
)
;
}
float4
dst_data
=
*
((
__global
float4*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
2
)))
;
///
following
is
specific
for
3x3
kernels
float4
sum
=
(
float4
)(
0
)
;
float4
data
;
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro
for
define
elements
number
per
thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#
define
ANX
1
#
define
ANY
1
for
(
int
i
=
0
; i < ANCHOR; i++)
{
#
pragma
unroll
for
(
int
j
=
0
; j < ANCHOR; j++)
{
if
(
dst_rows_index
<
dst_rows_end
)
{
int
local_row
=
(
lX
>>
THREADS_PER_ROW_BIT
)
+
i
;
int
local_cols
=
((
lX
%
THREADS_PER_ROW
)
<<
ELEMENTS_PER_THREAD_BIT
)
+
j
;
#
define
ROWS_PER_GROUP
4
#
define
ROWS_PER_GROUP_BITS
2
#
define
ROWS_FETCH
(
ROWS_PER_GROUP
+
ANY
+
ANY
)
//
(
ROWS_PER_GROUP
+
anY
*
2
)
data
=
vload4
(
0
,
local_data+local_row
*
LOCAL_MEM_STEP
+
local_cols
)
;
sum
=
sum
+
((
float
)(
mat_kernel[i
*
ANCHOR
+
j]
)
*
data
)
;
}
}
}
#
define
THREADS_PER_ROW
64
#
define
THREADS_PER_ROW_BIT
6
if
(
dst_rows_index
<
dst_rows_end
)
{
sum.x
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
sum.x
:
dst_data.x
;
sum.y
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
sum.y
:
dst_data.y
;
sum.z
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
sum.z
:
dst_data.z
;
sum.w
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
sum.w
:
dst_data.w
;
#
define
ELEMENTS_PER_THREAD
4
#
define
ELEMENTS_PER_THREAD_BIT
2
*
((
__global
float4
*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
2
)))
=
sum
;
}
}
}
#
define
LOCAL_MEM_STEP
260
//divup
((
get_local_size
(
0
)
+
anX
*
2
)
,
4
)
*
4
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC
4
////////////////////////////////////////////////////////
/////////////////////////////////////////8uC
1
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
filter2D_C4_D0
(
__global
uchar4
*src,
int
src_step,
int
src_offset_x,
int
src_offset_y,
__global
uchar4
*dst,
int
dst_step,
int
dst_offset_x,
int
dst_offset_y,
__constant
int
*mat_kernel
__attribute__
((
max_constant_size
(
16384
)))
,
int
cols,int
rows,
int
operate_cols,
int
wholecols,
int
wholerows
)
__kernel
void
filter2D_3x3
(
__global
T_IMG
*src,
__global
T_IMG
*dst,
int
src_step,
int
dst_step,
__constant
float
*mat_kernel,
__local
T_IMG
*local_data,
int
wholerows,
int
wholecols,
int
src_offset_x,
int
src_offset_y,
int
dst_offset_x,
int
dst_offset_y,
int
cols,
int
rows,
int
operate_cols
)
{
int
gX
=
get_global_id
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
...
...
@@ -341,9 +279,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int
cols_start_index_group
=
src_offset_x
-
dst_align
+
groupX_size
*
groupX_id
-
ANX
;
int
rows_start_index
=
src_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
-
ANY
;
__local
uchar4
local_data[LOCAL_MEM_STEP
*
ROWS_FETCH]
;
if
(((
gY
<<
2
)
<
rows
))
if
((
gY
<<
2
)
<
rows
)
{
for
(
int
i
=
0
; i < ROWS_FETCH; ++i)
{
...
...
@@ -353,19 +289,19 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int
selected_row
=
rows_start_index
+
i
;
int
selected_cols
=
cols_start_index_group
+
lX
;
uchar4
data
=
*
((
__global
uchar4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
int
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
T_IMG
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
int
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[
i
*
LOCAL_MEM_STEP
+
lX
]
=
data
;
local_data[
mad24
(
i,
LOCAL_MEM_STEP,
lX
)
]
=
data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
data
=
*
((
__global
uchar4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
con
=
selected_row
>=
0
&&
selected_row
<
wholerows
&&
selected_cols
>=
0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[
i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=
data
;
local_data[
mad24
(
i,
LOCAL_MEM_STEP,
lX
)
+
groupX_size]
=
data
;
}
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
...
...
@@ -374,17 +310,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
uchar4
data
=
*
((
__global
uchar4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
T_IMG
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
local_data[
i
*
LOCAL_MEM_STEP
+
lX]
=
data
;
local_data[
mad24
(
i,
LOCAL_MEM_STEP,
lX
)
]
=
data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
*
((
__global
uchar4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
2
)))
;
local_data[
i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=
data
;
data
=
src[mad24
(
selected_row,
src_step,
selected_cols
)
]
;
local_data[
mad24
(
i,
LOCAL_MEM_STEP,
lX
)
+
groupX_size]
=
data
;
}
#
endif
}
...
...
@@ -401,131 +337,40 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int
dst_rows_end
=
dst_offset_y
+
rows
;
int
dst_rows_index
=
dst_offset_y
+
(
gY
<<
ROWS_PER_GROUP_BITS
)
+
(
lX
>>
THREADS_PER_ROW_BIT
)
;
dst
=
dst
+
mad24
(
dst_rows_index,
dst_step,
dst_cols_index
)
;
uchar16
dst_data
;
dst_data
=
*
((
__global
uchar16*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
2
)))
;
T_IMGx4
dst_data
=
*
(
__global
T_IMGx4
*
)
dst
;
int16
sum
=
(
int16
)(
0
)
;
uchar16
data
;
T_SUMx4
sum
=
(
T_SUMx4
)
SUM_ZEROx4
;
T_IMGx4
data
;
for
(
int
i
=
0
; i <
ANCHOR
; i++)
for
(
int
i
=
0
; i <
FILTER_SIZE
; i++)
{
#
pragma
unroll
for
(
int
j
=
0
; j <
ANCHOR
; j++)
for
(
int
j
=
0
; j <
FILTER_SIZE
; j++)
{
if
(
dst_rows_index
<
dst_rows_end
)
{
int
local_row
=
(
lX
>>
THREADS_PER_ROW_BIT
)
+
i
;
int
local_cols
=
((
lX
%
THREADS_PER_ROW
)
<<
ELEMENTS_PER_THREAD_BIT
)
+
j
;
data
=
vload16
(
0
,
(
__local
uchar
*
)(
local_data+
local_row
*
LOCAL_MEM_STEP
+
local_cols
))
;
sum
=
sum
+
(
mat_kernel[i
*
ANCHOR
+
j]
*
convert_int16_sat
(
data
))
;
data
=
VLOAD4
(
0
,
(
__local
T_IMG_C1
*
)(
local_data
+
local_row
*
LOCAL_MEM_STEP
+
local_cols
))
;
sum
=
sum
+
(
mat_kernel[i
*
FILTER_SIZE
+
j]
*
CONVERT_TYPE_SUMx4
(
data
))
;
}
}
}
if
(
dst_rows_index
<
dst_rows_end
)
{
uchar16
sum1
=
convert_uchar16_sat
(
sum
)
;
sum1.s0123
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
sum1.s0123
:
dst_data.s0123
;
sum1.s4567
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
sum1.s4567
:
dst_data.s4567
;
sum1.s89ab
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
sum1.s89ab
:
dst_data.s89ab
;
sum1.scdef
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
sum1.scdef
:
dst_data.scdef
;
*
((
__global
uchar16*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
2
)))
=
sum1
;
T_IMGx4
tmp_dst
=
CONVERT_TYPEx4
(
sum
)
;
tmp_dst.SX
=
((
dst_cols_index
+
0
>=
dst_cols_start
)
&&
(
dst_cols_index
+
0
<
dst_cols_end
))
?
tmp_dst.SX
:
dst_data.SX
;
tmp_dst.SY
=
((
dst_cols_index
+
1
>=
dst_cols_start
)
&&
(
dst_cols_index
+
1
<
dst_cols_end
))
?
tmp_dst.SY
:
dst_data.SY
;
tmp_dst.SZ
=
((
dst_cols_index
+
2
>=
dst_cols_start
)
&&
(
dst_cols_index
+
2
<
dst_cols_end
))
?
tmp_dst.SZ
:
dst_data.SZ
;
tmp_dst.SW
=
((
dst_cols_index
+
3
>=
dst_cols_start
)
&&
(
dst_cols_index
+
3
<
dst_cols_end
))
?
tmp_dst.SW
:
dst_data.SW
;
*
(
__global
T_IMGx4
*
)
dst
=
tmp_dst
;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#
define
ROWS_FETCH_C4
(
1
+
ANY
+
ANY
)
//
(
ROWS_PER_GROUP
+
anY
*
2
)
#
define
LOCAL_MEM_STEP_C4
260
//divup
((
get_local_size
(
0
)
+
anX
*
2
)
,
4
)
*
4
)
__kernel
void
filter2D_C4_D5
(
__global
float4
*src,
int
src_step,
int
src_offset_x,
int
src_offset_y,
__global
float4
*dst,
int
dst_step,
int
dst_offset_x,
int
dst_offset_y,
__constant
int
*mat_kernel
__attribute__
((
max_constant_size
(
16384
)))
,
int
cols,int
rows,
int
operate_cols,
int
wholecols,
int
wholerows
)
{
int
gX
=
get_global_id
(
0
)
;
int
gY
=
get_global_id
(
1
)
;
int
lX
=
get_local_id
(
0
)
;
int
groupX_size
=
get_local_size
(
0
)
;
int
groupX_id
=
get_group_id
(
0
)
;
int
cols_start_index_group
=
src_offset_x
+
groupX_size
*
groupX_id
-
ANX
;
int
rows_start_index
=
src_offset_y
+
gY
-
ANY
;
__local
float4
local_data[LOCAL_MEM_STEP_C4
*
ROWS_FETCH_C4]
;
if
((
gY
<
rows
)
&&
(
gX
<
(
operate_cols
+
ANX
+
ANX
)))
{
for
(
int
i
=
0
; i < ROWS_FETCH_C4; ++i)
{
if
((
rows_start_index
-
src_offset_y
)
+
i
<
rows
+
ANY
)
{
#
ifdef
BORDER_CONSTANT
int
selected_row
=
rows_start_index
+
i
;
int
selected_cols
=
cols_start_index_group
+
lX
;
float4
data
=
*
((
__global
float4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
4
)))
;
int
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
data
=
*
((
__global
float4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
4
)))
;
con
=
selected_row
>=0
&&
selected_row
<
wholerows
&&
selected_cols
>=0
&&
selected_cols
<
wholecols
;
data
=
con
?
data
:
0
;
local_data[i
*
LOCAL_MEM_STEP
+
lX
+
groupX_size]
=data
;
}
#
else
int
selected_row
=
ADDR_H
(
rows_start_index
+
i,
0
,
wholerows
)
;
selected_row
=
ADDR_B
(
rows_start_index
+
i,
wholerows,
selected_row
)
;
int
selected_cols
=
ADDR_L
(
cols_start_index_group
+
lX,
0
,
wholecols
)
;
selected_cols
=
ADDR_R
(
cols_start_index_group
+
lX,
wholecols,
selected_cols
)
;
float4
data
=
*
((
__global
float4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
4
)))
;
local_data[i
*
LOCAL_MEM_STEP_C4
+
lX]
=data
;
if
(
lX
<
(
ANX
<<
1
))
{
selected_cols
=
cols_start_index_group
+
lX
+
groupX_size
;
selected_cols
=
ADDR_R
(
selected_cols,
wholecols,
selected_cols
)
;
data
=
*
((
__global
float4*
)((
__global
char*
)
src
+
selected_row
*
src_step
+
(
selected_cols
<<
4
)))
;
local_data[i
*
LOCAL_MEM_STEP_C4
+
lX
+
groupX_size]
=data
;
}
#
endif
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
((
gY
<
rows
)
&&
(
gX
<
operate_cols
))
{
int
dst_cols_index
=
dst_offset_x
+
gX
;
int
dst_rows_index
=
dst_offset_y
+
gY
;
float4
sum
=
(
float4
)(
0
)
;
for
(
int
i
=
0
; i < ANCHOR; i++)
{
for
(
int
j
=
0
; j < ANCHOR; j++)
{
int
local_cols
=
lX
+
j
;
sum
=
sum
+
((
float
)
mat_kernel[i
*
ANCHOR
+
j]
*
local_data[i
*
LOCAL_MEM_STEP_C4
+
local_cols]
)
;
}
}
*
((
__global
float4*
)((
__global
char
*
)
dst
+
dst_rows_index
*
dst_step
+
(
dst_cols_index
<<
4
)))
=
sum
;
}
}
modules/ocl/test/test_filters.cpp
View file @
6ae9870d
...
...
@@ -324,6 +324,35 @@ TEST_P(GaussianBlur, Mat)
////////////////////////////////////////////////////////////////////////////////////////////////////
// Filter2D
struct
Filter2D
:
FilterTestBase
{
int
type
;
cv
::
Size
ksize
;
int
bordertype
;
Point
anchor
;
virtual
void
SetUp
()
{
type
=
GET_PARAM
(
0
);
ksize
=
GET_PARAM
(
1
);
bordertype
=
GET_PARAM
(
3
);
Init
(
type
);
anchor
=
Point
(
-
1
,
-
1
);
}
};
TEST_P
(
Filter2D
,
Mat
)
{
cv
::
Mat
kernel
=
randomMat
(
cv
::
Size
(
ksize
.
width
,
ksize
.
height
),
CV_32FC1
,
0.0
,
1.0
);
for
(
int
j
=
0
;
j
<
LOOP_TIMES
;
j
++
)
{
random_roi
();
cv
::
filter2D
(
mat1_roi
,
dst_roi
,
-
1
,
kernel
,
anchor
,
0.0
,
bordertype
);
cv
::
ocl
::
filter2D
(
gmat1
,
gdst
,
-
1
,
kernel
,
anchor
,
bordertype
);
Near
(
1
);
}
}
INSTANTIATE_TEST_CASE_P
(
Filter
,
Blur
,
Combine
(
Values
(
CV_8UC1
,
CV_8UC3
,
CV_8UC4
,
CV_32FC1
,
CV_32FC4
),
Values
(
cv
::
Size
(
3
,
3
),
cv
::
Size
(
5
,
5
),
cv
::
Size
(
7
,
7
)),
...
...
@@ -331,7 +360,7 @@ INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
Values
((
MatType
)
cv
::
BORDER_CONSTANT
,
(
MatType
)
cv
::
BORDER_REPLICATE
,
(
MatType
)
cv
::
BORDER_REFLECT
,
(
MatType
)
cv
::
BORDER_REFLECT_101
)));
INSTANTIATE_TEST_CASE_P
(
Filter
s
,
Laplacian
,
Combine
(
INSTANTIATE_TEST_CASE_P
(
Filter
,
Laplacian
,
Combine
(
Values
(
CV_8UC1
,
CV_8UC3
,
CV_8UC4
,
CV_32FC1
,
CV_32FC3
,
CV_32FC4
),
Values
(
Size
(
3
,
3
)),
Values
(
Size
(
0
,
0
)),
//not use
...
...
@@ -365,4 +394,10 @@ INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
INSTANTIATE_TEST_CASE_P
(
Filter
,
Filter2D
,
testing
::
Combine
(
Values
(
CV_8UC1
,
CV_32FC1
,
CV_32FC4
),
Values
(
Size
(
3
,
3
),
Size
(
15
,
15
),
Size
(
25
,
25
)),
Values
(
Size
(
0
,
0
)),
//not use
Values
((
MatType
)
cv
::
BORDER_CONSTANT
,
(
MatType
)
cv
::
BORDER_REFLECT101
,
(
MatType
)
cv
::
BORDER_REPLICATE
,
(
MatType
)
cv
::
BORDER_REFLECT
)));
#endif // HAVE_OPENCL
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment