Commit c70fbb95
Authored Oct 30, 2013 by Roman Donchenko; committed by OpenCV Buildbot on Oct 30, 2013

Merge pull request #1712 from alalek:ocl_split

Parents: 4bc105c4, 50d2c106

Showing 6 changed files with 265 additions and 1281 deletions (+265, -1281)
cl_programcache.cpp    modules/ocl/src/cl_programcache.cpp      +1    -1
split_mat.cl           modules/ocl/src/opencl/split_mat.cl      +148  -1158
safe_call.hpp          modules/ocl/src/safe_call.hpp            +1    -1
split_merge.cpp        modules/ocl/src/split_merge.cpp          +93   -55
test_split_merge.cpp   modules/ocl/test/test_split_merge.cpp    +18   -64
utility.hpp            modules/ocl/test/utility.hpp             +4    -2
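For context, the host-side entry points being reworked by this PR live in modules/ocl/src/split_merge.cpp. A minimal usage sketch is below; it assumes that cv::ocl::split() and cv::ocl::merge() mirror the CPU cv::split()/cv::merge() signatures (only the file names, not the declarations, are visible in this diff), so treat the exact prototypes as an assumption.

```cpp
#include <vector>
#include "opencv2/ocl/ocl.hpp"   // 2.4-era OpenCL module header (assumed path)

// Split a multi-channel device matrix into per-channel planes and merge them
// back; signatures assumed to mirror the CPU cv::split()/cv::merge() API.
void splitMergeRoundTrip(const cv::ocl::oclMat &src)
{
    std::vector<cv::ocl::oclMat> planes;
    cv::ocl::split(src, planes);     // one single-channel oclMat per channel

    // ... per-plane processing on the device ...

    cv::ocl::oclMat merged;
    cv::ocl::merge(planes, merged);  // recombine into an interleaved matrix
}
```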
modules/ocl/src/cl_programcache.cpp  View file @ c70fbb95

...
@@ -428,7 +428,7 @@ struct ProgramFileCache
         if (status != CL_SUCCESS)
         {
-            if (status == CL_BUILD_PROGRAM_FAILURE)
+            if (status == CL_BUILD_PROGRAM_FAILURE || status == CL_INVALID_BUILD_OPTIONS)
             {
                 size_t buildLogSize = 0;
                 openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
...
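The hunk above makes the build log get dumped for CL_INVALID_BUILD_OPTIONS as well as for CL_BUILD_PROGRAM_FAILURE. For reference, a minimal sketch of that pattern in plain OpenCL host code (independent of the module's openCLSafeCall/getClDeviceID helpers; program and device are assumed to be valid objects created elsewhere):

```cpp
#include <cstdio>
#include <vector>
#include <CL/cl.h>

// Print the compiler output for a program whose clBuildProgram() call failed.
static void dumpBuildLog(cl_program program, cl_device_id device, cl_int status)
{
    if (status == CL_BUILD_PROGRAM_FAILURE || status == CL_INVALID_BUILD_OPTIONS)
    {
        size_t logSize = 0;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

        std::vector<char> log(logSize + 1, 0);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log.data(), NULL);

        std::fprintf(stderr, "OpenCL build log:\n%s\n", log.data());
    }
}
```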
modules/ocl/src/opencl/split_mat.cl  View file @ c70fbb95

...
@@ -10,13 +10,9 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Jia Haipeng, jiahaipeng95@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 ...
@@ -46,1177 +42,171 @@
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif

///////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////// optimized code using vector //////////////////////////////////////
//////////// vector function name format: split_vector_C(channels number)_D(data type depth) ///
////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void split_vector_C4_D0(__global uchar *mat_src, int src_step, int src_offset,
                                 __global uchar *mat_dst0, int dst0_step, int dst0_offset,
                                 __global uchar *mat_dst1, int dst1_step, int dst1_offset,
                                 __global uchar *mat_dst2, int dst2_step, int dst2_offset,
                                 __global uchar *mat_dst3, int dst3_step, int dst3_offset,
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        x = x << 2;

        int src_idx    = mad24(y, src_step, src_offset + (x << 2));

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;

        int dst3_start = mad24(y, dst3_step, dst3_offset);
        int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx   = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;

        uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
        uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8  >= 0 ? src_idx - 8  : src_idx)));
        uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4  >= 0 ? src_idx - 4  : src_idx)));
        uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0));

        int total_bytes = src_offset + rows * src_step;

        uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4  < total_bytes ? src_idx + 4  : src_idx)));
        uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8  < total_bytes ? src_idx + 8  : src_idx)));
        uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));

        uchar4 tmp_data0 = 1, tmp_data1 = 2, tmp_data2, tmp_data3;

        if ((dst0_offset & 3) == 3) tmp_data0 = (uchar4)(data_0.x, data_1.x, data_2.x, data_3.x);
        if ((dst0_offset & 3) == 2) tmp_data0 = (uchar4)(data_1.x, data_2.x, data_3.x, data_4.x);
        if ((dst0_offset & 3) == 1) tmp_data0 = (uchar4)(data_2.x, data_3.x, data_4.x, data_5.x);
        if ((dst0_offset & 3) == 0) tmp_data0 = (uchar4)(data_3.x, data_4.x, data_5.x, data_6.x);

        if ((dst1_offset & 3) == 3) tmp_data1 = (uchar4)(data_0.y, data_1.y, data_2.y, data_3.y);
        if ((dst1_offset & 3) == 2) tmp_data1 = (uchar4)(data_1.y, data_2.y, data_3.y, data_4.y);
        if ((dst1_offset & 3) == 1) tmp_data1 = (uchar4)(data_2.y, data_3.y, data_4.y, data_5.y);
        if ((dst1_offset & 3) == 0) tmp_data1 = (uchar4)(data_3.y, data_4.y, data_5.y, data_6.y);

        if ((dst2_offset & 3) == 3) tmp_data2 = (uchar4)(data_0.z, data_1.z, data_2.z, data_3.z);
        if ((dst2_offset & 3) == 2) tmp_data2 = (uchar4)(data_1.z, data_2.z, data_3.z, data_4.z);
        if ((dst2_offset & 3) == 1) tmp_data2 = (uchar4)(data_2.z, data_3.z, data_4.z, data_5.z);
        if ((dst2_offset & 3) == 0) tmp_data2 = (uchar4)(data_3.z, data_4.z, data_5.z, data_6.z);

        if ((dst3_offset & 3) == 3) tmp_data3 = (uchar4)(data_0.w, data_1.w, data_2.w, data_3.w);
        if ((dst3_offset & 3) == 2) tmp_data3 = (uchar4)(data_1.w, data_2.w, data_3.w, data_4.w);
        if ((dst3_offset & 3) == 1) tmp_data3 = (uchar4)(data_2.w, data_3.w, data_4.w, data_5.w);
        if ((dst3_offset & 3) == 0) tmp_data3 = (uchar4)(data_3.w, data_4.w, data_5.w, data_6.w);

        uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
        uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
        uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
        uchar4 dst3_data = *((__global uchar4 *)(mat_dst3 + dst3_idx));

        tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
        tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
        tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
        tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;

        tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
        tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
        tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
        tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;

        tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
        tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
        tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
        tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;

        tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
        tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
        tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
        tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;

        *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
        *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
        *((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
        *((__global uchar4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
    }
}
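A note on the pattern used by the C4_D0 kernel above and by its per-type siblings below: each work-item covers four consecutive destination bytes, the destination index is rounded down to a 4-byte boundary, one of the pre-shifted source vectors is selected by dst_offset & 3 so the final write can be a single aligned uchar4 store, and elements falling outside [dst_start, dst_end) keep the value already present in the destination. A small sketch of just that index arithmetic, using hypothetical helper names that are not part of the module:

```cpp
// Round the destination byte index down to a 4-byte boundary and report which
// pre-shifted source vector an aligned 4-byte store would have to use.
struct AlignedStorePlan
{
    int alignedIdx;  // dst_idx & (int)0xfffffffc in the kernel
    int rotation;    // dst_offset & 3 selects one of the shifted loads
};

inline AlignedStorePlan alignedStorePlan(int y, int step, int offset, int x)
{
    const int idx = y * step + offset + x;   // mad24(y, step, offset + x)
    return { idx & ~3, offset & 3 };
}
```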
__kernel
void
split_vector_C3_D0
(
__global
uchar
*mat_src,
int
src_step,
int
src_offset,
__global
uchar
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
uchar
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
uchar
*mat_dst2,
int
dst2_step,
int
dst2_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
2
;
int
src_idx
=
mad24
(
y,
src_step,
src_offset
)
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst2_start
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst2_end
=
mad24
(
y,
dst2_step,
dst2_offset
+
dst_step1
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
+
x
&
(
int
)
0xfffffffc
)
;
uchar4
dst0_data
=
*
((
__global
uchar4
*
)(
mat_dst0
+
dst0_idx
))
;
uchar4
dst1_data
=
*
((
__global
uchar4
*
)(
mat_dst1
+
dst1_idx
))
;
uchar4
dst2_data
=
*
((
__global
uchar4
*
)(
mat_dst2
+
dst2_idx
))
;
uchar4
tmp_data0,
tmp_data1,
tmp_data2
;
uchar
src_data_0
=
*
(
mat_src
+
src_idx
+
3
*
x
-
9
)
;
uchar
src_data_1
=
*
(
mat_src
+
src_idx
+
3
*
x
-
8
)
;
uchar
src_data_2
=
*
(
mat_src
+
src_idx
+
3
*
x
-
7
)
;
uchar
src_data_3
=
*
(
mat_src
+
src_idx
+
3
*
x
-
6
)
;
uchar
src_data_4
=
*
(
mat_src
+
src_idx
+
3
*
x
-
5
)
;
uchar
src_data_5
=
*
(
mat_src
+
src_idx
+
3
*
x
-
4
)
;
uchar
src_data_6
=
*
(
mat_src
+
src_idx
+
3
*
x
-
3
)
;
uchar
src_data_7
=
*
(
mat_src
+
src_idx
+
3
*
x
-
2
)
;
uchar
src_data_8
=
*
(
mat_src
+
src_idx
+
3
*
x
-
1
)
;
uchar
src_data_9
=
*
(
mat_src
+
src_idx
+
3
*
x
+
0
)
;
uchar
src_data_10
=
*
(
mat_src
+
src_idx
+
3
*
x
+
1
)
;
uchar
src_data_11
=
*
(
mat_src
+
src_idx
+
3
*
x
+
2
)
;
uchar
src_data_12
=
*
(
mat_src
+
src_idx
+
3
*
x
+
3
)
;
uchar
src_data_13
=
*
(
mat_src
+
src_idx
+
3
*
x
+
4
)
;
uchar
src_data_14
=
*
(
mat_src
+
src_idx
+
3
*
x
+
5
)
;
uchar
src_data_15
=
*
(
mat_src
+
src_idx
+
3
*
x
+
6
)
;
uchar
src_data_16
=
*
(
mat_src
+
src_idx
+
3
*
x
+
7
)
;
uchar
src_data_17
=
*
(
mat_src
+
src_idx
+
3
*
x
+
8
)
;
uchar
src_data_18
=
*
(
mat_src
+
src_idx
+
3
*
x
+
9
)
;
uchar
src_data_19
=
*
(
mat_src
+
src_idx
+
3
*
x
+
10
)
;
uchar
src_data_20
=
*
(
mat_src
+
src_idx
+
3
*
x
+
11
)
;
uchar
data[7]
=
{src_data_0,
src_data_3,
src_data_6,
src_data_9,
src_data_12,
src_data_15,
src_data_18}
;
int
index
=
3
-
dst0_offset
&
3
;
tmp_data0
=
(
uchar4
)(
data[index],
data[index
+
1],
data[index
+
2],
data[index
+
3]
)
;
uchar4
data0,
data1,
data2
;
data0
=
(
uchar4
)(
src_data_1,
src_data_4,
src_data_7,
src_data_10
)
;
data1
=
(
dst1_offset
&
3
)
==
2
?
(
uchar4
)(
src_data_4,
src_data_7,
src_data_10,
src_data_13
)
:
data0
;
data2
=
(
dst1_offset
&
3
)
==
1
?
(
uchar4
)(
src_data_7,
src_data_10,
src_data_13,
src_data_16
)
:
data1
;
tmp_data1
=
(
dst1_offset
&
3
)
==
0
?
(
uchar4
)(
src_data_10,
src_data_13,
src_data_16,
src_data_19
)
:
data2
;
data0
=
(
uchar4
)(
src_data_2,
src_data_5,
src_data_8,
src_data_11
)
;
data1
=
(
dst2_offset
&
3
)
==
2
?
(
uchar4
)(
src_data_5,
src_data_8,
src_data_11,
src_data_14
)
:
data0
;
data2
=
(
dst2_offset
&
3
)
==
1
?
(
uchar4
)(
src_data_8,
src_data_11,
src_data_14,
src_data_17
)
:
data1
;
tmp_data2
=
(
dst2_offset
&
3
)
==
0
?
(
uchar4
)(
src_data_11,
src_data_14,
src_data_17,
src_data_20
)
:
data2
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
tmp_data0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
1
>=
dst0_start
)
&&
(
dst0_idx
+
1
<
dst0_end
))
?
tmp_data0.y
:
dst0_data.y
;
tmp_data0.z
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
tmp_data0.z
:
dst0_data.z
;
tmp_data0.w
=
((
dst0_idx
+
3
>=
dst0_start
)
&&
(
dst0_idx
+
3
<
dst0_end
))
?
tmp_data0.w
:
dst0_data.w
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
tmp_data1.x
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
1
>=
dst1_start
)
&&
(
dst1_idx
+
1
<
dst1_end
))
?
tmp_data1.y
:
dst1_data.y
;
tmp_data1.z
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
tmp_data1.z
:
dst1_data.z
;
tmp_data1.w
=
((
dst1_idx
+
3
>=
dst1_start
)
&&
(
dst1_idx
+
3
<
dst1_end
))
?
tmp_data1.w
:
dst1_data.w
;
tmp_data2.x
=
((
dst2_idx
+
0
>=
dst2_start
)
&&
(
dst2_idx
+
0
<
dst2_end
))
?
tmp_data2.x
:
dst2_data.x
;
tmp_data2.y
=
((
dst2_idx
+
1
>=
dst2_start
)
&&
(
dst2_idx
+
1
<
dst2_end
))
?
tmp_data2.y
:
dst2_data.y
;
tmp_data2.z
=
((
dst2_idx
+
2
>=
dst2_start
)
&&
(
dst2_idx
+
2
<
dst2_end
))
?
tmp_data2.z
:
dst2_data.z
;
tmp_data2.w
=
((
dst2_idx
+
3
>=
dst2_start
)
&&
(
dst2_idx
+
3
<
dst2_end
))
?
tmp_data2.w
:
dst2_data.w
;
*
((
__global
uchar4
*
)(
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
__global
uchar4
*
)(
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
*
((
__global
uchar4
*
)(
mat_dst2
+
dst2_idx
))
=
tmp_data2
;
}
}
__kernel
void
split_vector_C2_D0
(
__global
uchar
*mat_src,
int
src_step,
int
src_offset,
__global
uchar
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
uchar
*mat_dst1,
int
dst1_step,
int
dst1_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
2
;
#
define
dst0_align
((
dst0_offset
&
3
)
<<
1
)
#
define
dst1_align
((
dst1_offset
&
3
)
<<
1
)
int
src_idx_0
=
mad24
(
y,
src_step,
src_offset
-
dst0_align
+
(
x
<<
1
))
;
int
src_idx_1
=
mad24
(
y,
src_step,
src_offset
-
dst1_align
+
(
x
<<
1
))
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src_idx_0
<
0
?
0
:
src_idx_0
;
int
src2_index_fix
=
src_idx_1
<
0
?
0
:
src_idx_1
;
uchar8
src_data_0
=
vload8
(
0
,
mat_src
+
src_idx_0
)
;
uchar8
src_data_1
=
vload8
(
0
,
mat_src
+
src_idx_1
)
;
if
(
src_idx_0
==
-6
)
src_data_0.s01234567
=
src_data_0.s67012345
;
if
(
src_idx_0
==
-4
)
src_data_0.s01234567
=
src_data_0.s45670123
;
if
(
src_idx_0
==
-2
)
src_data_0.s01234567
=
src_data_0.s23456701
;
if
(
src_idx_1
==
-6
)
src_data_1.s01234567
=
src_data_1.s67012345
;
if
(
src_idx_1
==
-4
)
src_data_1.s01234567
=
src_data_1.s45670123
;
if
(
src_idx_1
==
-2
)
src_data_1.s01234567
=
src_data_1.s23456701
;
uchar4
dst0_data
=
*
((
__global
uchar4
*
)(
mat_dst0
+
dst0_idx
))
;
uchar4
dst1_data
=
*
((
__global
uchar4
*
)(
mat_dst1
+
dst1_idx
))
;
uchar4
tmp_data0,
tmp_data1
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
src_data_0.s0
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
1
>=
dst0_start
)
&&
(
dst0_idx
+
1
<
dst0_end
))
?
src_data_0.s2
:
dst0_data.y
;
tmp_data0.z
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
src_data_0.s4
:
dst0_data.z
;
tmp_data0.w
=
((
dst0_idx
+
3
>=
dst0_start
)
&&
(
dst0_idx
+
3
<
dst0_end
))
?
src_data_0.s6
:
dst0_data.w
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
src_data_1.s1
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
1
>=
dst1_start
)
&&
(
dst1_idx
+
1
<
dst1_end
))
?
src_data_1.s3
:
dst1_data.y
;
tmp_data1.z
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
src_data_1.s5
:
dst1_data.z
;
tmp_data1.w
=
((
dst1_idx
+
3
>=
dst1_start
)
&&
(
dst1_idx
+
3
<
dst1_end
))
?
src_data_1.s7
:
dst1_data.w
;
*
((
__global
uchar4
*
)(
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
__global
uchar4
*
)(
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
}
}
__kernel
void
split_vector_C4_D1
(
__global
char
*mat_src,
int
src_step,
int
src_offset,
__global
char
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
char
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
char
*mat_dst2,
int
dst2_step,
int
dst2_offset,
__global
char
*mat_dst3,
int
dst3_step,
int
dst3_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
2
;
int
src_idx
=
mad24
(
y,
src_step,
src_offset
+
(
x
<<
2
))
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst2_start
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst2_end
=
mad24
(
y,
dst2_step,
dst2_offset
+
dst_step1
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst3_start
=
mad24
(
y,
dst3_step,
dst3_offset
)
;
int
dst3_end
=
mad24
(
y,
dst3_step,
dst3_offset
+
dst_step1
)
;
int
dst3_idx
=
mad24
(
y,
dst3_step,
dst3_offset
+
x
&
(
int
)
0xfffffffc
)
;
char4
data_0
=
*
((
global
char4
*
)(
mat_src
+
src_idx
-
12
))
;
char4
data_1
=
*
((
global
char4
*
)(
mat_src
+
src_idx
-
8
))
;
char4
data_2
=
*
((
global
char4
*
)(
mat_src
+
src_idx
-
4
))
;
char4
data_3
=
*
((
global
char4
*
)(
mat_src
+
src_idx
+
0
))
;
char4
data_4
=
*
((
global
char4
*
)(
mat_src
+
src_idx
+
4
))
;
char4
data_5
=
*
((
global
char4
*
)(
mat_src
+
src_idx
+
8
))
;
char4
data_6
=
*
((
global
char4
*
)(
mat_src
+
src_idx
+
12
))
;
char4
tmp_data0=1,
tmp_data1=2,
tmp_data2,
tmp_data3
;
if
((
dst0_offset
&
3
)
==
3
)
tmp_data0
=
(
char4
)(
data_0.x,
data_1.x,
data_2.x,
data_3.x
)
;
if
((
dst0_offset
&
3
)
==
2
)
tmp_data0
=
(
char4
)(
data_1.x,
data_2.x,
data_3.x,
data_4.x
)
;
if
((
dst0_offset
&
3
)
==
1
)
tmp_data0
=
(
char4
)(
data_2.x,
data_3.x,
data_4.x,
data_5.x
)
;
if
((
dst0_offset
&
3
)
==
0
)
tmp_data0
=
(
char4
)(
data_3.x,
data_4.x,
data_5.x,
data_6.x
)
;
if
((
dst1_offset
&
3
)
==
3
)
tmp_data1
=
(
char4
)(
data_0.y,
data_1.y,
data_2.y,
data_3.y
)
;
if
((
dst1_offset
&
3
)
==
2
)
tmp_data1
=
(
char4
)(
data_1.y,
data_2.y,
data_3.y,
data_4.y
)
;
if
((
dst1_offset
&
3
)
==
1
)
tmp_data1
=
(
char4
)(
data_2.y,
data_3.y,
data_4.y,
data_5.y
)
;
if
((
dst1_offset
&
3
)
==
0
)
tmp_data1
=
(
char4
)(
data_3.y,
data_4.y,
data_5.y,
data_6.y
)
;
if
((
dst2_offset
&
3
)
==
3
)
tmp_data2
=
(
char4
)(
data_0.z,
data_1.z,
data_2.z,
data_3.z
)
;
if
((
dst2_offset
&
3
)
==
2
)
tmp_data2
=
(
char4
)(
data_1.z,
data_2.z,
data_3.z,
data_4.z
)
;
if
((
dst2_offset
&
3
)
==
1
)
tmp_data2
=
(
char4
)(
data_2.z,
data_3.z,
data_4.z,
data_5.z
)
;
if
((
dst2_offset
&
3
)
==
0
)
tmp_data2
=
(
char4
)(
data_3.z,
data_4.z,
data_5.z,
data_6.z
)
;
if
((
dst3_offset
&
3
)
==
3
)
tmp_data3
=
(
char4
)(
data_0.w,
data_1.w,
data_2.w,
data_3.w
)
;
if
((
dst3_offset
&
3
)
==
2
)
tmp_data3
=
(
char4
)(
data_1.w,
data_2.w,
data_3.w,
data_4.w
)
;
if
((
dst3_offset
&
3
)
==
1
)
tmp_data3
=
(
char4
)(
data_2.w,
data_3.w,
data_4.w,
data_5.w
)
;
if
((
dst3_offset
&
3
)
==
0
)
tmp_data3
=
(
char4
)(
data_3.w,
data_4.w,
data_5.w,
data_6.w
)
;
char4
dst0_data
=
*
((
__global
char4
*
)(
mat_dst0
+
dst0_idx
))
;
char4
dst1_data
=
*
((
__global
char4
*
)(
mat_dst1
+
dst1_idx
))
;
char4
dst2_data
=
*
((
__global
char4
*
)(
mat_dst2
+
dst2_idx
))
;
char4
dst3_data
=
*
((
__global
char4
*
)(
mat_dst3
+
dst3_idx
))
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
tmp_data0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
1
>=
dst0_start
)
&&
(
dst0_idx
+
1
<
dst0_end
))
?
tmp_data0.y
:
dst0_data.y
;
tmp_data0.z
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
tmp_data0.z
:
dst0_data.z
;
tmp_data0.w
=
((
dst0_idx
+
3
>=
dst0_start
)
&&
(
dst0_idx
+
3
<
dst0_end
))
?
tmp_data0.w
:
dst0_data.w
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
tmp_data1.x
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
1
>=
dst1_start
)
&&
(
dst1_idx
+
1
<
dst1_end
))
?
tmp_data1.y
:
dst1_data.y
;
tmp_data1.z
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
tmp_data1.z
:
dst1_data.z
;
tmp_data1.w
=
((
dst1_idx
+
3
>=
dst1_start
)
&&
(
dst1_idx
+
3
<
dst1_end
))
?
tmp_data1.w
:
dst1_data.w
;
tmp_data2.x
=
((
dst2_idx
+
0
>=
dst2_start
)
&&
(
dst2_idx
+
0
<
dst2_end
))
?
tmp_data2.x
:
dst2_data.x
;
tmp_data2.y
=
((
dst2_idx
+
1
>=
dst2_start
)
&&
(
dst2_idx
+
1
<
dst2_end
))
?
tmp_data2.y
:
dst2_data.y
;
tmp_data2.z
=
((
dst2_idx
+
2
>=
dst2_start
)
&&
(
dst2_idx
+
2
<
dst2_end
))
?
tmp_data2.z
:
dst2_data.z
;
tmp_data2.w
=
((
dst2_idx
+
3
>=
dst2_start
)
&&
(
dst2_idx
+
3
<
dst2_end
))
?
tmp_data2.w
:
dst2_data.w
;
tmp_data3.x
=
((
dst3_idx
+
0
>=
dst3_start
)
&&
(
dst3_idx
+
0
<
dst3_end
))
?
tmp_data3.x
:
dst3_data.x
;
tmp_data3.y
=
((
dst3_idx
+
1
>=
dst3_start
)
&&
(
dst3_idx
+
1
<
dst3_end
))
?
tmp_data3.y
:
dst3_data.y
;
tmp_data3.z
=
((
dst3_idx
+
2
>=
dst3_start
)
&&
(
dst3_idx
+
2
<
dst3_end
))
?
tmp_data3.z
:
dst3_data.z
;
tmp_data3.w
=
((
dst3_idx
+
3
>=
dst3_start
)
&&
(
dst3_idx
+
3
<
dst3_end
))
?
tmp_data3.w
:
dst3_data.w
;
*
((
__global
char4
*
)(
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
__global
char4
*
)(
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
*
((
__global
char4
*
)(
mat_dst2
+
dst2_idx
))
=
tmp_data2
;
*
((
__global
char4
*
)(
mat_dst3
+
dst3_idx
))
=
tmp_data3
;
}
}
__kernel
void
split_vector_C3_D1
(
__global
char
*mat_src,
int
src_step,
int
src_offset,
__global
char
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
char
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
char
*mat_dst2,
int
dst2_step,
int
dst2_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
2
;
int
src_idx
=
mad24
(
y,
src_step,
src_offset
)
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst2_start
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst2_end
=
mad24
(
y,
dst2_step,
dst2_offset
+
dst_step1
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
+
x
&
(
int
)
0xfffffffc
)
;
char4
dst0_data
=
*
((
__global
char4
*
)(
mat_dst0
+
dst0_idx
))
;
char4
dst1_data
=
*
((
__global
char4
*
)(
mat_dst1
+
dst1_idx
))
;
char4
dst2_data
=
*
((
__global
char4
*
)(
mat_dst2
+
dst2_idx
))
;
char4
tmp_data0,
tmp_data1,
tmp_data2
;
char
src_data_0
=
*
(
mat_src
+
src_idx
+
3
*
x
-
9
)
;
char
src_data_1
=
*
(
mat_src
+
src_idx
+
3
*
x
-
8
)
;
char
src_data_2
=
*
(
mat_src
+
src_idx
+
3
*
x
-
7
)
;
char
src_data_3
=
*
(
mat_src
+
src_idx
+
3
*
x
-
6
)
;
char
src_data_4
=
*
(
mat_src
+
src_idx
+
3
*
x
-
5
)
;
char
src_data_5
=
*
(
mat_src
+
src_idx
+
3
*
x
-
4
)
;
char
src_data_6
=
*
(
mat_src
+
src_idx
+
3
*
x
-
3
)
;
char
src_data_7
=
*
(
mat_src
+
src_idx
+
3
*
x
-
2
)
;
char
src_data_8
=
*
(
mat_src
+
src_idx
+
3
*
x
-
1
)
;
char
src_data_9
=
*
(
mat_src
+
src_idx
+
3
*
x
+
0
)
;
char
src_data_10
=
*
(
mat_src
+
src_idx
+
3
*
x
+
1
)
;
char
src_data_11
=
*
(
mat_src
+
src_idx
+
3
*
x
+
2
)
;
char
src_data_12
=
*
(
mat_src
+
src_idx
+
3
*
x
+
3
)
;
char
src_data_13
=
*
(
mat_src
+
src_idx
+
3
*
x
+
4
)
;
char
src_data_14
=
*
(
mat_src
+
src_idx
+
3
*
x
+
5
)
;
char
src_data_15
=
*
(
mat_src
+
src_idx
+
3
*
x
+
6
)
;
char
src_data_16
=
*
(
mat_src
+
src_idx
+
3
*
x
+
7
)
;
char
src_data_17
=
*
(
mat_src
+
src_idx
+
3
*
x
+
8
)
;
char
src_data_18
=
*
(
mat_src
+
src_idx
+
3
*
x
+
9
)
;
char
src_data_19
=
*
(
mat_src
+
src_idx
+
3
*
x
+
10
)
;
char
src_data_20
=
*
(
mat_src
+
src_idx
+
3
*
x
+
11
)
;
char
data[7]
=
{src_data_0,
src_data_3,
src_data_6,
src_data_9,
src_data_12,
src_data_15,
src_data_18}
;
int
index
=
3
-
dst0_offset
&
3
;
tmp_data0
=
(
char4
)(
data[index],
data[index
+
1],
data[index
+
2],
data[index
+
3]
)
;
char4
data0,
data1,
data2
;
data0
=
(
char4
)(
src_data_1,
src_data_4,
src_data_7,
src_data_10
)
;
data1
=
(
dst1_offset
&
3
)
==
2
?
(
char4
)(
src_data_4,
src_data_7,
src_data_10,
src_data_13
)
:
data0
;
data2
=
(
dst1_offset
&
3
)
==
1
?
(
char4
)(
src_data_7,
src_data_10,
src_data_13,
src_data_16
)
:
data1
;
tmp_data1
=
(
dst1_offset
&
3
)
==
0
?
(
char4
)(
src_data_10,
src_data_13,
src_data_16,
src_data_19
)
:
data2
;
data0
=
(
char4
)(
src_data_2,
src_data_5,
src_data_8,
src_data_11
)
;
data1
=
(
dst2_offset
&
3
)
==
2
?
(
char4
)(
src_data_5,
src_data_8,
src_data_11,
src_data_14
)
:
data0
;
data2
=
(
dst2_offset
&
3
)
==
1
?
(
char4
)(
src_data_8,
src_data_11,
src_data_14,
src_data_17
)
:
data1
;
tmp_data2
=
(
dst2_offset
&
3
)
==
0
?
(
char4
)(
src_data_11,
src_data_14,
src_data_17,
src_data_20
)
:
data2
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
tmp_data0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
1
>=
dst0_start
)
&&
(
dst0_idx
+
1
<
dst0_end
))
?
tmp_data0.y
:
dst0_data.y
;
tmp_data0.z
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
tmp_data0.z
:
dst0_data.z
;
tmp_data0.w
=
((
dst0_idx
+
3
>=
dst0_start
)
&&
(
dst0_idx
+
3
<
dst0_end
))
?
tmp_data0.w
:
dst0_data.w
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
tmp_data1.x
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
1
>=
dst1_start
)
&&
(
dst1_idx
+
1
<
dst1_end
))
?
tmp_data1.y
:
dst1_data.y
;
tmp_data1.z
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
tmp_data1.z
:
dst1_data.z
;
tmp_data1.w
=
((
dst1_idx
+
3
>=
dst1_start
)
&&
(
dst1_idx
+
3
<
dst1_end
))
?
tmp_data1.w
:
dst1_data.w
;
tmp_data2.x
=
((
dst2_idx
+
0
>=
dst2_start
)
&&
(
dst2_idx
+
0
<
dst2_end
))
?
tmp_data2.x
:
dst2_data.x
;
tmp_data2.y
=
((
dst2_idx
+
1
>=
dst2_start
)
&&
(
dst2_idx
+
1
<
dst2_end
))
?
tmp_data2.y
:
dst2_data.y
;
tmp_data2.z
=
((
dst2_idx
+
2
>=
dst2_start
)
&&
(
dst2_idx
+
2
<
dst2_end
))
?
tmp_data2.z
:
dst2_data.z
;
tmp_data2.w
=
((
dst2_idx
+
3
>=
dst2_start
)
&&
(
dst2_idx
+
3
<
dst2_end
))
?
tmp_data2.w
:
dst2_data.w
;
*
((
__global
char4
*
)(
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
__global
char4
*
)(
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
*
((
__global
char4
*
)(
mat_dst2
+
dst2_idx
))
=
tmp_data2
;
}
}
__kernel
void
split_vector_C2_D1
(
__global
char
*mat_src,
int
src_step,
int
src_offset,
__global
char
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
char
*mat_dst1,
int
dst1_step,
int
dst1_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
2
;
#
define
dst0_align
((
dst0_offset
&
3
)
<<
1
)
#
define
dst1_align
((
dst1_offset
&
3
)
<<
1
)
int
src_idx_0
=
mad24
(
y,
src_step,
src_offset
-
dst0_align
+
(
x
<<
1
))
;
int
src_idx_1
=
mad24
(
y,
src_step,
src_offset
-
dst1_align
+
(
x
<<
1
))
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src_idx_0
<
0
?
0
:
src_idx_0
;
int
src2_index_fix
=
src_idx_1
<
0
?
0
:
src_idx_1
;
char8
src_data_0
=
vload8
(
0
,
mat_src
+
src_idx_0
)
;
char8
src_data_1
=
vload8
(
0
,
mat_src
+
src_idx_1
)
;
if
(
src_idx_0
==
-6
)
src_data_0.s01234567
=
src_data_0.s67012345
;
if
(
src_idx_0
==
-4
)
src_data_0.s01234567
=
src_data_0.s45670123
;
if
(
src_idx_0
==
-2
)
src_data_0.s01234567
=
src_data_0.s23456701
;
if
(
src_idx_1
==
-6
)
src_data_1.s01234567
=
src_data_1.s67012345
;
if
(
src_idx_1
==
-4
)
src_data_1.s01234567
=
src_data_1.s45670123
;
if
(
src_idx_1
==
-2
)
src_data_1.s01234567
=
src_data_1.s23456701
;
char4
dst0_data
=
*
((
__global
char4
*
)(
mat_dst0
+
dst0_idx
))
;
char4
dst1_data
=
*
((
__global
char4
*
)(
mat_dst1
+
dst1_idx
))
;
char4
tmp_data0,
tmp_data1
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
src_data_0.s0
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
1
>=
dst0_start
)
&&
(
dst0_idx
+
1
<
dst0_end
))
?
src_data_0.s2
:
dst0_data.y
;
tmp_data0.z
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
src_data_0.s4
:
dst0_data.z
;
tmp_data0.w
=
((
dst0_idx
+
3
>=
dst0_start
)
&&
(
dst0_idx
+
3
<
dst0_end
))
?
src_data_0.s6
:
dst0_data.w
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
src_data_1.s1
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
1
>=
dst1_start
)
&&
(
dst1_idx
+
1
<
dst1_end
))
?
src_data_1.s3
:
dst1_data.y
;
tmp_data1.z
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
src_data_1.s5
:
dst1_data.z
;
tmp_data1.w
=
((
dst1_idx
+
3
>=
dst1_start
)
&&
(
dst1_idx
+
3
<
dst1_end
))
?
src_data_1.s7
:
dst1_data.w
;
*
((
__global
char4
*
)(
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
__global
char4
*
)(
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
}
}
__kernel
void
split_vector_C4_D2
(
__global
ushort
*mat_src,
int
src_step,
int
src_offset,
__global
ushort
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
ushort
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
ushort
*mat_dst2,
int
dst2_step,
int
dst2_offset,
__global
ushort
*mat_dst3,
int
dst3_step,
int
dst3_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
1
;
int
src_idx_0
=
mad24
(
y,
src_step,
src_offset
+
(
x
<<
3
)
-
8
)
;
int
src_idx_1
=
mad24
(
y,
src_step,
src_offset
+
(
x
<<
3
)
+
8
)
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst2_start
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst2_end
=
mad24
(
y,
dst2_step,
dst2_offset
+
dst_step1
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst3_start
=
mad24
(
y,
dst3_step,
dst3_offset
)
;
int
dst3_end
=
mad24
(
y,
dst3_step,
dst3_offset
+
dst_step1
)
;
int
dst3_idx
=
mad24
(
y,
dst3_step,
dst3_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src_idx_0
<
0
?
0
:
src_idx_0
;
ushort8
src_data0
=
vload8
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx_0
))
;
if
(
src_idx_0
==
-6
)
src_data0.s01234567
=
src_data0.s67012345
;
if
(
src_idx_0
==
-4
)
src_data0.s01234567
=
src_data0.s45670123
;
if
(
src_idx_0
==
-2
)
src_data0.s01234567
=
src_data0.s23456701
;
ushort4
src_data1
=
*
((
__global
ushort4
*
)((
__global
char
*
)
mat_src
+
src_idx_1
))
;
ushort2
dst0_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
;
ushort2
dst1_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
;
ushort2
dst2_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
;
ushort2
dst3_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst3
+
dst3_idx
))
;
ushort2
tmp_data0,
tmp_data1,
tmp_data2,
tmp_data3
;
tmp_data0
=
(
dst0_offset
&
3
)
==
0
?
(
ushort2
)(
src_data0.s4,
src_data1.s0
)
:
(
ushort2
)(
src_data0.s0,
src_data0.s4
)
;
tmp_data1
=
(
dst1_offset
&
3
)
==
0
?
(
ushort2
)(
src_data0.s5,
src_data1.s1
)
:
(
ushort2
)(
src_data0.s1,
src_data0.s5
)
;
tmp_data2
=
(
dst2_offset
&
3
)
==
0
?
(
ushort2
)(
src_data0.s6,
src_data1.s2
)
:
(
ushort2
)(
src_data0.s2,
src_data0.s6
)
;
tmp_data3
=
(
dst3_offset
&
3
)
==
0
?
(
ushort2
)(
src_data0.s7,
src_data1.s3
)
:
(
ushort2
)(
src_data0.s3,
src_data0.s7
)
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
tmp_data0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
tmp_data0.y
:
dst0_data.y
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
tmp_data1.x
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
tmp_data1.y
:
dst1_data.y
;
tmp_data2.x
=
((
dst2_idx
+
0
>=
dst2_start
)
&&
(
dst2_idx
+
0
<
dst2_end
))
?
tmp_data2.x
:
dst2_data.x
;
tmp_data2.y
=
((
dst2_idx
+
2
>=
dst2_start
)
&&
(
dst2_idx
+
2
<
dst2_end
))
?
tmp_data2.y
:
dst2_data.y
;
tmp_data3.x
=
((
dst3_idx
+
0
>=
dst3_start
)
&&
(
dst3_idx
+
0
<
dst3_end
))
?
tmp_data3.x
:
dst3_data.x
;
tmp_data3.y
=
((
dst3_idx
+
2
>=
dst3_start
)
&&
(
dst3_idx
+
2
<
dst3_end
))
?
tmp_data3.y
:
dst3_data.y
;
*
((
global
ushort2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
global
ushort2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
*
((
global
ushort2
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
=
tmp_data2
;
*
((
global
ushort2
*
)((
__global
char
*
)
mat_dst3
+
dst3_idx
))
=
tmp_data3
;
}
}
__kernel
void
split_vector_C3_D2
(
__global
ushort
*mat_src,
int
src_step,
int
src_offset,
__global
ushort
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
ushort
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
ushort
*mat_dst2,
int
dst2_step,
int
dst2_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
1
;
int
src_idx
=
mad24
(
y,
src_step,
src_offset
)
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst2_start
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst2_end
=
mad24
(
y,
dst2_step,
dst2_offset
+
dst_step1
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
ushort2
dst0_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
;
ushort2
dst1_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
;
ushort2
dst2_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
;
ushort2
tmp_data0,
tmp_data1,
tmp_data2
;
ushort
src_data_0
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
-
3]
;
ushort
src_data_1
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
-
2]
;
ushort
src_data_2
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
-
1]
;
ushort
src_data_3
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
0]
;
ushort
src_data_4
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
1]
;
ushort
src_data_5
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
2]
;
ushort
src_data_6
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
3]
;
ushort
src_data_7
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
4]
;
ushort
src_data_8
=
((
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
5]
;
tmp_data0
=
(
dst0_offset
&
3
)
==
0
?
(
ushort2
)(
src_data_3,
src_data_6
)
:
(
ushort2
)(
src_data_0,
src_data_3
)
;
tmp_data1
=
(
dst1_offset
&
3
)
==
0
?
(
ushort2
)(
src_data_4,
src_data_7
)
:
(
ushort2
)(
src_data_1,
src_data_4
)
;
tmp_data2
=
(
dst2_offset
&
3
)
==
0
?
(
ushort2
)(
src_data_5,
src_data_8
)
:
(
ushort2
)(
src_data_2,
src_data_5
)
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
tmp_data0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
tmp_data0.y
:
dst0_data.y
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
tmp_data1.x
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
tmp_data1.y
:
dst1_data.y
;
tmp_data2.x
=
((
dst2_idx
+
0
>=
dst2_start
)
&&
(
dst2_idx
+
0
<
dst2_end
))
?
tmp_data2.x
:
dst2_data.x
;
tmp_data2.y
=
((
dst2_idx
+
2
>=
dst2_start
)
&&
(
dst2_idx
+
2
<
dst2_end
))
?
tmp_data2.y
:
dst2_data.y
;
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
=
tmp_data2
;
}
}
__kernel
void
split_vector_C2_D2
(
__global
ushort
*mat_src,
int
src_step,
int
src_offset,
__global
ushort
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
ushort
*mat_dst1,
int
dst1_step,
int
dst1_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
1
;
#
define
dst0_align
((
dst0_offset
&
3
)
<<
1
)
#
define
dst1_align
((
dst1_offset
&
3
)
<<
1
)
int
src_idx_0
=
mad24
(
y,
src_step,
src_offset
-
dst0_align
+
(
x
<<
2
))
;
int
src_idx_1
=
mad24
(
y,
src_step,
src_offset
-
dst1_align
+
(
x
<<
2
))
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src_idx_0
<
0
?
0
:
src_idx_0
;
int
src2_index_fix
=
src_idx_1
<
0
?
0
:
src_idx_1
;
ushort4
src_data_0
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src1_index_fix
))
;
ushort4
src_data_1
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
mat_src
+
src2_index_fix
))
;
if
(
src_idx_0
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src_idx_0
==
-2
)
?
src_data_0.zwxy
:
src_data_0.yzwx
;
src_data_0.xyzw
=
(
src_idx_1
==
-1
)
?
src_data_0.wxyz:tmp.xyzw
;
}
if
(
src_idx_1
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src_idx_1
==
-2
)
?
src_data_1.zwxy
:
src_data_1.yzwx
;
src_data_1.xyzw
=
(
src_idx_1
==
-1
)
?
src_data_1.wxyz
:
tmp.xyzw
;
}
ushort2
dst0_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
;
ushort2
dst1_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
;
ushort2
tmp_data0,
tmp_data1
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
src_data_0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
src_data_0.z
:
dst0_data.y
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
src_data_1.y
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
src_data_1.w
:
dst1_data.y
;
*
((
global
ushort2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
global
ushort2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
}
}
__kernel
void
split_vector_C4_D3
(
__global
short
*mat_src,
int
src_step,
int
src_offset,
__global
short
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
short
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
short
*mat_dst2,
int
dst2_step,
int
dst2_offset,
__global
short
*mat_dst3,
int
dst3_step,
int
dst3_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
1
;
int
src_idx_0
=
mad24
(
y,
src_step,
src_offset
+
(
x
<<
3
)
-
8
)
;
int
src_idx_1
=
mad24
(
y,
src_step,
src_offset
+
(
x
<<
3
)
+
8
)
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst2_start
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst2_end
=
mad24
(
y,
dst2_step,
dst2_offset
+
dst_step1
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst3_start
=
mad24
(
y,
dst3_step,
dst3_offset
)
;
int
dst3_end
=
mad24
(
y,
dst3_step,
dst3_offset
+
dst_step1
)
;
int
dst3_idx
=
mad24
(
y,
dst3_step,
dst3_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src_idx_0
<
0
?
0
:
src_idx_0
;
short8
src_data0
=
vload8
(
0
,
(
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx_0
))
;
if
(
src_idx_0
==
-6
)
src_data0.s01234567
=
src_data0.s67012345
;
if
(
src_idx_0
==
-4
)
src_data0.s01234567
=
src_data0.s45670123
;
if
(
src_idx_0
==
-2
)
src_data0.s01234567
=
src_data0.s23456701
;
short4
src_data1
=
*
((
__global
short4
*
)((
__global
char
*
)
mat_src
+
src_idx_1
))
;
short2
dst0_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
;
short2
dst1_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
;
short2
dst2_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
;
short2
dst3_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst3
+
dst3_idx
))
;
short2
tmp_data0,
tmp_data1,
tmp_data2,
tmp_data3
;
tmp_data0
=
(
dst0_offset
&
3
)
==
0
?
(
short2
)(
src_data0.s4,
src_data1.s0
)
:
(
short2
)(
src_data0.s0,
src_data0.s4
)
;
tmp_data1
=
(
dst1_offset
&
3
)
==
0
?
(
short2
)(
src_data0.s5,
src_data1.s1
)
:
(
short2
)(
src_data0.s1,
src_data0.s5
)
;
tmp_data2
=
(
dst2_offset
&
3
)
==
0
?
(
short2
)(
src_data0.s6,
src_data1.s2
)
:
(
short2
)(
src_data0.s2,
src_data0.s6
)
;
tmp_data3
=
(
dst3_offset
&
3
)
==
0
?
(
short2
)(
src_data0.s7,
src_data1.s3
)
:
(
short2
)(
src_data0.s3,
src_data0.s7
)
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
tmp_data0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
tmp_data0.y
:
dst0_data.y
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
tmp_data1.x
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
tmp_data1.y
:
dst1_data.y
;
tmp_data2.x
=
((
dst2_idx
+
0
>=
dst2_start
)
&&
(
dst2_idx
+
0
<
dst2_end
))
?
tmp_data2.x
:
dst2_data.x
;
tmp_data2.y
=
((
dst2_idx
+
2
>=
dst2_start
)
&&
(
dst2_idx
+
2
<
dst2_end
))
?
tmp_data2.y
:
dst2_data.y
;
tmp_data3.x
=
((
dst3_idx
+
0
>=
dst3_start
)
&&
(
dst3_idx
+
0
<
dst3_end
))
?
tmp_data3.x
:
dst3_data.x
;
tmp_data3.y
=
((
dst3_idx
+
2
>=
dst3_start
)
&&
(
dst3_idx
+
2
<
dst3_end
))
?
tmp_data3.y
:
dst3_data.y
;
*
((
global
short2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
global
short2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
*
((
global
short2
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
=
tmp_data2
;
*
((
global
short2
*
)((
__global
char
*
)
mat_dst3
+
dst3_idx
))
=
tmp_data3
;
}
}
__kernel
void
split_vector_C3_D3
(
__global
short
*mat_src,
int
src_step,
int
src_offset,
__global
short
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
short
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
short
*mat_dst2,
int
dst2_step,
int
dst2_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
x
=
x
<<
1
;
int
src_idx
=
mad24
(
y,
src_step,
src_offset
)
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst2_start
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst2_end
=
mad24
(
y,
dst2_step,
dst2_offset
+
dst_step1
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
#
if
DATA_DEPTH
==
0
#
define
BASE_TYPE
uchar
#
elif
DATA_DEPTH
==
1
#
error
data_depth
char,
use
uchar
datatype
instead
#
elif
DATA_DEPTH
==
2
#
define
BASE_TYPE
ushort
#
elif
DATA_DEPTH
==
3
#
error
data_depth
short,
use
ushort
datatype
instead
#
elif
DATA_DEPTH
==
4
#
define
BASE_TYPE
int
#
elif
DATA_DEPTH
==
5
#
define
BASE_TYPE
float
#
elif
DATA_DEPTH
==
6
#
define
BASE_TYPE
double
#
else
#
error
data_depth
#
endif
short2
dst0_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
;
short2
dst1_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
;
short2
dst2_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
;
#
if
DATA_CHAN
==
2
#
define
SRC_VEC_SIZE
2
#
elif
DATA_CHAN
==
3
#
define
SRC_VEC_SIZE
4
//
C3
is
stored
as
C4
#
elif
DATA_CHAN
==
4
#
define
SRC_VEC_SIZE
4
#
else
#
error
data_chan
#
endif
short2
tmp_data0,
tmp_data1,
tmp_data2
;
#
define
__CAT
(
x,
y
)
x##y
#
define
CAT
(
x,
y
)
__CAT
(
x,
y
)
short
src_data_0
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
-
3]
;
short
src_data_1
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
-
2]
;
short
src_data_2
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
-
1]
;
short
src_data_3
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
0]
;
short
src_data_4
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
1]
;
short
src_data_5
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
2]
;
short
src_data_6
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
3]
;
short
src_data_7
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
4]
;
short
src_data_8
=
((
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
5]
;
#
define
uchar1
uchar
#
define
char1
char
#
define
ushort1
ushort
#
define
short1
short
#
define
int1
int
#
define
float1
float
#
define
double1
double
tmp_data0
=
(
dst0_offset
&
3
)
==
0
?
(
short2
)(
src_data_3,
src_data_6
)
:
(
short2
)(
src_data_0,
src_data_3
)
;
tmp_data1
=
(
dst1_offset
&
3
)
==
0
?
(
short2
)(
src_data_4,
src_data_7
)
:
(
short2
)(
src_data_1,
src_data_4
)
;
tmp_data2
=
(
dst2_offset
&
3
)
==
0
?
(
short2
)(
src_data_5,
src_data_8
)
:
(
short2
)(
src_data_2,
src_data_5
)
;
#
define
TYPE
BASE_TYPE
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
tmp_data0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
tmp_data0.y
:
dst0_data.y
;
#
define
SRC_TYPE
CAT
(
BASE_TYPE,
SRC_VEC_SIZE
)
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
tmp_data1.x
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
tmp_data1.y
:
dst1_data.y
;
#
define
DST_VEC_TYPE
CAT
(
BASE_TYPE,
VEC_SIZE
)
tmp_data2.x
=
((
dst2_idx
+
0
>=
dst2_start
)
&&
(
dst2_idx
+
0
<
dst2_end
))
?
tmp_data2.x
:
dst2_data.x
;
tmp_data2.y
=
((
dst2_idx
+
2
>=
dst2_start
)
&&
(
dst2_idx
+
2
<
dst2_end
))
?
tmp_data2.y
:
dst2_data.y
;
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
=
tmp_data2
;
}
}
#define vstore1 vstore
#define VSTORE CAT(vstore, VEC_SIZE)

#define VSTORE_ALIGNED(ptr, v) *((__global DST_VEC_TYPE*)(ptr)) = (v)
#define VSTORE_UNALIGNED(ptr, v) VSTORE((v), 0, (__global TYPE*)(ptr))

#ifdef DST0_ALIGNED
#define VSTORE_dst0 VSTORE_ALIGNED
#else
#define VSTORE_dst0 VSTORE_UNALIGNED
#endif

#ifdef DST1_ALIGNED
#define VSTORE_dst1 VSTORE_ALIGNED
#else
#define VSTORE_dst1 VSTORE_UNALIGNED
#endif

#ifdef DST2_ALIGNED
#define VSTORE_dst2 VSTORE_ALIGNED
#else
#define VSTORE_dst2 VSTORE_UNALIGNED
#endif

#ifdef DST3_ALIGNED
#define VSTORE_dst3 VSTORE_ALIGNED
#else
#define VSTORE_dst3 VSTORE_UNALIGNED
#endif
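The generic kernel introduced by this PR is configured entirely through preprocessor definitions (DATA_DEPTH, DATA_CHAN, VEC_SIZE, DSTn_ALIGNED) supplied as OpenCL build options. The exact option string is assembled by the host code in modules/ocl/src/split_merge.cpp, which is not shown in this excerpt, so the values below are illustrative only:

```cpp
// Illustrative build options for the new generic split kernel: an 8-bit
// (CV_8U), 3-channel source, 4-element vectorized stores, and 4-byte-aligned
// dst0/dst1.  The real string is built by split_merge.cpp at run time.
const char *buildOptions =
    "-D DATA_DEPTH=0 "   // 0 -> uchar; see the #if DATA_DEPTH chain earlier in the file
    "-D DATA_CHAN=3 "    // number of source channels
    "-D VEC_SIZE=4 "     // elements handled per work-item per destination
    "-D DST0_ALIGNED "   // pick VSTORE_ALIGNED for dst0
    "-D DST1_ALIGNED";   // pick VSTORE_ALIGNED for dst1
```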
__kernel
void
split_vector_C2_D3
(
__global
short
*mat_src,
int
src_step,
int
src_offset,
__global
short
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
short
*mat_dst1,
int
dst1_step,
int
dst1_offset,
int
rows,
int
cols,
int
dst_step1
)
__kernel
void
split_vector
(
__global
SRC_TYPE*
src,
int
srcStepBytes,
int2
srcOffset,
//
offset.x
in
bytes
__global
TYPE*
dst0,
int
dst0StepBytes,
int2
dst0Offset,
__global
TYPE*
dst1,
int
dst1StepBytes,
int2
dst1Offset,
#
if
DATA_CHAN
>
2
__global
TYPE*
dst2,
int
dst2StepBytes,
int2
dst2Offset,
#
endif
#
if
DATA_CHAN
>
3
__global
TYPE*
dst3,
int
dst3StepBytes,
int2
dst3Offset,
#
endif
int2
size
)
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
*
VEC_SIZE
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
)
)
if
(
x
<
size.x
&&
y
<
size.y
)
{
x
=
x
<<
1
;
#
define
dst0_align
((
dst0_offset
&
3
)
<<
1
)
#
define
dst1_align
((
dst1_offset
&
3
)
<<
1
)
int
src_idx_0
=
mad24
(
y,
src_step,
src_offset
-
dst0_align
+
(
x
<<
2
))
;
int
src_idx_1
=
mad24
(
y,
src_step,
src_offset
-
dst1_align
+
(
x
<<
2
))
;
int
dst0_start
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst0_end
=
mad24
(
y,
dst0_step,
dst0_offset
+
dst_step1
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
dst1_start
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst1_end
=
mad24
(
y,
dst1_step,
dst1_offset
+
dst_step1
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src_idx_0
<
0
?
0
:
src_idx_0
;
int
src2_index_fix
=
src_idx_1
<
0
?
0
:
src_idx_1
;
short4
src_data_0
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx_0
))
;
short4
src_data_1
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
mat_src
+
src_idx_1
))
;
if
(
src_idx_0
<
0
)
SRC_TYPE
srcData[VEC_SIZE]
;
int
xOffsetLimitBytes
=
srcOffset.x
+
size.x
*
sizeof
(
SRC_TYPE
)
;
int
xOffsetBytes
=
srcOffset.x
+
x
*
sizeof
(
SRC_TYPE
)
;
int
yOffsetBytes
=
(
srcOffset.y
+
y
)
*
srcStepBytes
;
#
pragma
unroll
for
(
int
i
=
0
; i < VEC_SIZE; i++, xOffsetBytes += sizeof(SRC_TYPE))
{
short4
tmp
;
tmp.xyzw
=
(
src_idx_0
==
-2
)
?
src_data_0.zwxy
:
src_data_0.yzwx
;
src_data_0.xyzw
=
(
src_idx_0
==
-1
)
?
src_data_0.wxyz:tmp.xyzw
;
srcData[i]
=
(
xOffsetBytes
>=
xOffsetLimitBytes
)
?
(
SRC_TYPE
)
0
:
*
(
__global
SRC_TYPE*
)((
__global
char*
)
src
+
yOffsetBytes
+
xOffsetBytes
)
;
}
if
(
src_idx_1<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src_idx_1==
-2
)
?
src_data_1.zwxy
:
src_data_1.yzwx
;
src_data_1.xyzw
=
(
src_idx_1==
-1
)
?
src_data_1.wxyz
:
tmp.xyzw
;
}
short2
dst0_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
;
short2
dst1_data
=
*
((
__global
short2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
;
short2
tmp_data0,
tmp_data1
;
tmp_data0.x
=
((
dst0_idx
+
0
>=
dst0_start
)
&&
(
dst0_idx
+
0
<
dst0_end
))
?
src_data_0.x
:
dst0_data.x
;
tmp_data0.y
=
((
dst0_idx
+
2
>=
dst0_start
)
&&
(
dst0_idx
+
2
<
dst0_end
))
?
src_data_0.z
:
dst0_data.y
;
tmp_data1.x
=
((
dst1_idx
+
0
>=
dst1_start
)
&&
(
dst1_idx
+
0
<
dst1_end
))
?
src_data_1.y
:
dst1_data.x
;
tmp_data1.y
=
((
dst1_idx
+
2
>=
dst1_start
)
&&
(
dst1_idx
+
2
<
dst1_end
))
?
src_data_1.w
:
dst1_data.y
;
*
((
global
short2
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
=
tmp_data0
;
*
((
global
short2
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
=
tmp_data1
;
}
}
__kernel
void
split_vector_C4_D4
(
__global
int
*mat_src,
int
src_step,
int
src_offset,
__global
int
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
int
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
int
*mat_dst2,
int
dst2_step,
int
dst2_offset,
__global
int
*mat_dst3,
int
dst3_step,
int
dst3_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
int
src_idx
=
mad24
(
y,
src_step,
src_offset
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst3_idx
=
mad24
(
y,
dst3_step,
dst3_offset
)
;
int4
src_data
=
((
__global
int4
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[x]
;
((
__global
int
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
[x]
=
src_data.x
;
((
__global
int
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
[x]
=
src_data.y
;
((
__global
int
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
[x]
=
src_data.z
;
((
__global
int
*
)((
__global
char
*
)
mat_dst3
+
dst3_idx
))
[x]
=
src_data.w
;
}
}
__kernel
void
split_vector_C3_D4
(
__global
int
*mat_src,
int
src_step,
int
src_offset,
__global
int
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
int
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
int
*mat_dst2,
int
dst2_step,
int
dst2_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
int
src_idx
=
mad24
(
y,
src_step,
src_offset
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
src_data_0
=
((
__global
int
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
0]
;
int
src_data_1
=
((
__global
int
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
1]
;
int
src_data_2
=
((
__global
int
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[3
*
x
+
2]
;
((
__global
int
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
[x]
=
src_data_0
;
((
__global
int
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
[x]
=
src_data_1
;
((
__global
int
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
[x]
=
src_data_2
;
}
}
__kernel
void
split_vector_C2_D4
(
__global
int
*mat_src,
int
src_step,
int
src_offset,
__global
int
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
int
*mat_dst1,
int
dst1_step,
int
dst1_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
int
src_idx
=
mad24
(
y,
src_step,
src_offset
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int2
src_data
=
((
__global
int2
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[x]
;
((
__global
int
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
[x]
=
src_data.x
;
((
__global
int
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
[x]
=
src_data.y
;
}
}
__kernel
void
split_vector_C4_D5
(
__global
float
*mat_src,
int
src_step,
int
src_offset,
__global
float
*mat_dst0,
int
dst0_step,
int
dst0_offset,
__global
float
*mat_dst1,
int
dst1_step,
int
dst1_offset,
__global
float
*mat_dst2,
int
dst2_step,
int
dst2_offset,
__global
float
*mat_dst3,
int
dst3_step,
int
dst3_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
((
x
<
cols
)
&&
(
y
<
rows
))
{
int
src_idx
=
mad24
(
y,
src_step,
src_offset
)
;
int
dst0_idx
=
mad24
(
y,
dst0_step,
dst0_offset
)
;
int
dst1_idx
=
mad24
(
y,
dst1_step,
dst1_offset
)
;
int
dst2_idx
=
mad24
(
y,
dst2_step,
dst2_offset
)
;
int
dst3_idx
=
mad24
(
y,
dst3_step,
dst3_offset
)
;
float4
src_data
=
((
__global
float4
*
)((
__global
char
*
)
mat_src
+
src_idx
))
[x]
;
((
__global
float
*
)((
__global
char
*
)
mat_dst0
+
dst0_idx
))
[x]
=
src_data.x
;
((
__global
float
*
)((
__global
char
*
)
mat_dst1
+
dst1_idx
))
[x]
=
src_data.y
;
((
__global
float
*
)((
__global
char
*
)
mat_dst2
+
dst2_idx
))
[x]
=
src_data.z
;
((
__global
float
*
)((
__global
char
*
)
mat_dst3
+
dst3_idx
))
[x]
=
src_data.w
;
}
}
__kernel void split_vector_C3_D5 (__global float *mat_src,  int src_step,  int src_offset,
                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
                                  __global float *mat_dst2, int dst2_step, int dst2_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src_idx  = mad24(y, src_step,  src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);

        float src_data_0 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 0];
        float src_data_1 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 1];
        float src_data_2 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 2];

        ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
        ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
        ((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
    }
}
__kernel void split_vector_C2_D5 (__global float *mat_src,  int src_step,  int src_offset,
                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src_idx  = mad24(y, src_step,  src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);

        float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];

        ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
        ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void split_vector_C4_D6 (__global double *mat_src,  int src_step,  int src_offset,
                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
                                  __global double *mat_dst2, int dst2_step, int dst2_offset,
                                  __global double *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src_idx  = mad24(y, src_step,  src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
        int dst3_idx = mad24(y, dst3_step, dst3_offset);

        double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];

        ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
        ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
        ((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
        ((__global double *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
    }
}
__kernel void split_vector_C3_D6 (__global double *mat_src,  int src_step,  int src_offset,
                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
                                  __global double *mat_dst2, int dst2_step, int dst2_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src_idx  = mad24(y, src_step,  src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);

        double src_data_0 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 0];
        double src_data_1 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 1];
        double src_data_2 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 2];

        ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
        ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
        ((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
    }
}
__kernel void split_vector_C2_D6 (__global double *mat_src,  int src_step,  int src_offset,
                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src_idx  = mad24(y, src_step,  src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);

#if VEC_SIZE == 1
    TYPE dstC0 = srcData[0].s0;
    TYPE dstC1 = srcData[0].s1;
#if DATA_CHAN > 2
    TYPE dstC2 = srcData[0].s2;
#endif
#if DATA_CHAN > 3
    TYPE dstC3 = srcData[0].s3;
#endif
#define VEC_TO_ARRAY(v, a) TYPE a[1] = {v};
#elif VEC_SIZE == 2
    DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0);
    DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1);
#if DATA_CHAN > 2
    DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2);
#endif
#if DATA_CHAN > 3
    DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3);
#endif
#define VEC_TO_ARRAY(v, a) TYPE a[2] = {v.s0, v.s1};
#elif VEC_SIZE == 4
    DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0, srcData[2].s0, srcData[3].s0);
    DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1, srcData[2].s1, srcData[3].s1);
#if DATA_CHAN > 2
    DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2, srcData[2].s2, srcData[3].s2);
#endif
#if DATA_CHAN > 3
    DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3, srcData[2].s3, srcData[3].s3);
#endif
#define VEC_TO_ARRAY(v, a) TYPE a[4] = {v.s0, v.s1, v.s2, v.s3};
#endif

        double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];

#ifndef BYPASS_VSTORE
#define BYPASS_VSTORE false
#endif

        ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
        ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;

#define WRITE_VEC_DST(dst, vecValue) \
    { \
        int dst ## xOffsetLimitBytes = dst ## Offset.x + size.x * sizeof(TYPE); \
        int dst ## xOffsetBytes = dst ## Offset.x + x * sizeof(TYPE); \
        int dst ## yOffsetBytes = (dst ## Offset.y + y) * dst ## StepBytes; \
        if (!BYPASS_VSTORE && dst ## xOffsetBytes + sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \
        { \
            VSTORE_ ## dst(((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes), vecValue); \
        } \
        else \
        { \
            VEC_TO_ARRAY(vecValue, vecValue##Array); \
            for (int i = 0; i < VEC_SIZE; i++, dst ## xOffsetBytes += sizeof(TYPE)) \
            { \
                if (dst ## xOffsetBytes + sizeof(TYPE) <= dst ## xOffsetLimitBytes) \
                    *(__global TYPE*)((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes) = vecValue##Array[i]; \
                else \
                    break; \
            } \
        } \
    }

        WRITE_VEC_DST(dst0, dstC0);
        WRITE_VEC_DST(dst1, dstC1);
#if DATA_CHAN > 2
        WRITE_VEC_DST(dst2, dstC2);
#endif
#if DATA_CHAN > 3
        WRITE_VEC_DST(dst3, dstC3);
#endif
    }
}
#endif
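The WRITE_VEC_DST macro above carries the new bounds handling: when an entire VEC_SIZE-wide chunk fits inside the destination ROI row it is written with a single vector store, otherwise (or when BYPASS_VSTORE is set) it falls back to element-wise stores that stop at the row limit. A rough host-side C++ model of that logic, for illustration only; the function and its names are not part of the module:

#include <cstring>

// Illustrative model of WRITE_VEC_DST, not part of split_mat.cl.
// dst_row: start of the destination row; x_offset / x_limit: byte positions inside the row.
template <typename T, int VEC_SIZE>
void write_vec_dst(char *dst_row, int x_offset, int x_limit, const T (&value)[VEC_SIZE],
                   bool bypass_vstore = false)
{
    if (!bypass_vstore && x_offset + (int)sizeof(T) * VEC_SIZE <= x_limit)
    {
        // vstore path: the whole vector fits before the end of the ROI row
        std::memcpy(dst_row + x_offset, value, sizeof(T) * VEC_SIZE);
    }
    else
    {
        // scalar fallback near the right edge (or when the vector store is bypassed)
        for (int i = 0; i < VEC_SIZE && x_offset + (int)sizeof(T) <= x_limit;
             ++i, x_offset += (int)sizeof(T))
            std::memcpy(dst_row + x_offset, &value[i], sizeof(T));
    }
}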
modules/ocl/src/safe_call.hpp
View file @
c70fbb95
...
...
@@ -66,7 +66,7 @@ namespace cv
static inline void ___openCLSafeCall(int err, const char *file, const int line, const char *func = "")
{
    if (CL_SUCCESS != err)
    if (CL_SUCCESS != err)
        cv::ocl::error(getOpenCLErrorString(err), file, line, func);
}
}
...
...
modules/ocl/src/split_merge.cpp
View file @
c70fbb95
...
...
@@ -149,90 +149,128 @@ namespace cv
        mat_dst.create(size, CV_MAKETYPE(depth, total_channels));
        merge_vector_run(mat_src, n, mat_dst);
    }

    static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
    static void split_vector_run(const oclMat &src, oclMat *dst)
    {
        if (!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F)
        if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
        {
            CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
            return;
        }

        Context *clCxt = mat_src.clCxt;
        int channels = mat_src.oclchannels();
        int depth = mat_src.depth();
        Context *clCtx = src.clCxt;
        int channels = src.channels();
        int depth = src.depth();

        depth = (depth == CV_8S) ? CV_8U : depth;
        depth = (depth == CV_16S) ? CV_16U : depth;

        string kernelName = "split_vector";

        int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
            {4, 4, 2, 2, 1, 1, 1},
            {4, 4, 2, 2, 1, 1, 1},
            {4, 4, 2, 2, 1, 1, 1}
        };

        size_t vector_length = vector_lengths[channels - 1][mat_dst[0].depth()];

        int max_offset_cols = 0;
        for (int i = 0; i < channels; i++)
        {
            int offset_cols = (mat_dst[i].offset / mat_dst[i].elemSize()) & (vector_length - 1);
            if (max_offset_cols < offset_cols)
                max_offset_cols = offset_cols;
        }

        int cols = vector_length == 1 ? divUp(mat_src.cols, vector_length)
                   : divUp(mat_src.cols + max_offset_cols, vector_length);

        size_t localThreads[3]  = { 64, 4, 1 };
        size_t globalThreads[3] = { cols, mat_src.rows, 1 };
        size_t VEC_SIZE = 4;

        int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize();
        vector<pair<size_t , const void *> > args;
        args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_src.data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&mat_src.step));
        args.push_back(make_pair(sizeof(cl_int), (void *)&mat_src.offset));
        args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_dst[0].data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&mat_dst[0].step));
        args.push_back(make_pair(sizeof(cl_int), (void *)&mat_dst[0].offset));
        args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_dst[1].data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&mat_dst[1].step));
        args.push_back(make_pair(sizeof(cl_int), (void *)&mat_dst[1].offset));
        if (channels >= 3)
        args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&src.step));
        int srcOffsetXBytes = src.offset % src.step;
        int srcOffsetY = src.offset / src.step;
        cl_int2 srcOffset = {{srcOffsetXBytes, srcOffsetY}};
        args.push_back(make_pair(sizeof(cl_int2), (void *)&srcOffset));

        bool dst0Aligned = false, dst1Aligned = false, dst2Aligned = false, dst3Aligned = false;
        int alignSize = dst[0].elemSize1() * VEC_SIZE;
        int alignMask = alignSize - 1;

        args.push_back(make_pair(sizeof(cl_mem), (void *)&dst[0].data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&dst[0].step));
        int dst0OffsetXBytes = dst[0].offset % dst[0].step;
        int dst0OffsetY = dst[0].offset / dst[0].step;
        cl_int2 dst0Offset = {{dst0OffsetXBytes, dst0OffsetY}};
        args.push_back(make_pair(sizeof(cl_int2), (void *)&dst0Offset));
        if ((dst0OffsetXBytes & alignMask) == 0)
            dst0Aligned = true;

        args.push_back(make_pair(sizeof(cl_mem), (void *)&dst[1].data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&dst[1].step));
        int dst1OffsetXBytes = dst[1].offset % dst[1].step;
        int dst1OffsetY = dst[1].offset / dst[1].step;
        cl_int2 dst1Offset = {{dst1OffsetXBytes, dst1OffsetY}};
        args.push_back(make_pair(sizeof(cl_int2), (void *)&dst1Offset));
        if ((dst1OffsetXBytes & alignMask) == 0)
            dst1Aligned = true;

        // DON'T MOVE VARIABLES INTO 'IF' BODY
        int dst2OffsetXBytes, dst2OffsetY;
        cl_int2 dst2Offset;
        int dst3OffsetXBytes, dst3OffsetY;
        cl_int2 dst3Offset;
        if (channels >= 3)
        {
            args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_dst[2].data));
            args.push_back(make_pair(sizeof(cl_int), (void *)&mat_dst[2].step));
            args.push_back(make_pair(sizeof(cl_int), (void *)&mat_dst[2].offset));
            args.push_back(make_pair(sizeof(cl_mem), (void *)&dst[2].data));
            args.push_back(make_pair(sizeof(cl_int), (void *)&dst[2].step));
            dst2OffsetXBytes = dst[2].offset % dst[2].step;
            dst2OffsetY = dst[2].offset / dst[2].step;
            dst2Offset.s[0] = dst2OffsetXBytes;
            dst2Offset.s[1] = dst2OffsetY;
            args.push_back(make_pair(sizeof(cl_int2), (void *)&dst2Offset));
            if ((dst2OffsetXBytes & alignMask) == 0)
                dst2Aligned = true;
        }

        if (channels >= 4)
        if (channels >= 4)
        {
            args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_dst[3].data));
            args.push_back(make_pair(sizeof(cl_int), (void *)&mat_dst[3].step));
            args.push_back(make_pair(sizeof(cl_int), (void *)&mat_dst[3].offset));
            args.push_back(make_pair(sizeof(cl_mem), (void *)&dst[3].data));
            args.push_back(make_pair(sizeof(cl_int), (void *)&dst[3].step));
            dst3OffsetXBytes = dst[3].offset % dst[3].step;
            dst3OffsetY = dst[3].offset / dst[3].step;
            dst3Offset.s[0] = dst3OffsetXBytes;
            dst3Offset.s[1] = dst3OffsetY;
            args.push_back(make_pair(sizeof(cl_int2), (void *)&dst3Offset));
            if ((dst3OffsetXBytes & alignMask) == 0)
                dst3Aligned = true;
        }

        args.push_back(make_pair(sizeof(cl_int), (void *)&mat_src.rows));
        args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
        args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step1));
        openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
        cl_int2 size = {{src.cols, src.rows}};
        args.push_back(make_pair(sizeof(cl_int2), (void *)&size));

        string build_options = cv::format("-D VEC_SIZE=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d",
                                          (int)VEC_SIZE, depth, channels);

        if (dst0Aligned) build_options += " -D DST0_ALIGNED";
        if (dst1Aligned) build_options += " -D DST1_ALIGNED";
        if (dst2Aligned) build_options += " -D DST2_ALIGNED";
        if (dst3Aligned) build_options += " -D DST3_ALIGNED";

        const DeviceInfo &devInfo = clCtx->getDeviceInfo();

        // TODO Workaround for issues. Need to investigate a problem.
        if (channels == 2
                && devInfo.deviceType == CVCL_DEVICE_TYPE_CPU
                && devInfo.platform->platformVendor.find("Intel") != std::string::npos
                && (devInfo.deviceVersion.find("Build 56860") != std::string::npos
                    || devInfo.deviceVersion.find("Build 76921") != std::string::npos))
            build_options += " -D BYPASS_VSTORE=true";

        size_t globalThreads[3] = { divUp(src.cols, VEC_SIZE), src.rows, 1 };
        openCLExecuteKernel(clCtx, &split_mat, kernelName, globalThreads, NULL, args, -1, -1, build_options.c_str());
    }

    static void split(const oclMat &mat_src, oclMat *mat_dst)
    {
        CV_Assert(mat_dst);

        int depth = mat_src.depth();
        int num_channels = mat_src.oclchannels();
        int num_channels = mat_src.channels();
        Size size = mat_src.size();

        if (num_channels == 1)
        if (num_channels == 1)
        {
            mat_src.copyTo(mat_dst[0]);
            return;
        }

        int i;
        for (i = 0; i < num_channels; i++)
        for (int i = 0; i < mat_src.oclchannels(); i++)
            mat_dst[i].create(size, CV_MAKETYPE(depth, 1));

        split_vector_run(mat_src, mat_dst);
...
...
@@ -256,7 +294,7 @@ void cv::ocl::split(const oclMat &src, oclMat *dst)
}

void cv::ocl::split(const oclMat &src, vector<oclMat> &dst)
{
    dst.resize(src.oclchannels());
    dst.resize(src.oclchannels()); // TODO Why oclchannels?
    if (src.oclchannels() > 0)
        split_merge::split(src, &dst[0]);
}
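For reference, the public entry point exercised by this change is cv::ocl::split. A minimal usage sketch, assuming an OpenCV 2.4 build with the ocl module and a working OpenCL device; the input file name is hypothetical, and depending on the 2.4.x version some explicit device/context setup may be required first:

#include <vector>
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/ocl/ocl.hpp"

int main()
{
    cv::Mat bgr = cv::imread("input.png");       // hypothetical 3-channel image
    if (bgr.empty())
        return 1;

    cv::ocl::oclMat d_src(bgr);                  // upload to the OpenCL device
    std::vector<cv::ocl::oclMat> d_planes;
    cv::ocl::split(d_src, d_planes);             // dispatches the split_vector kernel

    cv::Mat blue;
    d_planes[0].download(blue);                  // copy one plane back to the host
    return blue.empty() ? 1 : 0;
}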
modules/ocl/test/test_split_merge.cpp
View file @
c70fbb95
...
...
@@ -158,81 +158,32 @@ PARAM_TEST_CASE(SplitTestBase, MatType, int, bool)
    int channels;
    bool use_roi;

    //src mat
    cv::Mat mat;

    //dstmat
    cv::Mat dst[MAX_CHANNELS];

    // set up roi
    int roicols, roirows;
    int srcx, srcy;
    int dstx[MAX_CHANNELS];
    int dsty[MAX_CHANNELS];

    //src mat with roi
    cv::Mat mat_roi;

    //dst mat with roi
    cv::Mat dst_roi[MAX_CHANNELS];
    cv::Mat src, src_roi;
    cv::Mat dst[MAX_CHANNELS], dst_roi[MAX_CHANNELS];

    //ocl dst mat for testing
    cv::ocl::oclMat gdst_whole[MAX_CHANNELS];

    //ocl mat with roi
    cv::ocl::oclMat gmat;
    cv::ocl::oclMat gdst[MAX_CHANNELS];
    cv::ocl::oclMat gsrc_whole, gsrc_roi;
    cv::ocl::oclMat gdst_whole[MAX_CHANNELS], gdst_roi[MAX_CHANNELS];

    virtual void SetUp()
    {
        type = GET_PARAM(0);
        channels = GET_PARAM(1);
        use_roi = GET_PARAM(2);

        cv::Size size(MWIDTH, MHEIGHT);

        mat = randomMat(size, CV_MAKETYPE(type, channels), 5, 16, false);
        for (int i = 0; i < channels; ++i)
            dst[i] = randomMat(size, CV_MAKETYPE(type, 1), 5, 16, false);
    }
    }

    void random_roi()
    {
        if (use_roi)
        {
            //randomize ROI
            roicols = rng.uniform(1, mat.cols);
            roirows = rng.uniform(1, mat.rows);
            srcx = rng.uniform(0, mat.cols - roicols);
            srcy = rng.uniform(0, mat.rows - roirows);

            for (int i = 0; i < channels; ++i)
            {
                dstx[i] = rng.uniform(0, dst[i].cols - roicols);
                dsty[i] = rng.uniform(0, dst[i].rows - roirows);
            }
        }
        else
        {
            roicols = mat.cols;
            roirows = mat.rows;
            srcx = srcy = 0;

            for (int i = 0; i < channels; ++i)
                dstx[i] = dsty[i] = 0;
        }

        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
        for (int i = 0; i < channels; ++i)
            dst_roi[i] = dst[i](Rect(dstx[i], dsty[i], roicols, roirows));
        Size roiSize = randomSize(1, MAX_VALUE);
        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(src, src_roi, roiSize, srcBorder, CV_MAKETYPE(type, channels), 0, 256);
        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);

        for (int i = 0; i < channels; ++i)
        {
            gdst_whole[i] = dst[i];
            gdst[i] = gdst_whole[i](Rect(dstx[i], dsty[i], roicols, roirows));
            Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
            randomSubMat(dst[i], dst_roi[i], roiSize, dstBorder, CV_MAKETYPE(type, 1), 5, 16);
            generateOclMat(gdst_whole[i], gdst_roi[i], dst[i], roiSize, dstBorder);
        }

        gmat = mat_roi;
    }
};
...
...
@@ -244,11 +195,14 @@ OCL_TEST_P(Split, Accuracy)
{
    random_roi();

    cv::split(mat_roi, dst_roi);
    cv::ocl::split(gmat, gdst);
    cv::split(src_roi, dst_roi);
    cv::ocl::split(gsrc_roi, gdst_roi);

    for (int i = 0; i < channels; ++i)
        EXPECT_MAT_NEAR(dst[i], Mat(gdst_whole[i]), 0.0);
    {
        EXPECT_MAT_NEAR(dst[i], gdst_whole[i], 0.0);
        EXPECT_MAT_NEAR(dst_roi[i], gdst_roi[i], 0.0);
    }
}
}
...
...
modules/ocl/test/utility.hpp
View file @
c70fbb95
...
...
@@ -88,14 +88,16 @@ inline double checkNormRelative(const Mat &m1, const Mat &m2)
{ \
ASSERT_EQ(mat1.type(), mat2.type()); \
ASSERT_EQ(mat1.size(), mat2.size()); \
EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps) \
<< cv::format("Size: %d x %d", mat1.cols, mat1.rows) << std::endl; \
}
#define EXPECT_MAT_NEAR_RELATIVE(mat1, mat2, eps) \
{ \
ASSERT_EQ(mat1.type(), mat2.type()); \
ASSERT_EQ(mat1.size(), mat2.size()); \
EXPECT_LE(checkNormRelative(cv::Mat(mat1), cv::Mat(mat2)), eps); \
EXPECT_LE(checkNormRelative(cv::Mat(mat1), cv::Mat(mat2)), eps) \
<< cv::format("Size: %d x %d", mat1.cols, mat1.rows) << std::endl; \
}
#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
...
...
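The utility.hpp change relies on a standard googletest idiom: anything streamed into a failed EXPECT_* becomes part of its failure message, so the matrix size is printed only when the norm check fails. A small self-contained sketch of that idiom; the test name and values below are made up:

#include <gtest/gtest.h>

// Hedged sketch of the idiom used by EXPECT_MAT_NEAR above: the streamed text is
// appended to the failure output only when the comparison fails, so passing runs stay quiet.
TEST(ExpectMatNearIdiom, StreamedFailureMessage)
{
    int cols = 640, rows = 480;      // hypothetical matrix dimensions
    double norm = 0.0, eps = 0.0;    // hypothetical comparison result
    EXPECT_LE(norm, eps) << "Size: " << cols << " x " << rows;
}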