Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
2c06e59a
Commit
2c06e59a
authored
Mar 26, 2013
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix some mismatch
parent
f428d187
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
439 additions
and
541 deletions
+439
-541
moments.cpp
modules/ocl/src/moments.cpp
+15
-39
moments.cl
modules/ocl/src/opencl/moments.cl
+212
-262
pyr_up.cl
modules/ocl/src/opencl/pyr_up.cl
+212
-240
No files found.
modules/ocl/src/moments.cpp
View file @
2c06e59a
...
...
@@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
bool
is_float
=
CV_SEQ_ELTYPE
(
contour
)
==
CV_32FC2
;
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
is_float
)
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
impl
->
double_support
&&
is_float
)
{
CV_Error
(
CV_StsUnsupportedFormat
,
"Moments - double is not supported by your GPU!"
);
}
...
...
@@ -143,10 +143,10 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_step
));
openCLExecuteKernel
(
dst_a
.
clCxt
,
&
moments
,
"icvContourMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
cv
::
Mat
dst
(
dst_a
);
a00
=
a10
=
a01
=
a20
=
a11
=
a02
=
a30
=
a21
=
a12
=
a03
=
0.0
;
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
supportsFeature
(
Context
::
CL_DOUBLE
)
)
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
impl
->
double_support
)
{
for
(
int
i
=
0
;
i
<
contour
->
total
;
++
i
)
{
...
...
@@ -161,7 +161,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
a12
+=
dst
.
at
<
cl_long
>
(
8
,
i
);
a03
+=
dst
.
at
<
cl_long
>
(
9
,
i
);
}
}
}
else
{
a00
=
cv
::
sum
(
dst
.
row
(
0
))[
0
];
...
...
@@ -277,16 +277,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
blocky
=
size
.
height
/
TILE_SIZE
;
else
blocky
=
size
.
height
/
TILE_SIZE
+
1
;
cv
::
ocl
::
oclMat
dst_m00
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m10
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m01
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m20
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m11
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m02
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m30
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m21
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m12
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m03
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m
(
blocky
*
10
,
blockx
,
CV_64FC1
);
cl_mem
sum
=
openCLCreateBuffer
(
src
.
clCxt
,
CL_MEM_READ_WRITE
,
10
*
sizeof
(
double
));
int
tile_width
=
std
::
min
(
size
.
width
,
TILE_SIZE
);
int
tile_height
=
std
::
min
(
size
.
height
,
TILE_SIZE
);
...
...
@@ -299,25 +290,17 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tileSize
.
width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tileSize
.
height
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m00
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m10
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m01
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m20
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m11
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m02
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m30
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m21
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m12
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m03
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m00
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m00
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
blocky
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
type
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
depth
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
cn
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
coi
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
binary
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
TILE_SIZE
));
openCLExecuteKernel
(
dst_m
00
.
clCxt
,
&
moments
,
"CvMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
);
openCLExecuteKernel
(
dst_m
.
clCxt
,
&
moments
,
"CvMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
);
size_t
localThreadss
[
3
]
=
{
128
,
1
,
1
};
size_t
globalThreadss
[
3
]
=
{
128
,
1
,
1
};
...
...
@@ -327,20 +310,12 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tile_width
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
TILE_SIZE
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m00
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m10
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m01
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m20
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m11
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m02
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m30
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m21
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m12
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m03
.
data
));
openCLExecuteKernel
(
dst_m00
.
clCxt
,
&
moments
,
"dst_sum"
,
globalThreadss
,
localThreadss
,
args_sum
,
-
1
,
-
1
);
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
step
));
openCLExecuteKernel
(
dst_m
.
clCxt
,
&
moments
,
"dst_sum"
,
globalThreadss
,
localThreadss
,
args_sum
,
-
1
,
-
1
);
double
*
dstsum
=
new
double
[
10
];
memset
(
dstsum
,
0
,
10
*
sizeof
(
double
));
openCLReadBuffer
(
dst_m
00
.
clCxt
,
sum
,(
void
*
)
dstsum
,
10
*
sizeof
(
double
));
openCLReadBuffer
(
dst_m
.
clCxt
,
sum
,(
void
*
)
dstsum
,
10
*
sizeof
(
double
));
mom
->
m00
=
dstsum
[
0
];
mom
->
m10
=
dstsum
[
1
];
mom
->
m01
=
dstsum
[
2
];
...
...
@@ -351,6 +326,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
mom
->
m21
=
dstsum
[
7
];
mom
->
m12
=
dstsum
[
8
];
mom
->
m03
=
dstsum
[
9
];
delete
[]
dstsum
;
icvCompleteMomentState
(
mom
);
}
...
...
modules/ocl/src/opencl/moments.cl
View file @
2c06e59a
...
...
@@ -6,25 +6,27 @@
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
typedef
double
T
;
typedef
double
F
;
typedef
double4
F4
;
#
define
convert_F4
convert_double4
#
else
typedef
float
double
;
typedef
float4
double
4
;
typedef
float
F
;
typedef
float4
F
4
;
typedef
long
T
;
#
define
convert_
double
4
convert_float4
#
define
convert_
F
4
convert_float4
#
endif
//#pragma
OPENCL
EXTENSION
cl_amd_printf:enable
//#if
defined
(
DOUBLE_SUPPORT
)
#
define
DST_ROW_A00
0
#
define
DST_ROW_A10
1
#
define
DST_ROW_A01
2
#
define
DST_ROW_A20
3
#
define
DST_ROW_A11
4
#
define
DST_ROW_A02
5
#
define
DST_ROW_A30
6
#
define
DST_ROW_A21
7
#
define
DST_ROW_A12
8
#
define
DST_ROW_A03
9
#
define
DST_ROW_00
0
#
define
DST_ROW_10
1
#
define
DST_ROW_01
2
#
define
DST_ROW_20
3
#
define
DST_ROW_11
4
#
define
DST_ROW_02
5
#
define
DST_ROW_30
6
#
define
DST_ROW_21
7
#
define
DST_ROW_12
8
#
define
DST_ROW_03
9
__kernel
void
icvContourMoments
(
int
contour_total,
__global
float*
reader_oclmat_data,
...
...
@@ -60,36 +62,76 @@ __kernel void icvContourMoments(int contour_total,
yii_1
=
yi_1
+
yi
;
dst_step
/=
sizeof
(
T
)
;
*
(
dst_a
+
DST_ROW_
A
00
*
dst_step
+
idx
)
=
dxy
;
*
(
dst_a
+
DST_ROW_
A
10
*
dst_step
+
idx
)
=
dxy
*
xii_1
;
*
(
dst_a
+
DST_ROW_
A
01
*
dst_step
+
idx
)
=
dxy
*
yii_1
;
*
(
dst_a
+
DST_ROW_
A
20
*
dst_step
+
idx
)
=
dxy
*
(
xi_1
*
xii_1
+
xi2
)
;
*
(
dst_a
+
DST_ROW_
A
11
*
dst_step
+
idx
)
=
dxy
*
(
xi_1
*
(
yii_1
+
yi_1
)
+
xi
*
(
yii_1
+
yi
))
;
*
(
dst_a
+
DST_ROW_
A
02
*
dst_step
+
idx
)
=
dxy
*
(
yi_1
*
yii_1
+
yi2
)
;
*
(
dst_a
+
DST_ROW_
A
30
*
dst_step
+
idx
)
=
dxy
*
xii_1
*
(
xi_12
+
xi2
)
;
*
(
dst_a
+
DST_ROW_
A
03
*
dst_step
+
idx
)
=
dxy
*
yii_1
*
(
yi_12
+
yi2
)
;
*
(
dst_a
+
DST_ROW_
A
21
*
dst_step
+
idx
)
=
*
(
dst_a
+
DST_ROW_00
*
dst_step
+
idx
)
=
dxy
;
*
(
dst_a
+
DST_ROW_10
*
dst_step
+
idx
)
=
dxy
*
xii_1
;
*
(
dst_a
+
DST_ROW_01
*
dst_step
+
idx
)
=
dxy
*
yii_1
;
*
(
dst_a
+
DST_ROW_20
*
dst_step
+
idx
)
=
dxy
*
(
xi_1
*
xii_1
+
xi2
)
;
*
(
dst_a
+
DST_ROW_11
*
dst_step
+
idx
)
=
dxy
*
(
xi_1
*
(
yii_1
+
yi_1
)
+
xi
*
(
yii_1
+
yi
))
;
*
(
dst_a
+
DST_ROW_02
*
dst_step
+
idx
)
=
dxy
*
(
yi_1
*
yii_1
+
yi2
)
;
*
(
dst_a
+
DST_ROW_30
*
dst_step
+
idx
)
=
dxy
*
xii_1
*
(
xi_12
+
xi2
)
;
*
(
dst_a
+
DST_ROW_03
*
dst_step
+
idx
)
=
dxy
*
yii_1
*
(
yi_12
+
yi2
)
;
*
(
dst_a
+
DST_ROW_21
*
dst_step
+
idx
)
=
dxy
*
(
xi_12
*
(
3
*
yi_1
+
yi
)
+
2
*
xi
*
xi_1
*
yii_1
+
xi2
*
(
yi_1
+
3
*
yi
))
;
*
(
dst_a
+
DST_ROW_
A
12
*
dst_step
+
idx
)
=
*
(
dst_a
+
DST_ROW_12
*
dst_step
+
idx
)
=
dxy
*
(
yi_12
*
(
3
*
xi_1
+
xi
)
+
2
*
yi
*
yi_1
*
xii_1
+
yi2
*
(
xi_1
+
3
*
xi
))
;
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
dst_sum
(
int
src_rows,
int
src_cols,
int
tile_height,
int
tile_width,
int
TILE_SIZE,
__global
F*
sum,
__global
F*
dst_m,
int
dst_step
)
{
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
block_y
=
src_rows/tile_height
;
int
block_x
=
src_cols/tile_width
;
int
block_num
;
if
(
src_rows
>
TILE_SIZE
&&
src_rows
%
TILE_SIZE
!=
0
)
block_y
++
;
if
(
src_cols
>
TILE_SIZE
&&
src_cols
%
TILE_SIZE
!=
0
)
block_x
++
;
block_num
=
block_y
*
block_x
;
__local
F
dst_sum[10][128]
;
if
(
gidy<128-block_num
)
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy+block_num]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
dst_step
/=
sizeof
(
F
)
;
if
(
gidy<block_num
)
{
dst_sum[0][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_00
*
block_y,
dst_step,
gidy
))
;
dst_sum[1][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_10
*
block_y,
dst_step,
gidy
))
;
dst_sum[2][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_01
*
block_y,
dst_step,
gidy
))
;
dst_sum[3][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_20
*
block_y,
dst_step,
gidy
))
;
dst_sum[4][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_11
*
block_y,
dst_step,
gidy
))
;
dst_sum[5][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_02
*
block_y,
dst_step,
gidy
))
;
dst_sum[6][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_30
*
block_y,
dst_step,
gidy
))
;
dst_sum[7][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_21
*
block_y,
dst_step,
gidy
))
;
dst_sum[8][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_12
*
block_y,
dst_step,
gidy
))
;
dst_sum[9][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_03
*
block_y,
dst_step,
gidy
))
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
lsize=64
; lsize>0; lsize>>=1)
{
if
(
gidy<lsize
)
{
int
lsize2
=
gidy
+
lsize
;
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy]
+=
dst_sum[i][lsize2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
gidy==0
)
for
(
int
i=0
; i<10; i++)
sum[i]
=
dst_sum[i][0]
;
}
__kernel
void
CvMoments_D0
(
__global
uchar16*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
int
TILE_SIZE
)
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
int
TILE_SIZE
)
{
uchar
tmp_coi[16]
; // get the coi data
uchar16
tmp[16]
;
...
...
@@ -127,7 +169,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C]
=
(
tmp[i/VLEN_C]!=zero
)
?full:zero
;
double
mom[10]
;
F
mom[10]
;
__local
int
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i=0
; i<10; i++)
...
...
@@ -197,119 +239,53 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
dst_sum
(
int
src_rows,
int
src_cols,
int
tile_height,
int
tile_width,
int
TILE_SIZE,
__global
double*
sum,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03
)
{
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
block_y
=
src_rows/tile_height
;
int
block_x
=
src_cols/tile_width
;
int
block_num
;
if
(
src_rows
>
TILE_SIZE
&&
src_rows
%
TILE_SIZE
!=
0
)
block_y
++
;
if
(
src_cols
>
TILE_SIZE
&&
src_cols
%
TILE_SIZE
!=
0
)
block_x
++
;
block_num
=
block_y
*
block_x
;
__local
double
dst_sum[10][128]
;
if
(
gidy<128-block_num
)
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy+block_num]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
gidy<block_num
)
{
dst_sum[0][gidy]
=
dst_m00[gidy]
;
dst_sum[1][gidy]
=
dst_m10[gidy]
;
dst_sum[2][gidy]
=
dst_m01[gidy]
;
dst_sum[3][gidy]
=
dst_m20[gidy]
;
dst_sum[4][gidy]
=
dst_m11[gidy]
;
dst_sum[5][gidy]
=
dst_m02[gidy]
;
dst_sum[6][gidy]
=
dst_m30[gidy]
;
dst_sum[7][gidy]
=
dst_m21[gidy]
;
dst_sum[8][gidy]
=
dst_m12[gidy]
;
dst_sum[9][gidy]
=
dst_m03[gidy]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
lsize=64
; lsize>0; lsize>>=1)
{
if
(
gidy<lsize
)
{
int
lsize2
=
gidy
+
lsize
;
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy]
+=
dst_sum[i][lsize2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
gidy==0
)
for
(
int
i=0
; i<10; i++)
sum[i]
=
dst_sum[i][0]
;
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
CvMoments_D2
(
__global
ushort8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
ushort
tmp_coi[8]
; // get the coi data
...
...
@@ -345,7 +321,7 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US]
=
(
tmp[i/VLEN_US]!=zero
)
?full:zero
;
double
mom[10]
;
F
mom[10]
;
__local
long
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i=0
; i<10; i++)
...
...
@@ -415,64 +391,55 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*mom[0],
ym
=
y
*
mom[0]
;
F
xm
=
x
*mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
CvMoments_D3
(
__global
short8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
short
tmp_coi[8]
; // get the coi data
...
...
@@ -509,7 +476,7 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
for
(
int
i=0
; i < tileSize_width; i+=(VLEN_S))
tmp[i/VLEN_S]
=
(
tmp[i/VLEN_S]!=zero
)
?full:zero
;
double
mom[10]
;
F
mom[10]
;
__local
long
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i=0
; i<10; i++)
...
...
@@ -579,64 +546,55 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
if
(
lidy
==0
&&lidx
==0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*
mom[0],
ym
=
y*mom[0]
;
F
xm
=
x
*
mom[0],
ym
=
y*mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
CvMoments_D5
(
__global
float*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
float
tmp_coi[4]
; // get the coi data
...
...
@@ -672,23 +630,23 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=4)
tmp[i/VLEN_F]
=
(
tmp[i/VLEN_F]!=zero
)
?full:zero
;
double
mom[10]
;
__local
double
m[10][128]
;
F
mom[10]
;
__local
F
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i
=
0
; i < 10; i ++)
for
(
int
j
=
0
; j < 128; j ++)
m[i][j]
=
0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
double
lm[10]
=
{0}
;
double4
x0
=
(
double
4
)(
0
)
;
double4
x1
=
(
double
4
)(
0
)
;
double4
x2
=
(
double
4
)(
0
)
;
double4
x3
=
(
double
4
)(
0
)
;
F
lm[10]
=
{0}
;
F4
x0
=
(
F
4
)(
0
)
;
F4
x1
=
(
F
4
)(
0
)
;
F4
x2
=
(
F
4
)(
0
)
;
F4
x3
=
(
F
4
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+=VLEN_F )
{
double4
v_xt
=
(
double
4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
double4
p
=
convert_double
4
(
tmp[xt/VLEN_F]
)
;
double
4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
F4
v_xt
=
(
F
4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
F4
p
=
convert_F
4
(
tmp[xt/VLEN_F]
)
;
F
4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
...
...
@@ -698,14 +656,14 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
x1.s0
+=
x1.s1
+
x1.s2
+
x1.s3
;
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
;
/*
double
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
F
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
{
m[9][lidy-bheight]
=
((
double
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
double
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
double
)
x2.s0
)
*
lidy
; // m21
m[9][lidy-bheight]
=
((
F
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
F
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
F
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
...
...
@@ -714,11 +672,12 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
m[1][lidy-bheight]
=
x1.s0
; // m10
m[0][lidy-bheight]
=
x0.s0
; // m00
}
else
if
(
lidy
<
bheight
)
{
lm[9]
=
((
double
)
py
)
*
sy
; // m03
lm[8]
=
((
double
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
double
)
x2.s0
)
*
lidy
; // m21
lm[9]
=
((
F
)
py
)
*
sy
; // m03
lm[8]
=
((
F
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
F
)
x2.s0
)
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
...
...
@@ -741,69 +700,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
}
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
*/
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
CvMoments_D6
(
__global
double*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
__kernel
void
CvMoments_D6
(
__global
F*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
double
tmp_coi[4]
; // get the coi data
double
4
tmp[64]
;
F
tmp_coi[4]
; // get the coi data
F
4
tmp[64]
;
int
VLEN_D
=
4
; // length of vetor
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
...
...
@@ -820,39 +769,39 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep; i++ )
*
((
__global
double
*
)
src_data+
(
y+lidy
)
*src_step/8+x+i
)
=
0
;
*
((
__global
F
*
)
src_data+
(
y+lidy
)
*src_step/8+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
{
for
(
int
j=0
; j<4; j++)
tmp_coi[j]
=
*
(
src_data+
(
y+lidy
)
*src_step/8+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_D]
=
(
double
4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
tmp[i/VLEN_D]
=
(
F
4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D]
=
(
double
4
)(
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+3
))
;
double4
zero
=
(
double
4
)(
0
)
;
double4
full
=
(
double
4
)(
255
)
;
tmp[i/VLEN_D]
=
(
F
4
)(
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+3
))
;
F4
zero
=
(
F
4
)(
0
)
;
F4
full
=
(
F
4
)(
255
)
;
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D]
=
(
tmp[i/VLEN_D]!=zero
)
?full:zero
;
double
mom[10]
;
__local
double
m[10][128]
;
F
mom[10]
;
__local
F
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i=0
; i<10; i++)
for
(
int
j=0
; j<128; j++)
m[i][j]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
double
lm[10]
=
{0}
;
double4
x0
=
(
double
4
)(
0
)
;
double4
x1
=
(
double
4
)(
0
)
;
double4
x2
=
(
double
4
)(
0
)
;
double4
x3
=
(
double
4
)(
0
)
;
F
lm[10]
=
{0}
;
F4
x0
=
(
F
4
)(
0
)
;
F4
x1
=
(
F
4
)(
0
)
;
F4
x2
=
(
F
4
)(
0
)
;
F4
x3
=
(
F
4
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+=VLEN_D )
{
double4
v_xt
=
(
double
4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
double
4
p
=
tmp[xt/VLEN_D]
;
double
4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
F4
v_xt
=
(
F
4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
F
4
p
=
tmp[xt/VLEN_D]
;
F
4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
...
...
@@ -863,13 +812,13 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
;
double
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
F
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
{
m[9][lidy-bheight]
=
((
double
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
double
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
double
)
x2.s0
)
*
lidy
; // m21
m[9][lidy-bheight]
=
((
F
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
F
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
F
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
...
...
@@ -881,9 +830,9 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col
else
if
(
lidy
<
bheight
)
{
lm[9]
=
((
double
)
py
)
*
sy
; // m03
lm[8]
=
((
double
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
double
)
x2.s0
)
*
lidy
; // m21
lm[9]
=
((
F
)
py
)
*
sy
; // m03
lm[8]
=
((
F
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
F
)
x2.s0
)
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
...
...
@@ -907,47 +856,47 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
}
\ No newline at end of file
modules/ocl/src/opencl/pyr_up.cl
View file @
2c06e59a
...
...
@@ -16,6 +16,8 @@
//
//
@Authors
//
Zhang
Chunpeng
chunpeng@multicorewareinc.com
//
Dachuan
Zhao,
dachuan@multicorewareinc.com
//
Yao
Wang,
yao@multicorewareinc.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
...
...
@@ -53,20 +55,22 @@ uchar get_valid_uchar(uchar data)
//////////////////////////
CV_8UC1
//////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel
void
pyrUp_C1_D0
(
__global
uchar*
src,__global
uchar*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
__local
float
s_srcPatch[10][10]
;
__local
float
s_dstPatch[20][16]
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
tidy
=
get_local_id
(
1
)
;
const
int
lsizex
=
get_local_size
(
0
)
;
const
int
lsizey
=
get_local_size
(
1
)
;
if
(
get_local_id
(
0
)
<
10
&&
get_local_id
(
1
)
<
10
)
if
(
tidx
<
10
&&
tidy
<
10
)
{
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
get_local_id
(
0
)
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
get_local_id
(
1
)
)
-
1
;
int
srcx
=
mad24
((
int
)
get_group_id
(
0
)
,
(
lsizex>>1
)
,
tidx
)
-
1
;
int
srcy
=
mad24
((
int
)
get_group_id
(
1
)
,
(
lsizey>>1
)
,
tidy
)
-
1
;
srcx
=
abs
(
srcx
)
;
srcx
=
min
(
srcCols
-
1
,
srcx
)
;
...
...
@@ -74,25 +78,24 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
srcy
=
abs
(
srcy
)
;
srcy
=
min
(
srcRows
-1
,
srcy
)
;
s_srcPatch[
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
(
float
)(
src[srcx
+
srcy
*
srcStep]
)
;
s_srcPatch[
tidy][tidx
]
=
(
float
)(
src[srcx
+
srcy
*
srcStep]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float
sum
=
0
;
const
int
evenFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
!=
0
)
;
const
bool
eveny
=
((
get_local_id
(
1
)
&
1
)
==
0
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
evenFlag
=
(
int
)((
tidx
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
tidx
&
1
)
!=
0
)
;
const
bool
eveny
=
((
tidy
&
1
)
==
0
)
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
sum
=
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[2
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
...
...
@@ -103,42 +106,40 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[0
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
0
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
0
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
0
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
0
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
sum
=
(
evenFlag
*
0.0625f
)
*
s_srcPatch[lsizey
-
16
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[9
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
9
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
9
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
9
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
(
evenFlag
*
0.0625f
)
*
s_srcPatch[lsizey
-
7
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[4
+
tidy][tidx
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
0
;
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
;
sum
=
sum
+
0.375f
*
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
;
sum
=
0.0625f
*
s_dstPatch[2
+
tidy
-
2][tidx]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
-
1][tidx]
;
sum
=
sum
+
0.375f
*
s_dstPatch[2
+
tidy
][tidx]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
+
1][tidx]
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
+
2][tidx]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
dst[x
+
y
*
dstStep]
=
(
float
)(
4.0f
*
sum
)
;
...
...
@@ -149,8 +150,8 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
//////////////////////////
CV_16UC1
/////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel
void
pyrUp_C1_D2
(
__global
ushort*
src,__global
ushort*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
...
...
@@ -210,13 +211,13 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
0
;
if
(
eveny
)
...
...
@@ -228,7 +229,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -251,12 +252,15 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
//////////////////////////
CV_32FC1
/////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel
void
pyrUp_C1_D5
(
__global
float*
src,__global
float*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
tidy
=
get_local_id
(
1
)
;
const
int
lsizex
=
get_local_size
(
0
)
;
const
int
lsizey
=
get_local_size
(
1
)
;
__local
float
s_srcPatch[10][10]
;
__local
float
s_dstPatch[20][16]
;
...
...
@@ -266,10 +270,10 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
dstStep
=
dstStep
>>
2
;
if
(
get_local_id
(
0
)
<
10
&&
get_local_id
(
1
)
<
10
)
if
(
tidx
<
10
&&
tidy
<
10
)
{
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
get_local_id
(
0
)
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
get_local_id
(
1
)
)
-
1
;
int
srcx
=
mad24
((
int
)
get_group_id
(
0
)
,
lsizex>>1,
tidx
)
-
1
;
int
srcy
=
mad24
((
int
)
get_group_id
(
1
)
,
lsizey>>1,
tidy
)
-
1
;
srcx
=
abs
(
srcx
)
;
srcx
=
min
(
srcCols
-
1
,
srcx
)
;
...
...
@@ -277,71 +281,67 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
srcy
=
abs
(
srcy
)
;
srcy
=
min
(
srcRows
-1
,
srcy
)
;
s_srcPatch[
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
(
float
)(
src[srcx
+
srcy
*
srcStep]
)
;
s_srcPatch[
tidy][tidx
]
=
(
float
)(
src[srcx
+
srcy
*
srcStep]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float
sum
=
0
;
const
int
evenFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
!=
0
)
;
const
bool
eveny
=
((
get_local_id
(
1
)
&
1
)
==
0
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
evenFlag
=
(
int
)((
tidx
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
tidx
&
1
)
!=
0
)
;
const
bool
eveny
=
((
tidy
&
1
)
==
0
)
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[2
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
s_dstPatch[2
+
tidy][tidx
]
=
sum
;
if
(
get_local_id
(
1
)
<
2
)
if
(
tidy
<
2
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
0
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
0
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
0
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
0
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
0
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[tidy][tidx
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
if
(
tidy
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
9
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
9
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
9
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
9
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
9
][1
+
((
tidx
+
2
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[4
+
tidy][tidx
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
0
;
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
;
sum
=
sum
+
0.375f
*
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
;
sum
=
0.0625f
*
s_dstPatch[2
+
tidy
-
2][tidx]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
-
1][tidx]
;
sum
=
sum
+
0.375f
*
s_dstPatch[2
+
tidy
][tidx]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
+
1][tidx]
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
+
2][tidx]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
dst[x
+
y
*
dstStep]
=
(
float
)(
4.0f
*
sum
)
;
...
...
@@ -376,37 +376,16 @@ uchar4 convert_float4_to_uchar4(float4 data)
return
u4Data
;
}
float4
int_x_float4
(
int
leftOpr,float4
rightOpr
)
{
float4
result
=
{0,0,0,0}
;
result.x
=
rightOpr.x
*
leftOpr
;
result.y
=
rightOpr.y
*
leftOpr
;
result.z
=
rightOpr.z
*
leftOpr
;
result.w
=
rightOpr.w
*
leftOpr
;
return
result
;
}
float4
float4_x_float4
(
float4
leftOpr,float4
rightOpr
)
{
float4
result
;
result.x
=
leftOpr.x
*
rightOpr.x
;
result.y
=
leftOpr.y
*
rightOpr.y
;
result.z
=
leftOpr.z
*
rightOpr.z
;
result.w
=
leftOpr.w
*
rightOpr.w
;
return
result
;
}
__kernel
void
pyrUp_C4_D0
(
__global
uchar4*
src,__global
uchar4*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
tidy
=
get_local_id
(
1
)
;
const
int
lsizex
=
get_local_size
(
0
)
;
const
int
lsizey
=
get_local_size
(
1
)
;
__local
float4
s_srcPatch[10][10]
;
__local
float4
s_dstPatch[20][16]
;
...
...
@@ -416,10 +395,10 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
dstStep
>>=
2
;
if
(
get_local_id
(
0
)
<
10
&&
get_local_id
(
1
)
<
10
)
if
(
tidx
<
10
&&
tidy
<
10
)
{
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
get_local_id
(
0
)
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
get_local_id
(
1
)
)
-
1
;
int
srcx
=
mad24
((
int
)
get_group_id
(
0
)
,
lsizex>>1,
tidx
)
-
1
;
int
srcy
=
mad24
((
int
)
get_group_id
(
1
)
,
lsizey>>1,
tidy
)
-
1
;
srcx
=
abs
(
srcx
)
;
srcx
=
min
(
srcCols
-
1
,
srcx
)
;
...
...
@@ -427,17 +406,16 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
srcy
=
abs
(
srcy
)
;
srcy
=
min
(
srcRows
-1
,
srcy
)
;
s_srcPatch[
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
covert_uchar4_to_float4
(
src[srcx
+
srcy
*
srcStep]
)
;
s_srcPatch[
tidy][tidx
]
=
covert_uchar4_to_float4
(
src[srcx
+
srcy
*
srcStep]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float4
sum
=
(
float4
)(
0
,
0
,
0
,
0
)
;
const
int
evenFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
!=
0
)
;
const
bool
eveny
=
((
get_local_id
(
1
)
&
1
)
==
0
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
evenFlag
=
(
int
)((
tidx
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
tidx
&
1
)
!=
0
)
;
const
bool
eveny
=
((
tidy
&
1
)
==
0
)
;
float4
co1
=
(
float4
)(
0.375f,
0.375f,
0.375f,
0.375f
)
;
float4
co2
=
(
float4
)(
0.25f,
0.25f,
0.25f,
0.25f
)
;
...
...
@@ -446,63 +424,59 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co1
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[2
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
s_dstPatch[2
+
tidy][tidx
]
=
sum
;
if
(
get_local_id
(
1
)
<
2
)
if
(
tidy
<
2
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[tidy][tidx
]
=
sum
;
}
if
(
tidy
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[9][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[4
+
tidy][tidx
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
0
;
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co1
,
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
)
;
sum
=
co3
*
s_dstPatch[2
+
tidy
-
2][tidx]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
-
1][tidx]
;
sum
=
sum
+
co1
*
s_dstPatch[2
+
tidy
][tidx]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
+
1][tidx]
;
sum
=
sum
+
co3
*
s_dstPatch[2
+
tidy
+
2][tidx]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
{
dst[x
+
y
*
dstStep]
=
convert_float4_to_uchar4
(
int_x_float4
(
4.0f,sum
)
)
;
dst[x
+
y
*
dstStep]
=
convert_float4_to_uchar4
(
4.0f
*
sum
)
;
}
}
///////////////////////////////////////////////////////////////////////
...
...
@@ -535,8 +509,8 @@ ushort4 convert_float4_to_ushort4(float4 data)
__kernel
void
pyrUp_C4_D2
(
__global
ushort4*
src,__global
ushort4*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
...
...
@@ -580,11 +554,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co1
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag*
co3
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag*
co1
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag*
co3
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
...
...
@@ -596,31 +570,31 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[0][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[0][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[9][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[9][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[9][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[9][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[9][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -628,15 +602,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co1
,
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
)
;
sum
=
sum
+
co3
*
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
;
sum
=
sum
+
co1
*
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
;
sum
=
sum
+
co3
*
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
{
dst[x
+
y
*
dstStep]
=
convert_float4_to_ushort4
(
int_x_float4
(
4.0f,sum
)
)
;
dst[x
+
y
*
dstStep]
=
convert_float4_to_ushort4
(
4.0f
*
sum
)
;
}
}
...
...
@@ -644,12 +618,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
//////////////////////////
CV_32FC4
//////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel
void
pyrUp_C4_D5
(
__global
float4*
src,__global
float4*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
tidy
=
get_local_id
(
1
)
;
const
int
lsizex
=
get_local_size
(
0
)
;
const
int
lsizey
=
get_local_size
(
1
)
;
__local
float4
s_srcPatch[10][10]
;
__local
float4
s_dstPatch[20][16]
;
...
...
@@ -659,10 +636,10 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
dstStep
>>=
4
;
if
(
get_local_id
(
0
)
<
10
&&
get_local_id
(
1
)
<
10
)
if
(
tidx
<
10
&&
tidy
<
10
)
{
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
get_local_id
(
0
)
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
get_local_id
(
1
)
)
-
1
;
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
tidx
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
tidy
)
-
1
;
srcx
=
abs
(
srcx
)
;
srcx
=
min
(
srcCols
-
1
,
srcx
)
;
...
...
@@ -670,17 +647,16 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
srcy
=
abs
(
srcy
)
;
srcy
=
min
(
srcRows
-1
,
srcy
)
;
s_srcPatch[
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
(
float4
)(
src[srcx
+
srcy
*
srcStep]
)
;
s_srcPatch[
tidy][tidx
]
=
(
float4
)(
src[srcx
+
srcy
*
srcStep]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float4
sum
=
(
float4
)(
0
,
0
,
0
,
0
)
;
const
int
evenFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
!=
0
)
;
const
bool
eveny
=
((
get_local_id
(
1
)
&
1
)
==
0
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
evenFlag
=
(
int
)((
tidx
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
tidx
&
1
)
!=
0
)
;
const
bool
eveny
=
((
tidy
&
1
)
==
0
)
;
float4
co1
=
(
float4
)(
0.375f,
0.375f,
0.375f,
0.375f
)
;
float4
co2
=
(
float4
)(
0.25f,
0.25f,
0.25f,
0.25f
)
;
...
...
@@ -689,59 +665,55 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co1
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag*
co3
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag*
co1
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag*
co3
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[2
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
s_dstPatch[2
+
tidy][tidx
]
=
sum
;
if
(
get_local_id
(
1
)
<
2
)
if
(
tidy
<
2
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[tidy][tidx
]
=
sum
;
}
if
(
tidy
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[9][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[4
+
tidy][tidx
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
0
;
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co1
,
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
)
;
sum
=
co3
*
s_dstPatch[2
+
tidy
-
2][tidx]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
-
1][tidx]
;
sum
=
sum
+
co1
*
s_dstPatch[2
+
tidy
][tidx]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
+
1][tidx]
;
sum
=
sum
+
co3
*
s_dstPatch[2
+
tidy
+
2][tidx]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment